Code example #1
 def test_idx2token_cries_for_vocab(self, instances):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(instances=single_instance,
                           max_num_tokens=MAX_NUM_WORDS)
     with pytest.raises(ValueError):
         vocab_builder.get_token_from_idx(1)
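
For contrast, the lookup only succeeds once build_vocab() has been called, as the later examples in this set do. A minimal sketch of that happy path (hypothetical test name; it assumes index 1 exists once the special tokens have been added):

def test_idx2token_after_build(instances):
    # Hypothetical counterpart: building the vocabulary first makes the lookup valid.
    vocab_builder = Vocab(instances=instances["single_instance"], max_num_tokens=100)
    vocab_builder.build_vocab()
    assert isinstance(vocab_builder.get_token_from_idx(1), str)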
Code example #2
    def set_char_vocab(self):
        self.char_instances = self.char_tokenizer.tokenize_batch(self.lines)

        self.char_vocab = Vocab(
            instances=self.char_instances,
            max_num_tokens=1e6,
            min_count=1,
            store_location=self.char_vocab_store_location,
            embedding_type=self.char_embedding_type,
            embedding_dimension=self.char_embedding_dimension,
            start_token=self.char_start_token,
            end_token=self.char_end_token,
            unk_token=self.char_unk_token,
            pad_token=self.char_pad_token,
        )
        self.char_vocab.build_vocab()

        # adding these to help conversion to characters later
        self.char_vocab.add_tokens(
            list(self.word_start_token)
            + list(self.word_end_token)
            + list(self.word_unk_token)
            + list(self.word_pad_token)
        )
        self.char_numericalizer = Numericalizer(vocabulary=self.char_vocab)
        self.char_vocab.print_stats()
Code example #3
 def set_word_vocab(self):
     if not all(
         [
             attribute in dir(self)
             for attribute in self.word_vocab_required_attributes
         ]
     ):
         raise ValueError(
             f"For building word vocab, "
             f"please pass these attributes in your "
             f"dataset construction {self.word_vocab_required_attributes}"
         )
     self.word_instances = self.word_tokenizer.tokenize_batch(self.lines)
     self.word_vocab = Vocab(
         instances=self.word_instances,
         max_num_tokens=self.max_num_words,
         unk_token=self.word_unk_token,
         pad_token=self.word_pad_token,
         start_token=self.word_start_token,
         end_token=self.word_end_token,
         store_location=self.word_vocab_store_location,
         embedding_type=self.word_embedding_type,
         embedding_dimension=self.word_embedding_dimension,
     )
     self.word_numericalizer = Numericalizer(self.word_vocab)
     self.word_vocab.build_vocab()
     self.word_vocab.print_stats()
Code example #4
    def test_build_vocab_single_instance_has_words(self, instances):
        single_instance = instances["single_instance"]
        vocab_builder = Vocab(instances=single_instance, max_num_tokens=1000)
        vocab = vocab_builder.map_tokens_to_freq_idx()

        assert "i" in vocab.keys()
        assert "like" in vocab.keys()
        assert "nlp" in vocab.keys()
Code example #5
 def test_orig_vocab_len(self, instances):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 0
     vocab_builder = Vocab(instances=single_instance,
                           max_num_tokens=MAX_NUM_WORDS)
     vocab_builder.build_vocab()
     vocab_len = vocab_builder.get_orig_vocab_len()
     assert vocab_len == 3 + len(vocab_builder.special_vocab)
Code example #6
    def test_get_topn(self, instances):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        vocab_builder = Vocab(instances=single_instance,
                              max_num_tokens=MAX_NUM_WORDS)
        vocab_builder.build_vocab()
        words_freqs = vocab_builder.get_topn_frequent_words(n=1)

        assert words_freqs[0][0] == "i"
        assert words_freqs[0][1] == 3
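
These assertions, together with the descending-order test further down, imply that the shared instances fixture yields a single tokenized sentence in which "i" occurs three times, "like" twice, and "nlp" once. A minimal sketch of such a fixture, assuming those counts (the actual tokens in the project may differ):

import pytest

@pytest.fixture
def instances():
    # Hypothetical fixture: counts chosen so that "i" (3) > "like" (2) > "nlp" (1),
    # matching the frequency and ordering assertions in these tests.
    single_instance = [["i", "like", "nlp", "i", "like", "i"]]
    return {"single_instance": single_instance}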
Code example #7
 def test_max_num_tokens_unset(self, instances, include_special_vocab):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = None
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         include_special_vocab=include_special_vocab,
     )
     vocab.build_vocab()
     assert vocab.max_num_tokens == 3 + len(vocab.special_vocab.keys())
Code example #8
def setup_lstm2seqdecoder(request):
    HIDDEN_DIM = 1024
    NUM_LAYERS = request.param[0]
    BIDIRECTIONAL = request.param[1]
    TEACHER_FORCING_RATIO = request.param[3]
    MAX_LENGTH = 5

    lines = []
    words = []
    # texts = ["First", "second", "Third"]
    texts = ["First sentence", "second sentence", "Third long sentence here"]
    for text in texts:
        line = Line(text=text)
        word = Line(text=text.split()[0])
        lines.append(line)
        words.append(word)
    flat_texts = [[word for sentence in texts for word in sentence.split()]]
    vocab = Vocab(flat_texts)
    vocab.build_vocab()

    num_direction = 2 if BIDIRECTIONAL else 1
    h0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.1
    c0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.2

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder_outputs = (torch.ones(len(texts), 5, num_direction * HIDDEN_DIM) *
                       0.5 if request.param[2] else None)
    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        vocab=vocab,
        max_length=MAX_LENGTH,
        attn_module=request.param[2],
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )

    return (
        decoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "MAX_LENGTH": MAX_LENGTH,
            "TEACHER_FORCING_RATIO": TEACHER_FORCING_RATIO,
            "LINES": lines,
            "WORDS": words,
            "VOCAB_SIZE": vocab.get_vocab_len(),
            "BIDIRECTIONAL": BIDIRECTIONAL,
        },
        encoder_outputs,
        (h0, c0),
    )
Code example #9
    def test_vocab_always_has_special_tokens(self, instances):
        single_instance = instances["single_instance"]
        vocab_builder = Vocab(instances=single_instance,
                              max_num_tokens=1000,
                              min_count=1)

        vocab = vocab_builder.map_tokens_to_freq_idx()
        assert vocab_builder.unk_token in vocab.keys()
        assert vocab_builder.pad_token in vocab.keys()
        assert vocab_builder.start_token in vocab.keys()
        assert vocab_builder.end_token in vocab.keys()
Code example #10
    def test_vocab_length_min_freq_1_max_words_1(self, instances):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 1
        MIN_FREQ = 1

        vocab_builder = Vocab(instances=single_instance,
                              min_count=MIN_FREQ,
                              max_num_tokens=MAX_NUM_WORDS)
        vocab_builder.build_vocab()
        len_vocab = vocab_builder.get_vocab_len()
        assert len_vocab == 1 + len(vocab_builder.special_vocab)
Code example #11
    def test_build_vocab_single_instance_min_freq_2(self, instances):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        MIN_FREQ = 2
        vocab_builder = Vocab(instances=single_instance,
                              max_num_tokens=MAX_NUM_WORDS,
                              min_count=MIN_FREQ)
        vocab = vocab_builder.build_vocab()

        vocab_len = len(set(idx for freq, idx in vocab.values()))

        assert vocab_len == 2 + len(vocab_builder.special_vocab)
Code example #12
 def test_random_embedding_has_2dimensions(self, instances):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         embedding_type=None,
         embedding_dimension=300,
     )
     vocab.build_vocab()
     embeddings = vocab.load_embedding()
     assert embeddings.ndimension() == 2
Code example #13
    def test_get_topn(self, instances, include_special_vocab):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = None
        vocab_builder = Vocab(
            instances=single_instance,
            max_num_tokens=MAX_NUM_WORDS,
            include_special_vocab=include_special_vocab,
        )
        vocab_builder.build_vocab()
        words_freqs = vocab_builder.get_topn_frequent_words(n=1)

        assert words_freqs[0][0] == "i"
        assert words_freqs[0][1] == 3
Code example #14
 def test_disp_sentences_from_indices(
     self, instances, tmpdir, include_special_vocab
 ):
     instance_dict = instances
     single_instance = instance_dict["single_instance"]
     MAX_NUM_WORDS = None
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         include_special_vocab=include_special_vocab,
     )
     vocab.build_vocab()
     sent = vocab.get_disp_sentence_from_indices([0, 1, 2])
     assert type(sent) is str
Code example #15
 def test_disp_sentences_from_indices(self, instances, tmpdir):
     instance_dict = instances
     single_instance = instance_dict["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         embedding_type=None,
         embedding_dimension=300,
         store_location=vocab_file,
     )
     vocab.build_vocab()
     sent = vocab.get_disp_sentence_from_indices([0, 1, 2, 3])
     assert type(sent) is str
Code example #16
    def test_build_vocab_single_instance_descending_order(self, instances):
        single_instance = instances["single_instance"]
        vocab_builder = Vocab(instances=single_instance,
                              max_num_tokens=1000,
                              min_count=1)
        vocab = vocab_builder.map_tokens_to_freq_idx()

        i_freq, i_idx = vocab["i"]
        like_freq, like_idx = vocab["like"]
        nlp_freq, nlp_idx = vocab["nlp"]

        # 'i' appears more often than 'like', and 'like' more often than 'nlp',
        # so their indices are assigned in that order
        assert i_idx < like_idx < nlp_idx
        assert i_freq > like_freq > nlp_freq
Code example #17
def get_numericalized_instances(get_preprocessed_instances):
    instances, labels = get_preprocessed_instances
    MAX_NUM_WORDS = 3000
    MAX_LENGTH = 15
    vocab = Vocab(instances=instances, max_num_tokens=MAX_NUM_WORDS)
    vocab.build_vocab()
    numericalizer = Numericalizer(vocabulary=vocab)
    numericalized_instances = numericalizer.numericalize_batch_instances(
        instances[:32])
    return {
        "numericalized_instances": numericalized_instances,
        "labels": labels,
        "max_length": MAX_LENGTH,
        "max_num_words": MAX_NUM_WORDS,
        "vocab": vocab,
    }
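
A sketch of how a test might consume this fixture (hypothetical test name; it assumes numericalize_batch_instances returns one list of integer indices per input instance, which the example above does not show):

def test_numericalized_indices_within_vocab(get_numericalized_instances):
    # Hypothetical check: every index produced by the Numericalizer should fall
    # inside the built vocabulary.
    setup = get_numericalized_instances
    vocab_len = setup["vocab"].get_vocab_len()
    for instance in setup["numericalized_instances"]:
        assert all(0 <= idx < vocab_len for idx in instance)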
Code example #18
 def test_token2idx(self, instances, start_token, end_token, unk_token, pad_token):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         start_token=start_token,
         end_token=end_token,
         pad_token=pad_token,
         unk_token=unk_token,
     )
     vocab_builder.build_vocab()
     token2idx = vocab_builder.token2idx
     len_indices = len(token2idx.keys())
     indices = token2idx.values()
     indices = sorted(indices)
     assert indices == list(range(len_indices))
Code example #19
 def test_idx2token_for_unk(self, instances):
     """" Many words map to UNK in the vocab. For example say the index for UNK is 3.
     Then mapping 3 to the token should always map to UNK and not any other word
     """
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         start_token="<SOS>",
         end_token="<EOS>",
         pad_token="<PAD>",
         unk_token="<UNK>",
     )
     vocab_builder.build_vocab()
     UNK_IDX = vocab_builder.special_vocab[vocab_builder.unk_token][1]
     assert vocab_builder.get_token_from_idx(UNK_IDX) == "<UNK>"
Code example #20
def single_instance_setup(instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100

    vocabulary = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)

    numericalizer = Numericalizer(vocabulary=vocabulary)

    return single_instance, numericalizer, vocabulary
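
A sketch of a test built on this setup (assuming it is registered as a pytest fixture and that numericalize_batch_instances, as used in Code example #17, returns one numericalized instance per input instance):

def test_numericalizer_handles_batch(single_instance_setup):
    # Hypothetical test using the setup above; build the vocab before numericalizing.
    instance, numericalizer, vocabulary = single_instance_setup
    vocabulary.build_vocab()
    numericalized = numericalizer.numericalize_batch_instances(instance)
    assert len(numericalized) == len(instance)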
Code example #21
 def test_idx2token_out_of_bounds(self, instances):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
     vocab_builder.build_vocab()
     print(vocab_builder.get_idx2token_mapping())
     with pytest.raises(ValueError):
         vocab_builder.get_token_from_idx(100)
Code example #22
    def test_idx2token(self, instances, start_token, end_token, unk_token, pad_token):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        vocab_builder = Vocab(
            instances=single_instance,
            max_num_tokens=MAX_NUM_WORDS,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token,
            unk_token=unk_token,
        )
        vocab_builder.build_vocab()
        idx2token = vocab_builder.idx2token
        len_idx2token = len(idx2token)
        indices = idx2token.keys()
        indices = sorted(indices)

        # tests all indices are in order
        assert indices == list(range(len_idx2token))
Code example #23
    def test_single_instance_build_vocab(self, instances, include_special_vocab):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = None
        MIN_FREQ = 1
        vocab_builder = Vocab(
            instances=single_instance,
            max_num_tokens=MAX_NUM_WORDS,
            min_count=MIN_FREQ,
            include_special_vocab=include_special_vocab,
        )

        vocab = vocab_builder.build_vocab()

        assert "i" in vocab.keys()
        assert "like" in vocab.keys()
        assert "nlp" in vocab.keys()

        vocab_len = len(set(idx for freq, idx in vocab.values()))

        assert vocab_len == 3 + len(vocab_builder.special_vocab)
Code example #24
 def test_print_stats_works(self, instances):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(instances=single_instance,
                           max_num_tokens=MAX_NUM_WORDS)
     vocab_builder.build_vocab()
     vocab_builder.print_stats()
Code example #25
    def test_load_vocab(self, instances, tmpdir):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
        vocab_builder.build_vocab()
        vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
        vocab_builder.save_to_file(vocab_file)

        vocab = Vocab.load_from_file(filename=vocab_file)

        assert vocab.get_vocab_len() == 3 + len(vocab_builder.special_vocab)
Code example #26
    def test_preprocessing_lower(self, instances):
        single_instance = instances["single_instance"]
        instance_preprocessing = InstancePreprocessing()
        vocab_builder = Vocab(
            instances=single_instance,
            max_num_tokens=1000,
            preprocessing_pipeline=[instance_preprocessing.lowercase],
        )

        instances = vocab_builder.instances

        for instance in instances:
            for token in instance:
                assert token.islower()
Code example #27
 def test_print_stats_works(self, instances, include_special_vocab):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = None
     vocab_builder = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         include_special_vocab=include_special_vocab,
     )
     vocab_builder.build_vocab()
     vocab_builder.print_stats()
Code example #28
    def test_save_vocab(self, instances, tmpdir):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        vocab_builder = Vocab(instances=single_instance,
                              max_num_tokens=MAX_NUM_WORDS)

        vocab_builder.build_vocab()
        vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
        vocab_builder.save_to_file(vocab_file)

        assert os.path.isfile(vocab_file)
Code example #29
 def test_load_embedding_has_all_words(self, instances, embedding_type):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         embedding_type=embedding_type,
     )
     vocab.build_vocab()
     embedding = vocab.load_embedding()
     assert embedding.size(0) == vocab.get_vocab_len()
Code example #30
    def test_single_instance_clip_on_max_num(self, instances):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 1
        vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
        vocab_builder.build_vocab()
        vocab = vocab_builder.map_tokens_to_freq_idx()

        vocab = vocab_builder.clip_on_max_num(vocab)

        vocab_len = len(set(idx for freq, idx in vocab.values()))

        assert vocab_len == MAX_NUM_WORDS + len(vocab_builder.special_vocab)