def test_multilabel_field_empty_field_works(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
    vocab.add_token_to_namespace("label2", namespace="test_empty_labels")
    f = MultiLabelField([], label_namespace="test_empty_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

def test_multilabel_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("rel0", namespace="rel_labels")
    vocab.add_token_to_namespace("rel1", namespace="rel_labels")
    vocab.add_token_to_namespace("rel2", namespace="rel_labels")
    f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))

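# A minimal standalone sketch of the behaviour the two tests above exercise
# (assuming an AllenNLP install; the namespace and labels are illustrative):
# MultiLabelField produces a multi-hot vector whose width is the size of its
# label namespace, with ones at the indices of the given labels. Namespaces
# ending in "labels" are non-padded by default, so label indices start at 0.
from allennlp.data import Vocabulary
from allennlp.data.fields import MultiLabelField

vocab = Vocabulary()
for label in ["rel0", "rel1", "rel2"]:  # assigned indices 0, 1, 2
    vocab.add_token_to_namespace(label, namespace="rel_labels")
field = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
field.index(vocab)
print(field.as_tensor(field.get_padding_lengths()))  # tensor([1., 1., 0.])
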
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)
    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # The following 3 ways of extending should raise: tokens1 is non-padded in
    # original_vocab but would be padded after this extension.
    params = Params({"directory_path": vocab_dir, "extend": True, "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=[],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following 3 ways of extending should not raise: overlapping
    # namespaces have the same padding setting.
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)
    extended_vocab = copy.copy(original_vocab)
    extended_vocab._extend(non_padded_namespaces=["tokens1"],
                           tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following 3 ways of extending should raise: tokens2 is padded in
    # original_vocab but would be non-padded after this extension.
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

def test_forward_works_with_projection_layer(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('the')
    vocab.add_token_to_namespace('a')
    embedding_layer = get_pretrained_embedding_layer('tests/fixtures/glove.6B.300d.sample.txt.gz',
                                                     vocab,
                                                     projection_dim=20)
    input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 4, 20)

    input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 4, 20)

def test_add_word_to_index_gives_consistent_results(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1

    # Now add it again, and make sure nothing changes.
    vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1

def test_pad_produces_one_hot_targets(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("B", namespace='*tags')
    vocab.add_token_to_namespace("I", namespace='*tags')
    vocab.add_token_to_namespace("O", namespace='*tags')
    tags = ["B", "I", "O", "O", "O"]
    tag_field = TagField(tags, self.text, tag_namespace="*tags")
    tag_field.index(vocab)
    padding_lengths = tag_field.get_padding_lengths()
    array = tag_field.as_array(padding_lengths)
    numpy.testing.assert_array_almost_equal(array, numpy.array([[1, 0, 0],
                                                                [0, 1, 0],
                                                                [0, 0, 1],
                                                                [0, 0, 1],
                                                                [0, 0, 1]]))

def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    b_index = vocab.add_token_to_namespace("B", namespace='*tags')
    i_index = vocab.add_token_to_namespace("I", namespace='*tags')
    o_index = vocab.add_token_to_namespace("O", namespace='*tags')
    tags = ["B", "I", "O", "O", "O"]
    tag_field = TagField(tags, self.text, tag_namespace="*tags")
    tag_field.index(vocab)
    # pylint: disable=protected-access
    assert tag_field._indexed_tags == [b_index, i_index, o_index, o_index, o_index]
    assert tag_field._num_tags == 3

def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_tokens_to_namespace(["a", "b"], namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)

    text_field1 = TextField(
        [Token(t) for t in ["a", "c"]], {"tokens1": SingleIdTokenIndexer("tokens1")}
    )
    text_field2 = TextField(
        [Token(t) for t in ["p", "q", "r"]], {"tokens2": SingleIdTokenIndexer("tokens2")}
    )
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # Should raise: tokens1 is non-padded in original_vocab but would be
    # padded after this extension.
    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": [],
            "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
        }
    )
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)

    # Should not raise: overlapping namespaces have the same padding setting.
    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1"],
            "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
        }
    )
    Vocabulary.from_params(params, instances=instances)

    # Should raise: tokens2 is padded in original_vocab but would be
    # non-padded after this extension.
    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens2"],
            "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
        }
    )
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)

def test_pad_produces_one_hot_targets(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("B", namespace='*tags')
    vocab.add_token_to_namespace("I", namespace='*tags')
    vocab.add_token_to_namespace("O", namespace='*tags')
    text = TextField(["here", "are", "some", "words", "."],
                     [token_indexers["single id"]("words")])
    tags = ["B", "I", "O", "O", "O"]
    tag_field = TagField(tags, text, tag_namespace="*tags")
    tag_field.index(vocab)
    padding_lengths = tag_field.get_padding_lengths()
    array = tag_field.pad(padding_lengths)
    numpy.testing.assert_array_almost_equal(array, numpy.array([[1, 0, 0],
                                                                [0, 1, 0],
                                                                [0, 0, 1],
                                                                [0, 0, 1],
                                                                [0, 0, 1]]))

def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    b_index = vocab.add_token_to_namespace("B", namespace='*tags')
    i_index = vocab.add_token_to_namespace("I", namespace='*tags')
    o_index = vocab.add_token_to_namespace("O", namespace='*tags')
    text = TextField(["here", "are", "some", "words", "."],
                     [token_indexers["single id"]("words")])
    tags = ["B", "I", "O", "O", "O"]
    tag_field = TagField(tags, text, tag_namespace="*tags")
    tag_field.index(vocab)
    # pylint: disable=protected-access
    assert tag_field._indexed_tags == [b_index, i_index, o_index, o_index, o_index]
    assert tag_field._num_tags == 3

def _get_vocab(words_by_freq, max_v_sizes, word_freq_thresh):
    """Build vocabulary by selecting the most frequent tokens."""
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes)
    words_by_freq = dict(words_by_freq)
    try:
        words_by_freq.pop("<unk>")  # remove special token, TODO
    except KeyError:
        pass
    for special in SPECIALS:
        vocab.add_token_to_namespace(special, "tokens")
    for word, freq in list(words_by_freq.items())[: max_v_sizes["word"]]:
        if freq >= word_freq_thresh:
            vocab.add_token_to_namespace(word, "tokens")
    return vocab

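# Hypothetical usage of _get_vocab above. SPECIALS (sentinel tokens) and the
# input dict are illustrative, not from the source; note the function assumes
# words_by_freq is already sorted by descending frequency, since it truncates
# before applying the threshold.
SPECIALS = ["<SOS>", "<EOS>"]  # assumed sentinel tokens

words_by_freq = {"the": 120, "cat": 40, "sat": 2}  # already frequency-sorted
vocab = _get_vocab(words_by_freq, max_v_sizes={"word": 2}, word_freq_thresh=5)
# Only "the" and "cat" are added: the slice keeps the top two entries, and
# both clear the frequency threshold of 5.
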
def test_multilabel_field_empty_field_works(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
    vocab.add_token_to_namespace("label2", namespace="test_empty_labels")
    f = MultiLabelField([], label_namespace="test_empty_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

    g = f.empty_field()
    g.index(vocab)
    tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

    h = MultiLabelField(
        [0, 0, 1], label_namespace="test_empty_labels", num_labels=3, skip_indexing=True
    )
    tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0]))

def main():
    infilename = 'test/fixtures/bioul_to_span.json'
    with open(infilename) as f:
        d = json.load(f)
    docs = d['tag']
    vocab = Vocabulary()
    vocab.add_token_to_namespace('O', namespace='span_labels')  # reserved label for no-entity
    for doc in docs:
        for label in doc:
            if label != 'O':
                # Drop the first two characters (the BIOUL prefix, e.g. "B-"),
                # which are not part of the span label.
                span_label = label[2:]
                vocab.add_token_to_namespace(span_label,
                                             namespace='span_labels')  # TODO: is this the right namespace?
    # This function expects the vocab to already be initialized with span labels.
    batched_bioul_to_span_tesnors(docs, vocab)

def test_namespaces(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word", namespace='1')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
    assert vocab.get_token_index("word", namespace='1') == word_index
    assert vocab.get_token_from_index(word_index, namespace='1') == "word"
    assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

    # Now add the same word in a different namespace, plus a second word, and
    # make sure the new namespace starts fresh.
    word2_index = vocab.add_token_to_namespace("word2", namespace='2')
    word_index = vocab.add_token_to_namespace("word", namespace='2')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert vocab.get_token_index("word", namespace='2') == word_index
    assert vocab.get_token_index("word2", namespace='2') == word2_index
    assert vocab.get_token_from_index(word_index, namespace='2') == "word"
    assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
    assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2

def test():
    from pprint import pprint

    params = Params({'token_embedder': {'num_embeddings': 4, 'embedding_dim': 3}})
    vocab = Vocabulary()
    while True:
        vocab_size = vocab.get_vocab_size()
        if vocab_size == 4:
            break
        vocab.add_token_to_namespace('a' + str(vocab_size))
    model = BaselineModel(params=params, vocab=vocab)
    premise = {'tokens': torch.randint(low=0, high=4, size=(5, 6))}
    hypothesis = {'tokens': torch.randint(low=0, high=4, size=(5, 7))}
    label = torch.randint(low=0, high=3, size=(5,))
    output = model(premise=premise, hypothesis=hypothesis, label=label)
    pprint(output)
    pprint(model.get_metrics())

def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                              1: '@@UNKNOWN@@',
                                                              2: 'a', 3: 'c', 4: 'b'}

    # Test that from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test that from_params raises when any dict keys other than 'directory_path'
    # are present and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir,
                                           "min_count": {'tokens': 2}}))

def get_vocab(
    word2freq: Dict[str, int], char2freq: Dict[str, int], max_v_sizes: Dict[str, int]
) -> Vocabulary:
    """Build vocabulary by selecting the most frequent tokens.

    Parameters
    ----------
    word2freq : Dict[str, int]
        Dict mapping words to frequencies.
    char2freq : Dict[str, int]
        Dict mapping chars to frequencies.
    max_v_sizes : Dict[str, int]
        Dict used to set max vocab size for each token namespace.

    Returns
    -------
    allennlp.data.Vocabulary
        vocab containing word and char namespaces.
    """
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes)
    for special in SPECIALS:
        vocab.add_token_to_namespace(special, "tokens")

    words_by_freq = [(word, freq) for word, freq in word2freq.items()]
    words_by_freq.sort(key=lambda x: x[1], reverse=True)
    for word, _ in words_by_freq[: max_v_sizes["word"]]:
        vocab.add_token_to_namespace(word, "tokens")

    chars_by_freq = [(char, freq) for char, freq in char2freq.items()]
    chars_by_freq.sort(key=lambda x: x[1], reverse=True)
    for char, _ in chars_by_freq[: max_v_sizes["char"]]:
        vocab.add_token_to_namespace(char, "chars")
    return vocab

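# Hypothetical usage of get_vocab above. SPECIALS is assumed to be a list of
# sentinel tokens defined in the same module, and the "word"/"char" keys of
# max_v_sizes mirror the slicing in the function body.
word2freq = {"the": 100, "cat": 10, "sat": 1}
char2freq = {"t": 50, "e": 30, "h": 20, "a": 15, "s": 8, "c": 5}
vocab = get_vocab(word2freq, char2freq, max_v_sizes={"word": 2, "char": 4})
# "tokens" holds padding/OOV + the specials + the 2 most frequent words;
# "chars" holds padding/OOV + the 4 most frequent characters.
print(vocab.get_vocab_size("tokens"), vocab.get_vocab_size("chars"))
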
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
    capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
    s_index = vocab.add_token_to_namespace("s", namespace='characters')
    e_index = vocab.add_token_to_namespace("e", namespace='characters')
    n_index = vocab.add_token_to_namespace("n", namespace='characters')
    t_index = vocab.add_token_to_namespace("t", namespace='characters')
    c_index = vocab.add_token_to_namespace("c", namespace='characters')

    field = TextField(["A", "sentence"], {"words": SingleIdTokenIndexer(namespace="words")})
    field.index(vocab)
    # pylint: disable=protected-access
    assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

    field1 = TextField(["A", "sentence"],
                       {"characters": TokenCharactersIndexer(namespace="characters")})
    field1.index(vocab)
    assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                    [s_index, e_index, n_index, t_index,
                                                     e_index, n_index, c_index, e_index]]

    field2 = TextField(["A", "sentence"],
                       token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                       "characters": TokenCharactersIndexer(namespace="characters")})
    field2.index(vocab)
    assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
    assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                    [s_index, e_index, n_index, t_index,
                                                     e_index, n_index, c_index, e_index]]

def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField([Token(t) for t in ["a", "b"]],
                           {"tokens": SingleIdTokenIndexer("tokens")})
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory`, instances must be passed
    # in the Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"type": "extend", "directory": vocab_dir})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, the `directory` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"type": "extend"})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)

def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField([Token(t) for t in ["a", "b"]],
                           {"tokens": SingleIdTokenIndexer("tokens")})
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory_path`, instances must be passed
    # in the Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"directory_path": vocab_dir, "extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, the `directory_path` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)

def test_unlabeled():
    from pprint import pprint

    params = Params({
        'token_embedder': {'num_embeddings': 4, 'embedding_dim': 300},
        'code_dist_type': 'gaussian'
    })
    vocab = Vocabulary()
    while True:
        vocab_size = vocab.get_vocab_size()
        if vocab_size == 4:
            break
        vocab.add_token_to_namespace('a' + str(vocab_size))
    model = DeconvSNLIModel(params=params, vocab=vocab)
    premise = {'tokens': torch.randint(low=0, high=4, size=(5, 29))}
    hypothesis = {'tokens': torch.randint(low=0, high=4, size=(5, 29))}
    output = model(premise=premise, hypothesis=hypothesis, label=None)
    pprint(output)
    pprint(model.get_metrics())

def test_vocab_can_print(self):
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")
    vocab.add_token_to_namespace("b3", namespace="b")
    print(vocab)

def _build_vocabulary(self, summaries: List[List[str]]):
    vocab = Vocabulary()
    vocab.add_token_to_namespace(START_SYMBOL)
    vocab.add_token_to_namespace(END_SYMBOL)
    for summary in summaries:
        for sentence in summary:
            for token in sentence.split():
                vocab.add_token_to_namespace(token)
    return vocab

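# Illustrative call for _build_vocabulary above. START_SYMBOL and END_SYMBOL
# are assumed to be AllenNLP's usual sentinels; `summarizer` is a hypothetical
# instance of the class that owns the method.
from allennlp.common.util import START_SYMBOL, END_SYMBOL

summaries = [["the cat sat", "it purred"], ["dogs bark"]]
vocab = summarizer._build_vocabulary(summaries)
# The default "tokens" namespace now holds padding/OOV, both sentinels, and
# every whitespace-separated token from the summaries.
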
def test_saving_and_loading(self):
    # pylint: disable=protected-access
    vocab_dir = self.TEST_DIR / u'vocab_save'

    vocab = Vocabulary(non_padded_namespaces=[u"a", u"c"])
    vocab.add_token_to_namespace(u"a0", namespace=u"a")  # non-padded, should start at 0
    vocab.add_token_to_namespace(u"a1", namespace=u"a")
    vocab.add_token_to_namespace(u"a2", namespace=u"a")
    vocab.add_token_to_namespace(u"b2", namespace=u"b")  # padded, should start at 2
    vocab.add_token_to_namespace(u"b3", namespace=u"b")
    vocab.save_to_files(vocab_dir)

    vocab2 = Vocabulary.from_files(vocab_dir)
    assert vocab2._non_padded_namespaces == set([u"a", u"c"])

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace=u'a') == 3
    assert vocab2.get_token_from_index(0, namespace=u'a') == u'a0'
    assert vocab2.get_token_from_index(1, namespace=u'a') == u'a1'
    assert vocab2.get_token_from_index(2, namespace=u'a') == u'a2'
    assert vocab2.get_token_index(u'a0', namespace=u'a') == 0
    assert vocab2.get_token_index(u'a1', namespace=u'a') == 1
    assert vocab2.get_token_index(u'a2', namespace=u'a') == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace=u'b') == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace=u'b') == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace=u'b') == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace=u'b') == u'b2'
    assert vocab2.get_token_from_index(3, namespace=u'b') == u'b3'
    assert vocab2.get_token_index(vocab._padding_token, namespace=u'b') == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace=u'b') == 1
    assert vocab2.get_token_index(u'b2', namespace=u'b') == 2
    assert vocab2.get_token_index(u'b3', namespace=u'b') == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary(u"a") == vocab2.get_index_to_token_vocabulary(u"a")
    assert vocab.get_index_to_token_vocabulary(u"b") == vocab2.get_index_to_token_vocabulary(u"b")

def test_label_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("entailment", namespace="labels")
    vocab.add_token_to_namespace("contradiction", namespace="labels")
    vocab.add_token_to_namespace("neutral", namespace="labels")
    label = LabelField("entailment")
    label.index(vocab)
    tensor = label.as_tensor(label.get_padding_lengths())
    assert tensor.item() == 0

def test_label_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("entailment", namespace="labels")
    vocab.add_token_to_namespace("contradiction", namespace="labels")
    vocab.add_token_to_namespace("neutral", namespace="labels")
    label = LabelField("entailment")
    label.index(vocab)
    tensor = label.as_tensor(label.get_padding_lengths()).data.cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0]))

def test_label_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("entailment", namespace="*labels")
    vocab.add_token_to_namespace("contradiction", namespace="*labels")
    vocab.add_token_to_namespace("neutral", namespace="*labels")
    label = LabelField("entailment")
    label.index(vocab)
    array = label.pad(label.get_padding_lengths())
    numpy.testing.assert_array_almost_equal(array[0], numpy.array([1, 0, 0]))

def test_token_to_indices_produces_correct_characters(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace='characters')
    vocab.add_token_to_namespace("s", namespace='characters')
    vocab.add_token_to_namespace("e", namespace='characters')
    vocab.add_token_to_namespace("n", namespace='characters')
    vocab.add_token_to_namespace("t", namespace='characters')
    vocab.add_token_to_namespace("c", namespace='characters')
    indexer = TokenCharactersIndexer("characters")
    indices = indexer.token_to_indices("sentential", vocab)
    assert indices == [3, 4, 5, 6, 4, 5, 6, 1, 1, 1]

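# A note on the expected indices above (a hedged reading, based on AllenNLP's
# default padded-namespace behaviour): "sentential" is s,e,n,t,e,n,t,i,a,l, and
# "i", "a", "l" were never added to the 'characters' namespace, so each maps to
# the OOV index 1 (@@UNKNOWN@@); index 0 is reserved for @@PADDING@@.
from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("s", namespace='characters')  # gets index 2, after padding/OOV
assert vocab.get_token_index("z", namespace='characters') == 1  # unseen char -> OOV
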
def test_multilabel_field_returns_correct_empty_sequence(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
    vocab.add_token_to_namespace("label2", namespace="test_empty_labels")
    f = MultiLabelField([], label_namespace="test_empty_labels")
    f.empty_field()

    vocab = Vocabulary()
    vocab.add_token_to_namespace("rel0", namespace="rel_labels")
    vocab.add_token_to_namespace("rel1", namespace="rel_labels")
    vocab.add_token_to_namespace("rel2", namespace="rel_labels")
    f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    f.empty_field()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))

def test_saving_and_loading(self):
    # pylint: disable=protected-access
    vocab_dir = self.TEST_DIR / 'vocab_save'

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    vocab2 = Vocabulary.from_files(vocab_dir)
    assert vocab2._non_padded_namespaces == {"a", "c"}

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace='a') == 3
    assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
    assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
    assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
    assert vocab2.get_token_index('a0', namespace='a') == 0
    assert vocab2.get_token_index('a1', namespace='a') == 1
    assert vocab2.get_token_index('a2', namespace='a') == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
    assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
    assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
    assert vocab2.get_token_index('b2', namespace='b') == 2
    assert vocab2.get_token_index('b3', namespace='b') == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

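# A minimal standalone sketch (assuming an AllenNLP install) of the indexing
# rule the save/load tests above exercise: padded namespaces reserve indices
# 0 and 1 for @@PADDING@@ and @@UNKNOWN@@, while non-padded namespaces start at 0.
from allennlp.data import Vocabulary

vocab = Vocabulary(non_padded_namespaces=["a"])
assert vocab.add_token_to_namespace("a0", namespace="a") == 0  # non-padded
assert vocab.add_token_to_namespace("b2", namespace="b") == 2  # padded: 0/1 reserved
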
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["single id"]("words")])
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["characters"]("characters")])
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["single id"]("words"),
                                          token_indexers["characters"]("characters")])
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField(["A", "sentence"],
                          [token_indexers["single id"](token_namespace="words")])
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens == [[capital_a_index, sentence_index]]

        field1 = TextField(["A", "sentence"],
                           [token_indexers["characters"](character_namespace="characters")])
        field1.index(vocab)
        assert field1._indexed_tokens == [[[capital_a_char_index],
                                           [s_index, e_index, n_index, t_index,
                                            e_index, n_index, c_index, e_index]]]

        field2 = TextField(["A", "sentence"],
                           token_indexers=[token_indexers["single id"](token_namespace="words"),
                                           token_indexers["characters"](character_namespace="characters")])
        field2.index(vocab)
        assert field2._indexed_tokens == [[capital_a_index, sentence_index],
                                          [[capital_a_char_index],
                                           [s_index, e_index, n_index, t_index,
                                            e_index, n_index, c_index, e_index]]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["single id"]("words")])
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["single id"]("words")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["characters"]("characters")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["characters"]("characters"),
                                          token_indexers["single id"]("words")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

    def test_pad_handles_words(self):
        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["single id"]("words")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        arrays = field.pad(padding_lengths)
        numpy.testing.assert_array_almost_equal(arrays[0], numpy.array([1, 1, 1, 2, 1]))

    def test_pad_handles_longer_lengths(self):
        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["single id"]("words")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 10
        arrays = field.pad(padding_lengths)
        numpy.testing.assert_array_almost_equal(arrays[0],
                                                numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_pad_handles_characters(self):
        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[token_indexers["characters"]("characters")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        arrays = field.pad(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(arrays[0], expected_character_array)

    def test_pad_handles_words_and_characters_with_longer_lengths(self):
        field = TextField(["a", "sentence", "."],
                          token_indexers=[token_indexers["single id"]("words"),
                                          token_indexers["characters"]("characters")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        arrays = field.pad(padding_lengths)
        numpy.testing.assert_array_almost_equal(arrays[0], numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(arrays[1],
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

def test_from_params_valid_vocab_extension_thoroughly(self):
    """
    Tests valid vocab extension thoroughly: vocab extension is valid when
    overlapping namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0  padded
            tokens1  non-padded
            tokens2  padded
            tokens3  non-padded
        instances namespaces
            tokens0  padded
            tokens1  non-padded
            tokens4  padded
            tokens5  non-padded

    Typical extension example (of the tokens1 namespace):
        -> original_vocab index2token: 0->apple, 1->bat, 2->cat
        -> tokens to be extended with: cat, an, apple, banana, atom, bat
        -> extended_vocab index2token: 0->apple, 1->bat, 2->cat, 3->an, 4->atom, 5->banana
    """
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")  # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")  # index:4
    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")  # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")  # index:2
    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:2 (padded namespace)
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:3
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:4
    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1
    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField(
        [Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
        {"tokens0": SingleIdTokenIndexer("tokens0")},
    )
    text_field1 = TextField(
        [Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
        {"tokens1": SingleIdTokenIndexer("tokens1")},
    )
    text_field4 = TextField(
        [Token(t) for t in ["l", "m", "n", "o"]], {"tokens4": SingleIdTokenIndexer("tokens4")}
    )
    text_field5 = TextField(
        [Token(t) for t in ["x", "y", "z"]], {"tokens5": SingleIdTokenIndexer("tokens5")}
    )
    instances = Batch(
        [
            Instance(
                {
                    "text0": text_field0,
                    "text1": text_field1,
                    "text4": text_field4,
                    "text5": text_field5,
                }
            )
        ]
    )

    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens5"],
        }
    )
    extended_vocab = Vocabulary.from_params(params, instances=instances)

    # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 are only
    # in the vocab; tokens4 and tokens5 are only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that the _non_padded_namespaces set is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens, the "tokens1" namespace of the
    # instances has 5 tokens, and 2 overlap.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 6 + padding + OOV, since tokens0 is padded

    # Namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab.
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # Namespaces tokens4 and tokens5 were only in the instances.
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l, m, n, o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x, y, z

    # The word-to-index mapping of all words in all namespaces of original_vocab
    # should be maintained in extended_vocab.
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And the same for the index-to-word mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token

class TestTransformerToolkit(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.vocab = Vocabulary()
        # Populate the vocab.
        self.vocab.add_token_to_namespace("word")
        self.vocab.add_token_to_namespace("the")
        self.vocab.add_token_to_namespace("an")

    def test_create_embedder_using_toolkit(self):
        embedding_file = str(self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz")

        class TinyTransformer(TokenEmbedder):
            def __init__(self, vocab, embedding_dim, hidden_size, intermediate_size):
                super().__init__()
                self.embeddings = Embedding(
                    pretrained_file=embedding_file,
                    embedding_dim=embedding_dim,
                    projection_dim=hidden_size,
                    vocab=vocab,
                )
                self.transformer = TransformerStack(
                    num_hidden_layers=4,
                    hidden_size=hidden_size,
                    intermediate_size=intermediate_size,
                )

            @overrides
            def forward(self, token_ids: torch.LongTensor):
                x = self.embeddings(token_ids)
                x = self.transformer(x)
                return x

        tiny = TinyTransformer(self.vocab, embedding_dim=300, hidden_size=80, intermediate_size=40)
        tiny.forward(torch.LongTensor([[0, 1, 2]]))

    def test_use_first_four_layers_of_pretrained(self):
        pretrained = "bert-base-cased"

        class SmallTransformer(TokenEmbedder):
            def __init__(self):
                super().__init__()
                self.embeddings = TransformerEmbeddings.from_pretrained_module(
                    pretrained, relevant_module="bert.embeddings"
                )
                self.transformer = TransformerStack.from_pretrained_module(
                    pretrained,
                    num_hidden_layers=4,
                    relevant_module="bert.encoder",
                    strict=False,
                )

            @overrides
            def forward(self, token_ids: torch.LongTensor):
                x = self.embeddings(token_ids)
                x = self.transformer(x)
                return x

        small = SmallTransformer()
        assert len(small.transformer.layers) == 4
        small(torch.LongTensor([[0, 1, 2]]))

    def test_use_selected_layers_of_bert_for_different_purposes(self):
        class MediumTransformer(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.embeddings = TransformerEmbeddings.from_pretrained_module(
                    "bert-base-cased", relevant_module="bert.embeddings"
                )
                self.separate_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-cased",
                    relevant_module="bert.encoder",
                    num_hidden_layers=8,
                    strict=False,
                )
                self.combined_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-cased",
                    relevant_module="bert.encoder",
                    num_hidden_layers=4,
                    mapping={f"layer.{l}": f"layers.{i}" for (i, l) in enumerate(range(8, 12))},
                    strict=False,
                )

            @overrides
            def forward(
                self,
                left_token_ids: torch.LongTensor,
                right_token_ids: torch.LongTensor,
            ):
                left = self.embeddings(left_token_ids)
                left = self.separate_transformer(left)

                right = self.embeddings(right_token_ids)
                right = self.separate_transformer(right)

                # Combine the sequences in some meaningful way. Here, we just add them.
                # combined = combine_masked_sequences(left, left_mask, right, right_mask)
                combined = left + right

                return self.combined_transformer(combined)

        medium = MediumTransformer()
        assert len(medium.separate_transformer.layers) == 8
        assert len(medium.combined_transformer.layers) == 4

        pretrained = cached_transformers.get("bert-base-cased", False)
        pretrained_layers = dict(pretrained.encoder.layer.named_modules())

        separate_layers = dict(medium.separate_transformer.layers.named_modules())
        assert_allclose(
            separate_layers["0"].intermediate.dense.weight.data,
            pretrained_layers["0"].intermediate.dense.weight.data,
        )

        combined_layers = dict(medium.combined_transformer.layers.named_modules())
        assert_allclose(
            combined_layers["0"].intermediate.dense.weight.data,
            pretrained_layers["8"].intermediate.dense.weight.data,
        )
        assert_allclose(
            combined_layers["1"].intermediate.dense.weight.data,
            pretrained_layers["9"].intermediate.dense.weight.data,
        )
        assert_allclose(
            combined_layers["2"].intermediate.dense.weight.data,
            pretrained_layers["10"].intermediate.dense.weight.data,
        )
        assert_allclose(
            combined_layers["3"].intermediate.dense.weight.data,
            pretrained_layers["11"].intermediate.dense.weight.data,
        )

    def test_combination_of_two_different_berts(self):
        # Regular BERT, but with ALBERT's special compressed embedding scheme.
        class AlmostRegularTransformer(TokenEmbedder):
            def __init__(self):
                super().__init__()
                self.embeddings = AutoModel.from_pretrained("albert-base-v2").embeddings
                self.transformer = TransformerStack.from_pretrained_module(
                    "bert-base-cased", relevant_module="bert.encoder"
                )
                # We want to tune only the embeddings, because that's our experiment.
                self.transformer.requires_grad = False

            @overrides
            def forward(self, token_ids: torch.LongTensor, mask: torch.BoolTensor):
                x = self.embeddings(token_ids, mask)
                x = self.transformer(x)
                return x

        almost = AlmostRegularTransformer()
        assert len(almost.transformer.layers) == 12
        assert isinstance(almost.embeddings, AlbertEmbeddings)

    @pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"])
    def test_end_to_end(self, model_name: str):
        data = [
            ("I'm against picketing", "but I don't know how to show it."),
            ("I saw a human pyramid once.", "It was very unnecessary."),
        ]
        tokenizer = cached_transformers.get_tokenizer(model_name)
        batch = tokenizer.batch_encode_plus(data, padding=True, return_tensors="pt")

        with torch.no_grad():
            huggingface_model = cached_transformers.get(model_name, make_copy=False).eval()
            huggingface_output = huggingface_model(**batch)

            embeddings = TransformerEmbeddings.from_pretrained_module(model_name).eval()
            transformer_stack = TransformerStack.from_pretrained_module(model_name).eval()
            pooler = TransformerPooler.from_pretrained_module(model_name).eval()

            batch["attention_mask"] = batch["attention_mask"].to(torch.bool)
            output = embeddings(**batch)
            output = transformer_stack(output, batch["attention_mask"])
            assert_allclose(
                output.final_hidden_states,
                huggingface_output.last_hidden_state,
                rtol=0.0001,
                atol=1e-4,
            )

            output = pooler(output.final_hidden_states)
            assert_allclose(output, huggingface_output.pooler_output, rtol=0.0001, atol=1e-4)

def test_valid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / "vocab_save"

    # Test: padded/non-padded common namespaces are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_tokens_to_namespace(["d", "a", "b"], namespace="tokens")
        text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        vocab_dir = self.TEST_DIR / "vocab_save"
        shutil.rmtree(vocab_dir, ignore_errors=True)
        original_vocab.save_to_files(vocab_dir)
        instances = Batch([Instance({"text": text_field})])

        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": non_padded_namespaces,
        })
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        extra_count = 2 if extended_vocab.is_padded("tokens") else 0
        assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
        assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
        assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count
        assert extended_vocab.get_token_index("c", "tokens")  # should be present
        assert extended_vocab.get_token_index("e", "tokens")  # should be present
        assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")  # index 2 if padded, else 0
        text_field = TextField([Token(t) for t in ["b"]],
                               {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text": text_field})])
        vocab_dir = self.TEST_DIR / "vocab_save"
        shutil.rmtree(vocab_dir, ignore_errors=True)
        original_vocab.save_to_files(vocab_dir)

        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": non_padded_namespaces,
        })
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        # Should have two namespaces.
        assert len(extended_vocab._token_to_index) == 2
        extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
        assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count
        extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
        assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count

class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", "words")
        self.vocab.add_token_to_namespace(".", "words")
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": LabelField(1)})
        instance2 = Instance({"words": TextField(["hello"], [])})
        with pytest.raises(ConfigurationError):
            _ = Dataset([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = self.get_dataset()
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5}, "text2": {"num_tokens": 6}}

    def test_as_arrays(self):
        dataset = self.get_dataset()
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        arrays = dataset.as_arrays(padding_lengths)
        text1 = arrays["text1"][0]
        text2 = arrays["text2"][0]
        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_dataset(self):
        field1 = TextField(["this", "is", "a", "sentence", "."],
                           [token_indexers["single id"]("words")])
        field2 = TextField(["this", "is", "a", "different", "sentence", "."],
                           [token_indexers["single id"]("words")])
        field3 = TextField(["here", "is", "a", "sentence", "."],
                           [token_indexers["single id"]("words")])
        field4 = TextField(["this", "is", "short"],
                           [token_indexers["single id"]("words")])
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return Dataset(instances)

def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests valid vocab extension thoroughly: vocab extension is valid when
    overlapping namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0  padded
            tokens1  non-padded
            tokens2  padded
            tokens3  non-padded
        instances namespaces
            tokens0  padded
            tokens1  non-padded
            tokens4  padded
            tokens5  non-padded

    Typical extension example (of the tokens1 namespace):
        -> original_vocab index2token: 0->apple, 1->bat, 2->cat
        -> tokens to be extended with: cat, an, apple, banana, atom, bat
        -> extended_vocab index2token: 0->apple, 1->bat, 2->cat, 3->an, 4->atom, 5->banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")  # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")  # index:4
    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")  # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")  # index:2
    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:2 (padded namespace)
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:3
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:4
    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1
    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 are only
    # in the vocab; tokens4 and tokens5 are only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that the _non_padded_namespaces set is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens, the "tokens1" namespace of the
    # instances has 5 tokens, and 2 overlap.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 6 + padding + OOV, since tokens0 is padded

    # Namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab.
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # Namespaces tokens4 and tokens5 were only in the instances.
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l, m, n, o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x, y, z

    # The word-to-index mapping of all words in all namespaces of original_vocab
    # should be maintained in extended_vocab.
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And the same for the index-to-word mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token

def test_valid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    extension_ways = ["from_params", "extend_from_instances"]

    # Test: padded/non-padded common namespaces are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("d", namespace="tokens")
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.add_token_to_namespace("b", namespace="tokens")
        text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])
        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count
            assert extended_vocab.get_token_index("c", "tokens")  # should be present
            assert extended_vocab.get_token_index("e", "tokens")  # should be present
            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")  # index 2 if padded, else 0
        text_field = TextField([Token(t) for t in ["b"]],
                               {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text": text_field})])
        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            # Should have two namespaces.
            assert len(extended_vocab._token_to_index) == 2
            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count
            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count