def test_from_instances_exclusive_embeddings_file_inside_archive(self):
    """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
    # Read embeddings file from archive
    archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

    with zipfile.ZipFile(archive_path, 'w') as archive:
        file_path = 'embedding.3d.vec'
        with archive.open(file_path, 'w') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))
        with archive.open('dummy.vec', 'w') as dummy_file:
            dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

    embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)
    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words
def test_multilabel_field_empty_field_works(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
    vocab.add_token_to_namespace("label2", namespace="test_empty_labels")
    f = MultiLabelField([], label_namespace="test_empty_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
def test_from_dataset_respects_max_vocab_size_single_int(self):
    max_vocab_size = 1
    vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
    words = vocab.get_index_to_token_vocabulary().values()
    # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
    assert len(words) == max_vocab_size + 2

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert len(words) == 5
def test_unknown_token(self):
    # pylint: disable=protected-access
    # We're putting this behavior in a test so that the behavior is documented.  There is
    # solver code that depends in a small way on how we treat the unknown token, so any
    # breaking change to this behavior should break a test, so you know you've done something
    # that needs more consideration.
    vocab = Vocabulary()
    oov_token = vocab._oov_token
    oov_index = vocab.get_token_index(oov_token)
    assert oov_index == 1
    assert vocab.get_token_index("unseen word") == oov_index
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
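# A hedged sketch (not from the source) of how a vocab index mapping like the one above is
# typically consumed when warm-starting embeddings from an archived model: rows whose tokens
# also exist in the archived vocabulary are copied over, while genuinely new tokens keep their
# fresh initialization. The tensors and the mapping below are hypothetical.
import torch

new_weight = torch.randn(10, 4)        # hypothetical new embedding matrix (10 tokens, dim 4)
archived_weight = torch.randn(8, 4)    # hypothetical archived embedding matrix
vocab_index_mapping = [(0, 0), (1, 3), (5, 2)]   # (new_index, archived_index) pairs
for new_index, archived_index in vocab_index_mapping:
    new_weight[new_index] = archived_weight[archived_index]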
def test_from_dataset_respects_min_count(self):
    vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
def test_vocab_can_print(self):
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")
    vocab.add_token_to_namespace("b3", namespace="b")
    print(vocab)
def __init__(self,
             vocab: Vocabulary,
             sentence_embedder: TextFieldEmbedder,
             action_embedding_dim: int,
             encoder: Seq2SeqEncoder,
             dropout: float = 0.0,
             rule_namespace: str = 'rule_labels') -> None:
    super(NlvrSemanticParser, self).__init__(vocab=vocab)

    self._sentence_embedder = sentence_embedder
    self._denotation_accuracy = Average()
    self._consistency = Average()
    self._encoder = encoder
    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    self._rule_namespace = rule_namespace

    self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                      embedding_dim=action_embedding_dim)

    # This is what we pass as input in the first step of decoding, when we don't have a
    # previous action.
    self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
    torch.nn.init.normal_(self._first_action_embedding)
def index(self, vocab: Vocabulary):
    if self._indexed_labels is None:
        self._indexed_labels = [vocab.get_token_index(label, self._label_namespace)  # type: ignore
                                for label in self.labels]
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    dep_labels = [token.dep_ or 'NONE' for token in tokens]
    return {index_name: [vocabulary.get_token_index(dep_label, self.namespace)
                         for dep_label in dep_labels]}
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)
    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
    assert indexed_tokens == indexed_tokens2
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    tags = ['NONE' if token.ent_type_ is None else token.ent_type_ for token in tokens]
    return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
def test_max_vocab_size_dict(self):
    params = Params({"max_vocab_size": {"tokens": 1, "characters": 20}})
    vocab = Vocabulary.from_params(params=params, instances=self.dataset)
    words = vocab.get_index_to_token_vocabulary().values()
    # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
    assert len(words) == 3
def test_set_from_file_reads_non_padded_files(self):
    # pylint: disable=protected-access
    vocab_filename = self.TEST_DIR / 'vocab_file'
    with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
        vocab_file.write('B-PERS\n')
        vocab_file.write('I-PERS\n')
        vocab_file.write('O\n')
        vocab_file.write('B-ORG\n')
        vocab_file.write('I-ORG\n')

    vocab = Vocabulary()
    vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')
    assert vocab.get_token_index("B-PERS", namespace='tags') == 0
    assert vocab.get_token_index("I-PERS", namespace='tags') == 1
    assert vocab.get_token_index("O", namespace='tags') == 2
    assert vocab.get_token_index("B-ORG", namespace='tags') == 3
    assert vocab.get_token_index("I-ORG", namespace='tags') == 4
    assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
    assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
    assert vocab.get_token_from_index(2, namespace='tags') == "O"
    assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
    assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
def test_from_dataset_respects_inclusive_embedding_file(self):
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
def test_max_vocab_size_partial_dict(self):
    indexers = {"tokens": SingleIdTokenIndexer(), "token_characters": TokenCharactersIndexer()}
    instance = Instance({
            'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')],
                              indexers)
    })
    dataset = Batch([instance])
    params = Params({"max_vocab_size": {"tokens": 1}})
    vocab = Vocabulary.from_params(params=params, instances=dataset)
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2
def test_multilabel_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("rel0", namespace="rel_labels")
    vocab.add_token_to_namespace("rel1", namespace="rel_labels")
    vocab.add_token_to_namespace("rel2", namespace="rel_labels")

    f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))
def test_registrability(self):

    @Vocabulary.register('my-vocabulary')
    class MyVocabulary:
        @classmethod
        def from_params(cls, params, instances=None):
            # pylint: disable=unused-argument
            return MyVocabulary()

    params = Params({'type': 'my-vocabulary'})
    instance = Instance(fields={})
    vocab = Vocabulary.from_params(params=params, instances=[instance])
    assert isinstance(vocab, MyVocabulary)
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    indices: List[int] = []
    for token in tokens:
        if getattr(token, 'text_id', None) is not None:
            # `text_id` being set on the token means that we aren't using the vocab, we just use
            # this id instead.
            indices.append(token.text_id)
        else:
            text = token.text
            if self.lowercase_tokens:
                text = text.lower()
            indices.append(vocabulary.get_token_index(text, self.namespace))
    return {index_name: indices}
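# A minimal usage sketch for the single-id indexer above (an illustration, not taken from the
# source; the import paths and the ``SingleIdTokenIndexer`` name/constructor are assumed to
# match the tests elsewhere in this file). In a padded namespace, indices 0 and 1 are padding
# and OOV, so the first added token gets index 2 and an unseen token falls back to index 1.
from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import SingleIdTokenIndexer

vocab = Vocabulary()
vocab.add_token_to_namespace("hello", namespace="tokens")   # gets index 2
indexer = SingleIdTokenIndexer(namespace="tokens")
print(indexer.tokens_to_indices([Token("hello"), Token("world")], vocab, "tokens"))
# {'tokens': [2, 1]}  -- "world" is unseen and maps to the OOV index, per test_unknown_token above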
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField([Token(t) for t in ["a", "b"]],
                           {"tokens": SingleIdTokenIndexer("tokens")})
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory_path`, instances must be passed
    # in Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"directory_path": vocab_dir, "extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, `directory_path` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[List[int]]]:
    indices: List[List[int]] = []
    for token in tokens:
        token_indices: List[int] = []
        if token.text is None:
            raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
        for character in self._character_tokenizer.tokenize(token.text):
            if getattr(character, 'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just
                # use this id instead.
                index = character.text_id
            else:
                index = vocabulary.get_token_index(character.text, self._namespace)
            token_indices.append(index)
        indices.append(token_indices)
    return {index_name: indices}
def __init__(self,
             vocabulary: Vocabulary,
             tag_namespace: str = "tags",
             ignore_classes: List[str] = None,
             label_encoding: str = "BIO") -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the tag namespace.
    tag_namespace : ``str``, required.
        This metric assumes that a BIO format is used in which the labels are of the format:
        ["B-LABEL", "I-LABEL"].
    ignore_classes : ``List[str]``, optional.
        Span labels which will be ignored when computing span metrics.
        A "span label" is the part that comes after the BIO label, so it
        would be "ARG1" for the tag "B-ARG1". For example, by passing:

        ``ignore_classes=["V"]``

        the following sequence would not consider the "V" span at index (2, 3)
        when computing the precision, recall and F1 metrics:

        ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]

        This is helpful, for instance, to avoid computing metrics for "V" spans in a BIO
        tagging scheme which are typically not included.
    label_encoding : ``str``, optional (default = "BIO")
        The encoding used to specify label span endpoints in the sequence.
        Valid options are "BIO", "IOB1", or "BIOUL".
    """
    if label_encoding not in ["BIO", "IOB1", "BIOUL"]:
        raise ConfigurationError("Unknown label encoding - expected 'BIO', 'IOB1', 'BIOUL'.")

    self._label_encoding = label_encoding
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(tag_namespace)
    self._ignore_classes: List[str] = ignore_classes or []

    # These will hold per label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
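# A small illustration of the ``ignore_classes`` behaviour documented above. This is an
# assumption, not code from the source: it relies on the ``bio_tags_to_spans`` helper (which
# the span-based metrics use) returning (label, (start, end)) spans with inclusive endpoints.
from allennlp.data.dataset_readers.dataset_utils.span_utils import bio_tags_to_spans

tags = ["O", "O", "B-V", "I-V", "B-ARG1", "I-ARG1"]
print(sorted(bio_tags_to_spans(tags)))                            # [('ARG1', (4, 5)), ('V', (2, 3))]
print(sorted(bio_tags_to_spans(tags, classes_to_ignore=["V"])))   # [('ARG1', (4, 5))]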
def test_saving_and_loading(self):
    # pylint: disable=protected-access
    vocab_dir = self.TEST_DIR / 'vocab_save'

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")

    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    assert vocab2._non_padded_namespaces == {"a", "c"}

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace='a') == 3
    assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
    assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
    assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
    assert vocab2.get_token_index('a0', namespace='a') == 0
    assert vocab2.get_token_index('a1', namespace='a') == 1
    assert vocab2.get_token_index('a2', namespace='a') == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
    assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
    assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
    assert vocab2.get_token_index('b2', namespace='b') == 2
    assert vocab2.get_token_index('b3', namespace='b') == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
def index(self, vocab: Vocabulary):
    if self.is_global_rule and self._rule_id is None:
        self._rule_id = vocab.get_token_index(self.rule, self._vocab_namespace)
def index(self, vocab: Vocabulary):
    if self._label_id is None:
        self._label_id = vocab.get_token_index(self.label, self._label_namespace)  # type: ignore
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                              1: '@@UNKNOWN@@',
                                                              2: 'a', 3: 'c', 4: 'b'}

    # Test that from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test that from_params raises when there are any other dict keys present
    # apart from 'directory_path' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir, "min_count": {'tokens': 2}}))
def test_from_params_adds_tokens_to_vocab(self):
    vocab = Vocabulary.from_params(Params({'tokens_to_add': {'tokens': ['q', 'x', 'z']}}),
                                   self.dataset)
    assert vocab.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                             1: '@@UNKNOWN@@',
                                                             2: 'a', 3: 'c', 4: 'b',
                                                             5: 'q', 6: 'x', 7: 'z'}
def test_valid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    extension_ways = ["from_params", "extend_from_instances"]

    # Test: padded/non-padded common namespaces are extended appropriately
    non_padded_namespaces_list = [[], ["tokens"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("d", namespace="tokens")
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.add_token_to_namespace("b", namespace="tokens")
        text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])
        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count
            assert extended_vocab.get_token_index("c", "tokens")  # should be present
            assert extended_vocab.get_token_index("e", "tokens")  # should be present
            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extended appropriately
    non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")
        text_field = TextField([Token(t) for t in ["b"]],
                               {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text": text_field})])
        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            # Should have two namespaces
            assert len(extended_vocab._token_to_index) == 2
            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count
            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)

    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # The following three should raise an error: tokens1 is non-padded in original_vocab,
    # but the extension asks for it to be padded.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=[],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following three should not raise: overlapping namespaces have the same padding setting.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)
    extended_vocab = copy.copy(original_vocab)
    extended_vocab._extend(non_padded_namespaces=["tokens1"],
                           tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following three should raise an error: tokens2 is padded in original_vocab,
    # but the extension asks for it to be non-padded.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests valid vocab extension thoroughly: vocab extension is valid when overlapping
    namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
        original_vocab namespaces: tokens0 (padded), tokens1 (non-padded),
                                   tokens2 (padded), tokens3 (non-padded)
        instances namespaces:      tokens0 (padded), tokens1 (non-padded),
                                   tokens4 (padded), tokens5 (non-padded)

    Typical extension example (for the tokens1 namespace):
        -> original_vocab index2token: 0->apple, 1->bat, 2->cat
        -> tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab index2token: 0->apple, 1->bat, 2->cat, 3->an, 4->atom, 5->banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")    # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")    # index:4
    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")    # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")    # index:2
    original_vocab.add_token_to_namespace("a", namespace="tokens2")      # index:2
    original_vocab.add_token_to_namespace("b", namespace="tokens2")      # index:3
    original_vocab.add_token_to_namespace("c", namespace="tokens2")      # index:4
    original_vocab.add_token_to_namespace("p", namespace="tokens3")      # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")      # index:1
    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])

    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 exist only in the
    # original vocab; tokens4 and tokens5 exist only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that the _non_padded_namespaces set is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens; the instances' "tokens1" namespace has 6 tokens,
    # 3 of which overlap.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 2 more than tokens1 because padded

    # Namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab.
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # Namespaces tokens4 and tokens5 were only in the instances.
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l, m, n, o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x, y, z

    # The word-to-index mapping of all words in all namespaces of original_vocab
    # should be preserved in extended_vocab.
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And the same for the index-to-word mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token