def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                              1: '@@UNKNOWN@@',
                                                              2: 'a', 3: 'c', 4: 'b'}

    # Test that from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test that from_params raises when there are any other dict keys present
    # apart from 'directory_path' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir,
                                           "min_count": {'tokens': 2}}))
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)

    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # Following 3 should give an error: tokens1 is non-padded in original_vocab,
    # but the extension treats it as padded.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=[],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # Following 3 should not give an error: overlapping namespaces have the same padding setting.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)
    extended_vocab = copy.copy(original_vocab)
    extended_vocab._extend(non_padded_namespaces=["tokens1"],
                           tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # Following 3 should give an error: tokens2 is padded in original_vocab,
    # but the extension treats it as non-padded.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField([Token(t) for t in ["a", "b"]],
                           {"tokens": SingleIdTokenIndexer("tokens")})
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend the vocab from `directory_path`, instances must be passed
    # in as well, or else there is nothing to extend to.
    params = Params({"directory_path": vocab_dir, "extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend the vocab, the `directory_path` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
def test_saving_and_loading(self):
    # pylint: disable=protected-access
    vocab_dir = self.TEST_DIR / 'vocab_save'

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")

    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    assert vocab2._non_padded_namespaces == {"a", "c"}

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace='a') == 3
    assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
    assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
    assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
    assert vocab2.get_token_index('a0', namespace='a') == 0
    assert vocab2.get_token_index('a1', namespace='a') == 1
    assert vocab2.get_token_index('a2', namespace='a') == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
    assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
    assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
    assert vocab2.get_token_index('b2', namespace='b') == 2
    assert vocab2.get_token_index('b3', namespace='b') == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
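# --- Added note (not from the original tests) ---
# A minimal sketch of inspecting what save_to_files wrote, assuming the on-disk
# layout these tests target: a non_padded_namespaces.txt file plus one
# "<namespace>.txt" file per namespace, with one token per line in index order and
# the padding token omitted from padded namespaces.
import os

vocab_dir = "vocab_save"
for filename in sorted(os.listdir(vocab_dir)):
    with open(os.path.join(vocab_dir, filename), encoding="utf-8") as f:
        print(filename, [line.rstrip("\n") for line in f])
# Under the assumed layout, the vocab built above would give roughly:
#   a.txt -> ['a0', 'a1', 'a2']
#   b.txt -> ['@@UNKNOWN@@', 'b2', 'b3']
#   non_padded_namespaces.txt -> ['a', 'c']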
def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests valid vocab extension thoroughly: vocab extension is valid when overlapping
    namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
        original_vocab namespaces: tokens0 padded, tokens1 non-padded,
                                   tokens2 padded, tokens3 non-padded
        instances namespaces:      tokens0 padded, tokens1 non-padded,
                                   tokens4 padded, tokens5 non-padded

    Typical extension example (tokens1 namespace):
        original_vocab index2token: 0->apple, 1->bat, 2->cat
        tokens to be extended with: cat, an, apple, banana, atom, bat
        extended_vocab index2token: 0->apple, 1->bat, 2->cat, 3->an, 4->atom, 5->banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")    # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")    # index:4
    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")    # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")    # index:2
    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:2 (padded namespace)
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:3
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:4
    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1
    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])

    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 are only in the
    # vocab; tokens4 and tokens5 are only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that the _non_padded_namespaces set is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens, the "tokens1" namespace of the instances
    # has 6 tokens, and 3 of them overlap.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 2 extra (padding + oov) because padded

    # Namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab.
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # Namespaces tokens4 and tokens5 were only in the instances,
    # so their token counts come entirely from the instances.
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l, m, n, o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x, y, z

    # The word-to-index mapping of all words in all namespaces of original_vocab
    # should be maintained in extended_vocab.
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And the same for the index-to-word mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token
def test_valid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    extension_ways = ["from_params", "extend_from_instances"]

    # Test: padded/non-padded common namespaces are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("d", namespace="tokens")
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.add_token_to_namespace("b", namespace="tokens")

        text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])

        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count

            assert extended_vocab.get_token_index("c", "tokens")  # should be present
            assert extended_vocab.get_token_index("e", "tokens")  # should be present
            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")  # index 2 when padded, 0 otherwise

        text_field = TextField([Token(t) for t in ["b"]],
                               {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text": text_field})])

        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            # Should have two namespaces.
            assert len(extended_vocab._token_to_index) == 2

            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count
            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
def test_from_params_valid_vocab_extension_thoroughly(self):
    """
    Tests valid vocab extension thoroughly: vocab extension is valid when overlapping
    namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
        original_vocab namespaces: tokens0 padded, tokens1 non-padded,
                                   tokens2 padded, tokens3 non-padded
        instances namespaces:      tokens0 padded, tokens1 non-padded,
                                   tokens4 padded, tokens5 non-padded

    Typical extension example (tokens1 namespace):
        original_vocab index2token: 0->apple, 1->bat, 2->cat
        tokens to be extended with: cat, an, apple, banana, atom, bat
        extended_vocab index2token: 0->apple, 1->bat, 2->cat, 3->an, 4->atom, 5->banana
    """
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")    # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")    # index:4
    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")    # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")    # index:2
    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:2 (padded namespace)
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:3
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:4
    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1
    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])

    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 are only in the
    # vocab; tokens4 and tokens5 are only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that the _non_padded_namespaces set is consistent after extension.
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens, the "tokens1" namespace of the instances
    # has 6 tokens, and 3 of them overlap.
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 2 extra (padding + oov) because padded

    # Namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab.
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # Namespaces tokens4 and tokens5 were only in the instances,
    # so their token counts come entirely from the instances.
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l, m, n, o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x, y, z

    # The word-to-index mapping of all words in all namespaces of original_vocab
    # should be maintained in extended_vocab.
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And the same for the index-to-word mapping.
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token
def test_valid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    extension_ways = ["from_params", "extend_from_instances"]

    # Test: padded/non-padded common namespaces are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_tokens_to_namespace(["d", "a", "b"], namespace="tokens")

        text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])

        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count

            assert extended_vocab.get_token_index("c", "tokens")  # should be present
            assert extended_vocab.get_token_index("e", "tokens")  # should be present
            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extended appropriately.
    non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")  # index 2 when padded, 0 otherwise

        text_field = TextField([Token(t) for t in ["b"]],
                               {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text": text_field})])

        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            # Should have two namespaces.
            assert len(extended_vocab._token_to_index) == 2

            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count
            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_tokens_to_namespace(["a", "b"], namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)

    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # Following 3 should give an error: tokens1 is non-padded in original_vocab,
    # but the extension treats it as padded.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=[],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # Following 3 should not give an error: overlapping namespaces have the same padding setting.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)
    extended_vocab = copy.copy(original_vocab)
    extended_vocab._extend(non_padded_namespaces=["tokens1"],
                           tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # Following 3 should give an error: tokens2 is padded in original_vocab,
    # but the extension treats it as non-padded.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
loader = IrTupleDatasetReader(lazy=True, lowercase=args.lowercase)

def getInstances():
    for file in args.dataset_files:
        instances = loader.read(file)
        for i in instances:
            yield Instance({"text": i["target_tokens"]})

namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))

for instance in Tqdm.tqdm(getInstances()):
    instance.count_vocab_items(namespace_token_counts)

#with open(args.out_dir, "w", encoding="utf8") as out:
#    for n in namespace_token_counts:
#        #out.write("--" + n + "\n")
#        for w, i in namespace_token_counts[n].items():
#            out.write(w + "\t" + str(i) + "\n")

vocab = Vocabulary(namespace_token_counts, min_count={"tokens": 100})
vocab.save_to_files(args.out_dir)

#vocab = Vocabulary(namespace_token_counts, min_count={"tokens": 50})
#vocab.save_to_files(args.out_dir2)
#vocab = Vocabulary(namespace_token_counts, min_count={"tokens": 10})
#vocab.save_to_files(args.out_dir3)
# coding=utf-8
# @Author: 莫冉
# @Date: 2020-08-06
from allennlp.data.vocabulary import Vocabulary

vocab_file = "../data/base_bert/vocab.txt"
save_path = "../../../vocab_path"

vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")
vocab.set_from_file(vocab_file, is_padded=True, oov_token="[UNK]")
vocab.save_to_files(save_path)

print(vocab.get_token_index(vocab._oov_token))
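# --- Added usage sketch (not part of the original script) ---
# With oov_token="[UNK]", any token missing from vocab.txt should map to the [UNK]
# index in the default "tokens" namespace; the lookup token below is a made-up
# example, not something assumed to be in the file.
unk_index = vocab.get_token_index("[UNK]")
assert vocab.get_token_index("token-not-in-bert-vocab") == unk_index
print(unk_index, vocab.get_vocab_size("tokens"))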
        self.cuda_device = cuda_device

    def _extract_data(self, batch) -> np.ndarray:
        out_dict = self.model(**batch)
        return expit(tonp(out_dict["class_logits"]))

    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        pred_generator = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        pred_generator_tqdm = tqdm(pred_generator,
                                   total=self.iterator.get_num_batches(ds))
        preds = []
        with torch.no_grad():
            for batch in pred_generator_tqdm:
                batch = nn_util.move_to_device(batch, self.cuda_device)
                preds.append(self._extract_data(batch))
        return np.concatenate(preds, axis=0)


# Save Model & Results
with open("BERT_model.th", 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files("BERT_vocabulary")

with open('BERT_metrics.json', "w") as file:
    json.dump(metrics, file, indent=4)

with open('test_predictions_list.pkl', 'wb') as f:
    pickle.dump(test_prediction_list, f)
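# --- Added reload sketch (not part of the original script) ---
# How the artifacts saved above would typically be loaded back for inference.
# `build_model` is a hypothetical placeholder for however the model was originally
# constructed; the real architecture and arguments must match what was trained.
import torch
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary.from_files("BERT_vocabulary")
model = build_model(vocab)  # hypothetical helper, see note above
with open("BERT_model.th", "rb") as f:
    model.load_state_dict(torch.load(f, map_location="cpu"))
model.eval()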
#
# load data & create vocab
# -------------------------------
#

loader = IrTupleDatasetReader(lazy=True,
                              source_tokenizer=BlingFireTokenizer(),
                              target_tokenizer=BlingFireTokenizer(),
                              lowercase=args.lowercase)

def getInstances():
    for file in args.dataset_files:
        instances = loader.read(file)
        for i in instances:
            yield Instance({"text": i["target_tokens"]})

namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))

for instance in Tqdm.tqdm(getInstances()):
    instance.count_vocab_items(namespace_token_counts)

for count in [5, 10, 25, 50, 100]:
    vocab = Vocabulary(namespace_token_counts, min_count={"tokens": count})
    vocab.save_to_files(args.out_dir + str(count))

vocab = Vocabulary(namespace_token_counts, min_count={"tokens": 1})
vocab.save_to_files(args.out_dir + "full")
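# --- Added verification sketch (not part of the original script) ---
# Reload each pruned vocabulary written above and print how many "tokens" entries
# survived each min_count threshold; assumes the loop above has run and args.out_dir
# points at the same location.
for suffix in ["5", "10", "25", "50", "100", "full"]:
    reloaded = Vocabulary.from_files(args.out_dir + suffix)
    print(suffix, reloaded.get_vocab_size("tokens"))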
df = pd.DataFrame()
df['y_true'] = y_true
df['y_pred'] = y_pred
df.to_csv(y_true_pred_val_path, index=False)

# %%
test_list = read_json(cached_path(TEST_PATH))
claim_predictor = ClaimCrfPredictor(model, dataset_reader=reader)
y_pred, y_true = [], []
for tst in test_dataset:
    pred = claim_predictor.predict_instance(tst)
    logits = torch.FloatTensor(pred['logits'])
    predicted_labels = pred['labels']
    y_pred.extend(predicted_labels)
    y_true.extend(tst['labels'])
y_true = np.array(y_true).astype(int)
y_pred = np.array(y_pred).astype(int)
print('Test score:', precision_recall_fscore_support(y_true, y_pred, average='binary'))

# Save model
with open("./finetune_model.th", "wb") as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files("./finetune_vocab")

# Save y_true and y_pred
df = pd.DataFrame()
df['y_true'] = y_true
df['y_pred'] = y_pred
df.to_csv(y_true_pred_test_path, index=False)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             text_encoder: Seq2SeqEncoder,
             variational_autoencoder: FeedForward = None,
             sentiment_classifier: FeedForward = None,
             topic_dim: int = 20,
             freeze_feature_extraction: bool = False,
             classification_mode: bool = False,
             pretrained_file: str = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(TopicRNN, self).__init__(vocab, regularizer)

    self.metrics = {
        'cross_entropy': Average(),
        'negative_kl_divergence': Average(),
        'stopword_loss': Average()
    }

    self.classification_mode = classification_mode
    if classification_mode:
        self.metrics['sentiment'] = CategoricalAccuracy()

    if pretrained_file:
        archive = load_archive(pretrained_file)
        pretrained_model = archive.model
        self._init_from_archive(pretrained_model)
    else:
        # Model parameter definition.
        #
        # Defaults reflect Dieng et al.'s decisions when training their semi-unsupervised
        # IMDB sentiment classifier.
        self.text_field_embedder = text_field_embedder
        self.vocab_size = self.vocab.get_vocab_size("tokens")
        self.text_encoder = text_encoder
        self.topic_dim = topic_dim
        self.vocabulary_projection_layer = TimeDistributed(
            Linear(text_encoder.get_output_dim(), self.vocab_size))

        # Parameter gamma from the paper; projects hidden states into binary logits for
        # whether a word is a stopword.
        self.stopword_projection_layer = TimeDistributed(
            Linear(text_encoder.get_output_dim(), 2))

        self.tokens_to_index = vocab.get_token_to_index_vocabulary()

        # This step should only ever be performed ONCE.
        # When running allennlp train, the vocabulary will be constructed before the model
        # is instantiated, but we can't create the stopless namespace until we get here.
        # Check whether a stopless namespace already exists: if so, refrain from altering it.
        if "stopless" not in vocab._token_to_index.keys():
            assert self.tokens_to_index[DEFAULT_PADDING_TOKEN] == 0 and \
                   self.tokens_to_index[DEFAULT_OOV_TOKEN] == 1
            for token, _ in self.tokens_to_index.items():
                if token not in STOP_WORDS:
                    vocab.add_token_to_namespace(token, "stopless")

            # Since a vocabulary with the stopless namespace hasn't been saved,
            # save one for convenience.
            vocab.save_to_files("vocabulary")

        # Compute stop indices in the normal vocab space to prevent stop words
        # from contributing to the topic additions.
        self.stop_indices = torch.LongTensor(
            [vocab.get_token_index(stop) for stop in STOP_WORDS])

        # Learnable topics.
        # TODO: How should these be initialized?
        self.beta = nn.Parameter(torch.rand(topic_dim, self.vocab_size))

        # mu: The mean of the variational distribution.
        self.mu_linear = nn.Linear(topic_dim, topic_dim)

        # sigma: The root standard deviation of the variational distribution.
        self.sigma_linear = nn.Linear(topic_dim, topic_dim)

        # noise: used when sampling.
        self.noise = MultivariateNormal(torch.zeros(topic_dim), torch.eye(topic_dim))

        stopless_dim = vocab.get_vocab_size("stopless")
        self.variational_autoencoder = variational_autoencoder or FeedForward(
            # Takes as input the word frequencies in the stopless dimension and projects
            # the word frequencies into a latent topic representation.
            #
            # Each latent representation will help tune the variational dist.'s parameters.
            stopless_dim,
            3,
            [500, 500, topic_dim],
            torch.nn.ReLU(),
        )

        # The shape of the feature vector for sentiment classification:
        # (RNN hidden size + inference network output dimension).
        sentiment_input_size = text_encoder.get_output_dim() + topic_dim
        self.sentiment_classifier = sentiment_classifier or FeedForward(
            # As done in the paper: a simple single layer with 50 hidden units
            # and sigmoid activation for sentiment classification.
            sentiment_input_size,
            2,
            [50, 2],
            torch.nn.Sigmoid(),
        )

    if freeze_feature_extraction:
        # Freeze the RNN and VAE pipeline so that only the classifier is trained.
        for name, param in self.named_parameters():
            if "sentiment_classifier" not in name:
                param.requires_grad = False

    self.sentiment_criterion = nn.CrossEntropyLoss()

    self.num_samples = 50

    initializer(self)
def from_params(params: Params,
                serialization_dir: str,
                recover: bool = False) -> 'TrainerPieces':
    # all_datasets = datasets_from_params(params)
    corpus = Corpus.from_params(params.pop('corpus'))

    # datasets_for_vocab_creation = set(params.pop(
    #     "datasets_for_vocab_creation", all_datasets))

    # for dataset in datasets_for_vocab_creation:
    #     if dataset not in all_datasets:
    #         raise ConfigurationError(
    #             f"invalid 'dataset_for_vocab_creation' {dataset}")

    # logger.info("From dataset instances, %s will be considered for vocabulary creation.",
    #             ", ".join(datasets_for_vocab_creation))

    seed = params.pop_int("seed", 5678)

    vocab_params = params.pop("vocabulary", {})
    vocab_type = vocab_params.get("type", "default")
    if vocab_type == 'default' and os.path.exists(
            os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(
            os.path.join(serialization_dir, "vocabulary"))
    elif vocab_type == 'empty':
        vocab = Vocabulary()
    else:
        seed_environment(seed)
        vocab = Vocabulary.from_params(vocab_params, corpus.train)
    # Need to reset the seed. Otherwise loading an existing vocab and creating
    # a vocab from scratch will lead to different behavior.
    seed_environment(seed)

    # contextualizer_params = params.pop('contextualizer')
    # contextualizer = Seq2SeqDecoder.from_params(
    #     vocab=vocab, params=contextualizer_params)
    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # If vocab extension is ON for training, embedding extension should also be
    # done. If vocab and embeddings are already in sync, it would be a no-op.
    model.extend_embedder_vocab()

    # Initializing the model can have the side effect of expanding the vocabulary.
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    # train_data = all_datasets['train']
    # validation_data = all_datasets.get('validation')
    # test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    batch_weight_key = params.pop('batch_weight_key', '')

    return TrainerPieces(model, iterator, corpus, validation_iterator,
                         batch_weight_key, trainer_params)