def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                              1: '@@UNKNOWN@@',
                                                              2: 'a', 3: 'c', 4: 'b'}

    # Test from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test from_params raises when there are any other dict keys present
    # apart from 'directory_path' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir,
                                           "min_count": {'tokens': 2}}))
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                              1: '@@UNKNOWN@@',
                                                              2: 'a', 3: 'c', 4: 'b'}

    # Test from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test from_params raises when there are any other dict keys present
    # apart from 'directory_path' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir,
                                           "min_count": {'tokens': 2}}))
def test_custom_padding_oov_tokens(self):
    vocab = Vocabulary(oov_token="[UNK]")
    assert vocab._oov_token == "[UNK]"
    assert vocab._padding_token == "@@PADDING@@"

    vocab = Vocabulary(padding_token="[PAD]")
    assert vocab._oov_token == "@@UNKNOWN@@"
    assert vocab._padding_token == "[PAD]"

    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab = Vocabulary(oov_token="<UNK>")
    vocab.add_tokens_to_namespace(["a0", "a1", "a2"], namespace="a")
    vocab.save_to_files(vocab_dir)

    params = Params({"type": "from_files", "directory": vocab_dir, "oov_token": "<UNK>"})
    vocab = Vocabulary.from_params(params)

    with pytest.raises(AssertionError) as excinfo:
        vocab = Vocabulary.from_params(Params({"type": "from_files", "directory": vocab_dir}))
    assert "OOV token not found!" in str(excinfo.value)
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)

    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # Following 2 should give error: tokens1 is non-padded in original_vocab but not in instances
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)

    # Following 2 should not give error: overlapping namespaces have the same padding setting
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)

    # Following 2 should give error: tokens1 is padded in instances but not in original_vocab
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)

    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # Following 3 should give error: tokens1 is non-padded in original_vocab but not in instances
    params = Params({"directory_path": vocab_dir, "extend": True, "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=[],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # Following 3 should not give error: overlapping namespaces have the same padding setting
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)
    extended_vocab = copy.copy(original_vocab)
    extended_vocab._extend(non_padded_namespaces=["tokens1"],
                           tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # Following 3 should give error: tokens1 is padded in instances but not in original_vocab
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
def setUp(self):
    super().setUp()
    params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json")
    self.reader = DatasetReader.from_params(params["dataset_reader"])
    instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
    self.instances = ensure_list(instances)
    self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
def setUp(self):
    super(TestCopyNetReader, self).setUp()
    params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json")
    self.reader = DatasetReader.from_params(params["dataset_reader"])
    instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
    self.instances = ensure_list(instances)
    self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
def test_max_vocab_size_dict(self):
    params = Params({"max_vocab_size": {"tokens": 1, "characters": 20}})
    vocab = Vocabulary.from_params(params=params, instances=self.dataset)
    words = vocab.get_index_to_token_vocabulary().values()
    # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
    assert len(words) == 3
def evaluate_from_file(archive_path, model_path, overrides=None, eval_suffix='', device=0):
    if archive_path.endswith('gz'):
        archive = load_archive(archive_path, device, overrides)
        config = archive.config
        prepare_environment(config)
        model = archive.model
        serialization_dir = os.path.dirname(archive_path)
    elif archive_path.endswith('yaml'):
        config = yaml_to_params(archive_path, overrides)
        prepare_environment(config)
        config_dir = os.path.dirname(archive_path)
        serialization_dir = os.path.join(config_dir, 'serialization')

    all_datasets = datasets_from_params(config)

    # We want to create the vocab from scratch since it might be of a
    # different type. Vocabulary.from_files will always create the base
    # Vocabulary instance.
    # if os.path.exists(os.path.join(serialization_dir, "vocabulary")):
    #     vocab_path = os.path.join(serialization_dir, "vocabulary")
    #     vocab = Vocabulary.from_files(vocab_path)
    vocab = Vocabulary.from_params(config.pop('vocabulary'))
    model = Model.from_params(vocab=vocab, params=config.pop('model'))
    if model_path:
        best_model_state = torch.load(model_path)
        model.load_state_dict(best_model_state)

    instances = all_datasets.get('test')
    iterator = DataIterator.from_params(config.pop("validation_iterator"))
    iterator.index_with(model.vocab)
    model.eval().to(device)
    model.evaluate_mode = True

    metrics = evaluate(model, instances, iterator, device, serialization_dir, eval_suffix,
                       batch_weight_key='')

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    output_file = os.path.join(serialization_dir, f"evaluate-metrics{eval_suffix}.json")
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
def test_from_params(self):
    params = Params.from_file(N_GRAM_PARAMS)
    vocabulary_params = params.pop("vocab")
    dataset = self.model.reader.read(TRAIN_EXAMPLE)
    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)
    model = LanguageModel.from_params(params, vocab=vocabulary)
    self.assertTrue(isinstance(model, NGramLanguageModel))
def from_params(params: Params, serialization_dir: str, recover: bool = False) -> 'TrainerPieces':
    all_datasets = training_util.datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
        params.pop("vocabulary", {})
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
        )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    return TrainerPieces(model, iterator, train_data, validation_data, test_data,
                         validation_iterator, trainer_params)
def main(config: str, model_th: str, dataset: str, hypo_file: str, ref_file: str,
         batch_size: int, no_gpu: bool):
    logger = logging.getLogger(__name__)

    logger.info("Loading configuration parameters")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    # reader_params["lazy"] = True  # make sure we do not load the entire dataset
    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    logger.info("Reading data from {}".format(dataset))
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=batch_size)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    if not no_gpu:
        model.cuda(0)

    with open(model_th, 'rb') as f:
        if no_gpu:
            state_dict = torch.load(f, map_location=torch.device('cpu'))
        else:
            state_dict = torch.load(f)
        model.load_state_dict(state_dict)

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    with open(hypo_file, 'w') as hf, open(ref_file, 'w') as rf:
        logger.info("Generating predictions")
        for sample in tqdm(batches):
            s = list(sample)
            pred = predictor.predict_batch_instance(s)
            for inst, p in zip(s, pred):
                print(" ".join(p["predicted_tokens"][0]), file=hf)
                print(" ".join(t.text for t in inst["target_tokens"][1:-1]), file=rf)
def setup_method(self):
    super().setup_method()
    params = Params.from_file(FIXTURES_ROOT / "generation" / "copynet" / "experiment.json")
    self.reader = DatasetReader.from_params(params["dataset_reader"])
    instances = self.reader.read(FIXTURES_ROOT / "generation" / "copynet" / "data" / "copyover.tsv")
    self.instances = ensure_list(instances)
    self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
def preprocess(train_path, vocabulary_path, config_path):
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    dataset = reader.read(train_path)
    vocabulary_params = params.pop("vocabulary", default=Params({}))
    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)
    vocabulary.save_to_files(vocabulary_path)
def setUp(self):
    super(TestCopyNetReader, self).setUp()
    params = Params.from_file("nlpete/tests/fixtures/copynet/experiment.json")
    self.reader = DatasetReader.from_params(params["dataset_reader"])
    instances = self.reader.read("nlpete/tests/fixtures/copynet/copyover.tsv")
    self.instances = ensure_list(instances)
    self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
def test_min_pretrained_embeddings(self):
    params = Params({
        "pretrained_files": {
            "tokens": str(self.FIXTURES_ROOT / "embeddings/glove.6B.100d.sample.txt.gz")
        },
        "min_pretrained_embeddings": {"tokens": 50},
    })
    vocab = Vocabulary.from_params(params=params, instances=self.dataset)
    assert vocab.get_vocab_size() >= 50
    assert vocab.get_token_index("his") > 1  # not @@UNKNOWN@@
def main(config: str, model_th: str, dataset: str, out_file):
    logger = logging.getLogger(__name__)

    logger.info("Loading model and data")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = Params({
        "source_token_indexers": {
            "tokens": {
                "type": "single_id",
                "namespace": "tokens"
            }
        },
        "target_namespace": "tokens"
    })
    # reader_name = reader_params.pop("type")
    # reader_params["lazy"] = True  # make sure we do not load the entire dataset
    reader = UnsupervisedBTReader.from_params(reader_params)

    logger.info("Reading data from {}".format(dataset))
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    model.cuda(0)

    with open(model_th, 'rb') as f:
        model.load_state_dict(torch.load(f))

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    line_id = 0
    writer = csv.writer(out_file, delimiter="\t")
    logger.info("Generating predictions")
    for sample in tqdm(batches):
        s = list(sample)
        pred = predictor.predict_batch_instance(s)
        for inst, p in zip(s, pred):
            writer.writerow((line_id,
                             " ".join((t.text for t in inst["source_tokens"][1:-1])),
                             " ".join(p["predicted_tokens"][0])))
            line_id += 1
def test_max_vocab_size_dict(self):
    params = Params({"max_vocab_size": {"tokens": 1, "characters": 20}})
    vocab = Vocabulary.from_params(params=params, instances=self.dataset)
    words = vocab.get_index_to_token_vocabulary().values()
    # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
    assert len(words) == 3
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField([Token(t) for t in ["a", "b"]],
                           {"tokens": SingleIdTokenIndexer("tokens")})
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory`, instances must be passed
    # in Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"type": "extend", "directory": vocab_dir})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, `directory` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"type": "extend"})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField([Token(t) for t in ["a", "b"]],
                           {"tokens": SingleIdTokenIndexer("tokens")})
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory_path`, instances must be passed
    # in Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"directory_path": vocab_dir, "extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, `directory_path` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
def create_or_extend_vocab(
    params: Params,
    datasets: Dict[str, Dict[str, Iterable[Instance]]],
    vocabulary_params: Params,
    vocabulary_path: str,
    vocab: Vocabulary = None,
    recover: bool = False,
) -> Vocabulary:
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", datasets))
    for key in datasets_for_vocab_creation:
        if key not in datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {key}")

    datasets = {
        key: dataset
        for key, dataset in datasets.items()
        if key in datasets_for_vocab_creation
    }
    flat_datasets = training_util.as_flat_dict(datasets)
    instance_generator = (instance
                          for key, dataset in flat_datasets.items()
                          for instance in dataset)
    dataset_keys_to_use_str = ", ".join(datasets_for_vocab_creation)

    if vocab:
        logger.info(f"Extending model vocabulary using {dataset_keys_to_use_str} data.")
        vocab.extend_from_instances(instances=instance_generator)
    else:
        logger.info(
            "From dataset instances, %s will be considered for vocabulary creation.",
            dataset_keys_to_use_str,
        )
        if recover and os.path.exists(vocabulary_path):
            vocab = Vocabulary.from_files(
                vocabulary_path,
                vocabulary_params.get("padding_token", None),
                vocabulary_params.get("oov_token", None),
            )
        else:
            # Using a generator comprehension here is important because, by being lazy,
            # it allows us to not iterate over the dataset when directory_path is specified.
            vocab = Vocabulary.from_params(vocabulary_params, instances=instance_generator)
    return vocab
def preprocess(train_path, vocabulary_path, config_path):
    assert os.path.isfile(train_path), "Train dataset file does not exist"
    assert os.path.isfile(config_path), "Config file does not exist"

    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    vocabulary_params = params.pop("vocabulary", default=Params({}))

    reader = DatasetReader.from_params(reader_params)
    dataset = reader.read(train_path)

    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)
    vocabulary.save_to_files(vocabulary_path)
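A hypothetical usage sketch for the preprocess helper above; the three paths are placeholders chosen for illustration, not files that ship with this snippet.

if __name__ == "__main__":
    # Placeholder paths: point these at a real training file, an output
    # directory for the saved vocabulary, and a config with "reader"/"vocabulary" keys.
    preprocess(train_path="data/train.txt",
               vocabulary_path="serialization/vocabulary",
               config_path="configs/model.json")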
def test_from_params_adds_tokens_to_vocab(self):
    vocab = Vocabulary.from_params(
        Params({"tokens_to_add": {"tokens": ["q", "x", "z"]}}), instances=self.dataset
    )
    assert vocab.get_index_to_token_vocabulary("tokens") == {
        0: "@@PADDING@@",
        1: "@@UNKNOWN@@",
        2: "a",
        3: "c",
        4: "b",
        5: "q",
        6: "x",
        7: "z",
    }
def test_registrability(self):
    @Vocabulary.register("my-vocabulary", constructor="constructor")
    class MyVocabulary(Vocabulary):
        @classmethod
        def constructor(cls):
            return MyVocabulary()

    params = Params({"type": "my-vocabulary"})
    instance = Instance(fields={})
    vocab = Vocabulary.from_params(params=params, instances=[instance])
    assert isinstance(vocab, MyVocabulary)
def main(config: str, model_th: str, dataset: str, seed: int):
    logger = logging.getLogger(__name__)

    logger.info("Loading model and data")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    reader_params["lazy"] = True  # make sure we do not load the entire dataset
    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=10)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    # model.cuda(cuda_device)

    with open(model_th, 'rb') as f:
        model.load_state_dict(torch.load(f))

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    logger.info("Generating predictions")

    random.seed(seed)
    samples = []
    for b in batches:
        samples.append(b)
        if random.random() > 0.6:
            break

    sample = list(random.choice(samples))
    pred = predictor.predict_batch_instance(sample)

    for inst, p in zip(sample, pred):
        print()
        print("SOURCE:", " ".join([t.text for t in inst["source_tokens"]]))
        print("GOLD:", " ".join([t.text for t in inst["target_tokens"]]))
        print("GEN:", p["predicted_tokens"])
def main(config: str):
    logger = logging.getLogger(__name__)

    logger.info("Loading model and data")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)

    print("Number of parameters:", count_parameters(model))
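The count_parameters helper used above is not defined in this snippet; the following is a minimal sketch of what it is assumed to do (sum the sizes of the trainable parameters of a torch.nn.Module, mirroring the expression used in the _test_model snippet further down), not the original implementation.

import torch


def count_parameters(model: torch.nn.Module) -> int:
    # Sum the element counts of all parameters that require gradients.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)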
def test_from_params_adds_tokens_to_vocab(self):
    vocab = Vocabulary.from_params(
        Params({u'tokens_to_add': {u'tokens': [u'q', u'x', u'z']}}), self.dataset)
    assert vocab.get_index_to_token_vocabulary(u"tokens") == {0: u'@@PADDING@@',
                                                              1: u'@@UNKNOWN@@',
                                                              2: u'a', 3: u'c', 4: u'b',
                                                              5: u'q', 6: u'x', 7: u'z'}
def test_registrability(self):
    @Vocabulary.register('my-vocabulary')
    class MyVocabulary:
        @classmethod
        def from_params(cls, params, instances=None):  # pylint: disable=unused-argument
            return MyVocabulary()

    params = Params({'type': 'my-vocabulary'})
    instance = Instance(fields={})
    vocab = Vocabulary.from_params(params=params, instances=[instance])
    assert isinstance(vocab, MyVocabulary)
def test_from_params_adds_tokens_to_vocab(self):
    vocab = Vocabulary.from_params(
        Params({'tokens_to_add': {'tokens': ['q', 'x', 'z']}}), self.dataset)
    assert vocab.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                             1: '@@UNKNOWN@@',
                                                             2: 'a', 3: 'c', 4: 'b',
                                                             5: 'q', 6: 'x', 7: 'z'}
def test_max_vocab_size_partial_dict(self):
    indexers = {"tokens": SingleIdTokenIndexer(),
                "token_characters": TokenCharactersIndexer()}
    instance = Instance({
        'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')],
                          indexers)
    })
    dataset = Batch([instance])
    params = Params({"max_vocab_size": {"tokens": 1}})
    vocab = Vocabulary.from_params(params=params, instances=dataset)
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2
def test_max_vocab_size_partial_dict(self):
    indexers = {"tokens": SingleIdTokenIndexer(),
                "token_characters": TokenCharactersIndexer(min_padding_length=3)}
    instance = Instance({
        'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')],
                          indexers)
    })
    dataset = Batch([instance])
    params = Params({"max_vocab_size": {"tokens": 1}})
    vocab = Vocabulary.from_params(params=params, instances=dataset)
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2
def _test_model(self, file_name):
    params = self.params[file_name].duplicate()
    reader_params = params.duplicate().pop("reader", default=Params({}))
    if reader_params["type"] == "cnn_dailymail":
        reader_params["cnn_tokenized_dir"] = TEST_STORIES_DIR
        dataset_file = TEST_URLS_FILE
    elif reader_params["type"] == "ria":
        dataset_file = RIA_EXAMPLE_FILE
    else:
        assert False

    reader = DatasetReader.from_params(reader_params)
    tokenizer = reader._tokenizer
    dataset = reader.read(dataset_file)

    vocabulary_params = params.pop("vocabulary", default=Params({}))
    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)

    model_params = params.pop("model")
    model = Model.from_params(model_params, vocab=vocabulary)
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)
    trainer = Trainer.from_params(model, None, iterator, dataset, None, params.pop('trainer'))
    trainer.train()

    model.eval()
    predictor = Seq2SeqPredictor(model, reader)
    for article, reference_sents in reader.parse_set(dataset_file):
        ref_words = [token.text for token in tokenizer.tokenize(reference_sents)]
        decoded_words = predictor.predict(article)["predicted_tokens"]
        self.assertGreaterEqual(len(decoded_words), len(ref_words))
        unk_count = 0
        while DEFAULT_OOV_TOKEN in decoded_words:
            unk_index = decoded_words.index(DEFAULT_OOV_TOKEN)
            decoded_words.pop(unk_index)
            unk_count += 1
            if unk_index < len(ref_words):
                ref_words.pop(unk_index)
        self.assertLess(unk_count, 5)
        self.assertListEqual(decoded_words[:len(ref_words)], ref_words)
def make_vocab(serialization_dir: str, recover: bool, all_datasets):
    # Note: `params` and `datasets_for_vocab_creation` are not arguments here;
    # they come from the enclosing scope.
    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
        params.pop("vocabulary", {})
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            # Using a generator comprehension here is important
            # because, being lazy, it allows us to not iterate over the
            # dataset when directory_path is specified.
            (instance for key, dataset in all_datasets.items()
             if key in datasets_for_vocab_creation
             for instance in dataset),
        )
    return vocab
def get_model_from_file(archive_path, model_path, overrides=None, eval_suffix='', device=0):
    if archive_path.endswith('gz'):
        archive = load_archive(archive_path, device, overrides)
        config = archive.config
        prepare_environment(config)
        model = archive.model
        serialization_dir = os.path.dirname(archive_path)
    elif archive_path.endswith('yaml'):
        config = yaml_to_params(archive_path, overrides)
        prepare_environment(config)
        config_dir = os.path.dirname(archive_path)
        serialization_dir = os.path.join(config_dir, 'serialization')

    all_datasets = datasets_from_params(config)

    # We want to create the vocab from scratch since it might be of a
    # different type. Vocabulary.from_files will always create the base
    # Vocabulary instance.
    if os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab_path = os.path.join(serialization_dir, "vocabulary")
        vocab = Vocabulary.from_files(vocab_path)
    vocab = Vocabulary.from_params(config.pop('vocabulary'))
    model = Model.from_params(vocab=vocab, params=config.pop('model'))
    if model_path:
        best_model_state = torch.load(model_path)
        model.load_state_dict(best_model_state)

    # instances = all_datasets.get('test')
    iterator = DataIterator.from_params(config.pop("validation_iterator"))
    iterator.index_with(model.vocab)
    model.eval().to(device)
    model.evaluate_mode = True
    return model
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
) -> "MultiTaskTrainer":
    readers = {
        name: DatasetReader.from_params(reader_params)
        for name, reader_params in params.pop("train_dataset_readers").items()
    }
    train_file_paths = params.pop("train_file_paths").as_dict()

    datasets = {
        name: reader.read(train_file_paths[name])
        for name, reader in readers.items()
    }

    instances = (instance for dataset in datasets.values() for instance in dataset)
    vocab = Vocabulary.from_params(Params({}), instances=instances)
    model = Model.from_params(params.pop("model"), vocab=vocab)
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    mingler = DatasetMingler.from_params(params.pop("mingler"))

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    num_epochs = params.pop_int("num_epochs", 10)

    _ = params.pop("trainer", Params({}))
    params.assert_empty(__name__)

    return MultiTaskTrainer(model, serialization_dir, iterator, mingler, optimizer,
                            datasets, num_epochs)
def setUpClass(cls):
    LanguageModel.set_seed(42)

    configs = (ENCODER_ONLY_MODEL_PARAMS, ENCODER_ONLY_SAMPLED_SOFTMAX_MODEL_PARAMS)
    cls.params_sets = [Params.from_file(config) for config in configs]
    cls.vocabularies = []
    for params in cls.params_sets:
        vocabulary_params = params.pop("vocabulary", default=Params({}))
        reader_params = params.duplicate().pop("reader", default=Params({}))
        reader = DatasetReader.from_params(reader_params)
        dataset = reader.read(REMEMBERING_EXAMPLE)
        cls.vocabularies.append(Vocabulary.from_params(vocabulary_params, instances=dataset))

    cls.train_vocabulary = Vocabulary.from_files(TRAIN_VOCAB_EXAMPLE)

    cls.sentences = []
    with open(REMEMBERING_EXAMPLE, "r", encoding="utf-8") as r:
        for line in r:
            cls.sentences.append(line.strip())
def test_valid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    extension_ways = ["from_params", "extend_from_instances"]

    # Test: padded/non-padded common namespaces are extended appropriately
    non_padded_namespaces_list = [[], ["tokens"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("d", namespace="tokens")
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.add_token_to_namespace("b", namespace="tokens")
        text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])
        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count
            assert extended_vocab.get_token_index("c", "tokens")  # should be present
            assert extended_vocab.get_token_index("e", "tokens")  # should be present
            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extended appropriately
    non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")  # index2
        text_field = TextField([Token(t) for t in ["b"]],
                               {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text": text_field})])
        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            # Should have two namespaces
            assert len(extended_vocab._token_to_index) == 2
            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count
            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
def test_from_params_adds_tokens_to_vocab(self):
    vocab = Vocabulary.from_params(Params({'tokens_to_add': {'tokens': ['q', 'x', 'z']}}),
                                   self.dataset)
    assert vocab.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                             1: '@@UNKNOWN@@',
                                                             2: 'a', 3: 'c', 4: 'b',
                                                             5: 'q', 6: 'x', 7: 'z'}
def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests for valid vocab extension thoroughly: vocab extension is valid
    when overlapping namespaces have the same padding behaviour (padded/non-padded).
    Summary of namespace paddings in this test:
    original_vocab namespaces
        tokens0     padded
        tokens1     non-padded
        tokens2     padded
        tokens3     non-padded
    instances namespaces
        tokens0     padded
        tokens1     non-padded
        tokens4     padded
        tokens5     non-padded
    Typical extension example (of the tokens1 namespace):
    -> original_vocab index2token
       apple   #0->apple
       bat     #1->bat
       cat     #2->cat
    -> Tokens to be extended with: cat, an, apple, banana, atom, bat
    -> extended_vocab index2token
       apple   #0->apple
       bat     #1->bat
       cat     #2->cat
       an      #3->an
       atom    #4->atom
       banana  #5->banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")    # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")    # index:4

    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")    # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")    # index:2

    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:0
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:1
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:2

    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1

    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])

    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 are only in the vocab;
    # tokens4 and tokens5 are only in the instances.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that _non_padded_namespaces is consistent after extension
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens, instances of the "tokens1" ns have 5 tokens; 2 overlap
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 2 extra overlapping because padded

    # Namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # Namespaces tokens4 and tokens5 were only in the instances
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l, m, n, o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x, y, z

    # The word-to-index mapping of all words in all namespaces of original_vocab
    # should be maintained in extended_vocab
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index
    # And the same for the index-to-word mapping
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token