def test_knowbert_wiki_wordnet(self):
    from kb.testing import get_bert_pretraining_reader_with_kg

    reader = get_bert_pretraining_reader_with_kg(
        mask_candidate_strategy='full_mask', masked_lm_prob=0.35, include_wiki=True)
    instances = reader.read("tests/fixtures/bert_pretraining/shard1.txt")

    vocab = Vocabulary.from_params(Params({
        "directory_path": "tests/fixtures/wordnet_wiki_vocab",
    }))

    iterator = BasicIterator()
    iterator.index_with(vocab)

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        pass

    # hack, incompatible fixtures...
    batch['tokens']['tokens'] = torch.min(
        batch['tokens']['tokens'], torch.tensor([17]))
    batch['lm_label_ids']['lm_labels'] = torch.min(
        batch['lm_label_ids']['lm_labels'], torch.tensor([17]))

    model = get_knowbert(vocab, None, include_wiki=True)
    output = model(**batch)

    loss = output['loss']
    loss.backward()
    self.assertTrue(True)
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir) is not None:
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    instances = [
        instance for key, dataset in all_datasets.items()
        for instance in dataset
        if key in datasets_for_vocab_creation
    ]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params['dataset_reader'])
    # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
    instances = list(reader.read(str(dataset_file)))
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(vocab=self.vocab, params=params['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
def setup_vocab(params):
    all_datasets = setup_datasets(params)
    vocab: Vocabulary = Vocabulary.from_params(
        params.get("vocabulary", {}),
        (instance for key, dataset in all_datasets.items() for instance in dataset)
    )
    return vocab
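# NOTE: Illustrative sketch, not part of the original sources. It shows how a helper like
# `setup_vocab` above might be driven by a standard AllenNLP-style config; the reader type,
# file paths, and vocabulary options below are placeholders, and `setup_datasets` is assumed
# to read the data paths from the same Params object.
from allennlp.common import Params

params = Params({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/train.tsv",
    "validation_data_path": "data/dev.tsv",
    "vocabulary": {"min_count": {"tokens": 2}},
})
vocab = setup_vocab(params)
print("token vocab size:", vocab.get_vocab_size("tokens"))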
def write_for_official_eval(model_archive_file, test_file, output_file,
                            label_ids_to_label):
    archive = load_archive(model_archive_file)
    model = archive.model

    reader = DatasetReader.from_params(archive.config['dataset_reader'])
    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 4}))
    vocab = Vocabulary.from_params(archive.config['vocabulary'])
    iterator.index_with(vocab)

    model.cuda()
    model.eval()

    instances = reader.read(test_file)
    predictions = []
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        batch = move_to_device(batch, cuda_device=0)
        output = model(**batch)
        batch_labels = [
            label_ids_to_label[i]
            for i in output['predictions'].cpu().numpy().tolist()
        ]
        predictions.extend(batch_labels)

    with open(output_file, 'w') as fout:
        for p in predictions:
            fout.write("{}\n".format(p))
def make_vocab_from_params(params: Params):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    vocab_dir = vocab_params.get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError("To use `make-vocab` your configuration must contain a value "
                                 "at vocabulary.directory_path")

    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(Params({}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))

    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    initial_working_dir = os.getcwd()
    # Change directory to module root.
    os.chdir(self.MODULE_ROOT)

    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params['dataset_reader'])
    instances = reader.read(dataset_file)
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(self.vocab, params['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)

    # Change directory back to what it was initially
    os.chdir(initial_working_dir)
def make_vocab_from_params(params: Params):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    vocab_dir = vocab_params.get('directory_path')
    if vocab_dir is None:
        raise ConfigurationError(
            "To use `make-vocab` your configuration must contain a value "
            "at vocabulary.directory_path")

    os.makedirs(vocab_dir, exist_ok=True)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        Params({}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))

    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
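# NOTE: Illustrative sketch, not part of the original sources. The two `make_vocab_from_params`
# variants above read the output directory from `vocabulary.directory_path`; a minimal,
# hypothetical config that satisfies that check could look like this (the reader type and all
# paths are placeholders).
from allennlp.common import Params

params = Params({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/train.tsv",
    "vocabulary": {"directory_path": "output/vocabulary"},
})
make_vocab_from_params(params)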
def setUp(self) -> None:
    super().setUp()
    param_file = FIXTURES_ROOT / "pointer_rewrite" / "lstm_lstm_pointer_rewrite.jsonnet"
    dataset_file = FIXTURES_ROOT / "test_pointer_rewrite.txt"
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    # Get the dataset reader
    reader = DatasetReader.from_params(params["dataset_reader"])
    instances = reader.read(str(dataset_file))

    # If vocabulary parameters are present, build the vocabulary from them
    if "vocabulary" in params:
        vocab_params = params["vocabulary"]
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.instances.index_with(vocab)

    # Load the model
    self.model = Model.from_params(params=params["model"], vocab=self.vocab)

    self.dataset = Batch(list(self.instances))
    self.dataset.index_instances(self.vocab)
    self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))
def setUp(self):
    super().setUp()
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {"type": "embedding", "embedding_dim": 5}
                }
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
        "validation_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"},
    })
    all_datasets = datasets_from_params(params)
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(instance for dataset in all_datasets.values() for instance in dataset),
    )
    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    train_data = all_datasets["train"]
    trainer_params = params.pop("trainer")
    serialization_dir = os.path.join(self.TEST_DIR, "test_search_learning_rate")

    self.trainer = TrainerBase.from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        params=trainer_params,
        validation_data=None,
        validation_iterator=None,
    )
def make_vocab_from_params(params: Params, serialization_dir: str):
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir) is not None:
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
def test_no_eval(self):
    reader = KGTupleReader()
    instances = reader.read('tests/fixtures/kg_embeddings/wn18rr_train.txt')
    self.assertTrue(len(instances) == 8)

    # create the vocab and index to make sure things look good
    vocab = Vocabulary.from_params(Params({}), instances)
    # (+2 for @@PADDING@@ and @@UNKNOWN@@)
    self.assertEqual(vocab.get_vocab_size("entity"), 5 + 2)
    self.assertEqual(vocab.get_vocab_size("relation"), 4 + 2)

    # now get a batch
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        pass

    # check it!
    expected_entity = [1, 2, 1, 3, 3, 4, 1, 5]
    expected_relation = ['_hypernym', '_hypernym_reverse',
                         '_derivationally_related_form',
                         '_derivationally_related_form_reverse',
                         '_hypernym_reverse', '_hypernym',
                         '_hypernym_reverse', '_hypernym_reverse']
    expected_entity2 = [[2, 3], [1], [3], [1], [1], [1, 5], [4], [4]]
    self._check_batch(batch, vocab, expected_entity, expected_relation, expected_entity2)
def make_vocab_from_params(
    params: Params, serialization_dir: str, print_statistics: bool = False
) -> Vocabulary:
    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir) is not None:
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    datasets_for_vocab_creation: Optional[List[str]] = params.pop(
        "datasets_for_vocab_creation", None
    )

    # Do a quick sanity check here. There's no need to load any datasets if the vocab
    # type is "empty".
    if datasets_for_vocab_creation is None and vocab_params.get("type") in ("empty", "from_files"):
        datasets_for_vocab_creation = []

    datasets: Dict[str, Dataset]
    if datasets_for_vocab_creation is None:
        # If `datasets_for_vocab_creation` was not specified, we'll use all datasets
        # from the config.
        datasets = datasets_from_params(params)
    else:
        for dataset_name in datasets_for_vocab_creation:
            data_path = f"{dataset_name}_data_path"
            if data_path not in params:
                raise ConfigurationError(f"invalid 'datasets_for_vocab_creation' {dataset_name}")

        datasets = datasets_from_params(
            params,
            train=("train" in datasets_for_vocab_creation),
            validation=("validation" in datasets_for_vocab_creation),
            test=("test" in datasets_for_vocab_creation),
        )

    instances: Iterable[Instance] = (
        instance
        for key, dataset in datasets.items()
        if datasets_for_vocab_creation is None or key in datasets_for_vocab_creation
        for instance in dataset
    )

    if print_statistics:
        instances = list(instances)

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")

    if print_statistics:
        dataset = Batch(instances)
        dataset.index_instances(vocab)
        dataset.print_statistics()
        vocab.print_statistics()

    return vocab
def set_up_model(self, param_file, dataset_file):
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params["dataset_reader"])
    # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
    instances = reader.read(str(dataset_file))
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if "vocabulary" in params:
        vocab_params = params["vocabulary"]
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.instances.index_with(vocab)
    self.model = Model.from_params(vocab=self.vocab, params=params["model"])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(list(self.instances))
    self.dataset.index_instances(self.vocab)
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir) is not None:
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    instances = [
        instance for key, dataset in all_datasets.items()
        for instance in dataset
        if key in datasets_for_vocab_creation
    ]

    vocab = Vocabulary.from_params(vocab_params, instances)

    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)

    stdout_handler = prepare_global_logging(serialization_dir, False)

    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    cleanup_global_logging(stdout_handler)
def tasks_and_vocab_from_params(
        params: Params, serialization_dir: str) -> Tuple[List[Task], Vocabulary]:
    """
    Load each of the tasks in the model from the ``params`` file and load the datasets
    associated with each of these tasks. Create the vocabulary from ``params`` using the
    concatenation of the ``datasets_for_vocab_creation`` from each of the task-specific
    datasets.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an experiment.
    serialization_dir: ``str``
        Directory in which to save the model and its logs.

    Returns
    -------
    task_list: ``List[Task]``
        A list containing the tasks of the model to train.
    vocab: ``Vocabulary``
        The vocabulary fitted on the datasets_for_vocab_creation.
    """
    ### Instantiate the different tasks ###
    task_list = []
    instances_for_vocab_creation = itertools.chain()
    datasets_for_vocab_creation = {}
    task_keys = [key for key in params.keys() if re.search("^task_", key)]

    for key in task_keys:
        logger.info("Creating %s", key)
        task_params = params.pop(key)
        task_description = task_params.pop("task_description")
        task_data_params = task_params.pop("data_params")

        task = Task.from_params(params=task_description)
        task_list.append(task)

        task_instances_for_vocab, task_datasets_for_vocab = task.load_data_from_params(
            params=task_data_params)
        instances_for_vocab_creation = itertools.chain(
            instances_for_vocab_creation, task_instances_for_vocab)
        datasets_for_vocab_creation[task._name] = task_datasets_for_vocab

    ### Create and save the vocabulary ###
    for task_name, task_dataset_list in datasets_for_vocab_creation.items():
        logger.info("Creating a vocabulary using %s data from %s.",
                    ", ".join(task_dataset_list), task_name)

    logger.info("Fitting vocabulary from dataset")
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   instances_for_vocab_creation)

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
    logger.info("Vocabulary saved to %s", os.path.join(serialization_dir, "vocabulary"))

    return task_list, vocab
def test_reader(self):
    reader = get_reader(masked_lm_prob=0.15)

    np.random.seed(5)
    instances = reader.read("tests/fixtures/bert_pretraining/shard1.txt")

    vocab = Vocabulary.from_params(Params({
        "directory_path": "tests/fixtures/bert/vocab_dir_with_entities_for_tokenizer_and_generator"
    }))
    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(vocab)

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    actual_tokens_ids = batch['tokens']['tokens']
    expected_tokens_ids = torch.tensor(
        [[16, 18, 19, 20, 1, 19, 21, 13, 17, 21, 3, 4, 12, 13, 17],
         [16, 1, 13, 17, 21, 1, 1, 13, 17, 0, 0, 0, 0, 0, 0]])
    self.assertEqual(actual_tokens_ids.tolist(), expected_tokens_ids.tolist())

    actual_entities = batch['candidates']['wordnet']['candidate_entities']['ids']
    expected_entities = torch.tensor(
        [[[29, 30], [31, 0], [31, 0]],
         [[0, 0], [0, 0], [0, 0]]])
    self.assertEqual(actual_entities.tolist(), expected_entities.tolist())

    expected_spans = torch.tensor(
        [[[1, 3], [2, 3], [5, 6]],
         [[-1, -1], [-1, -1], [-1, -1]]])
    actual_spans = batch['candidates']['wordnet']['candidate_spans']
    self.assertEqual(actual_spans.tolist(), expected_spans.tolist())

    expected_lm_labels = torch.tensor(
        [[0, 0, 0, 0, 0, 0, 20, 0, 0, 2, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    actual_lm_labels = batch['lm_label_ids']['lm_labels']
    self.assertEqual(actual_lm_labels.tolist(), expected_lm_labels.tolist())

    expected_segment_ids = torch.tensor(
        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])
    self.assertEqual(batch['segment_ids'].tolist(), expected_segment_ids.tolist())
    self.assertTrue(batch['segment_ids'].dtype == torch.long)
def test_multi_iterator(self):
    params, file_paths = get_dataset_params_paths(['ner', 'ccg'])

    multitask_reader = DatasetReader.from_params(params)
    dataset = multitask_reader.read(file_paths)

    iterator_params = Params({
        "type": "multitask_iterator",
        "iterators": {
            "ner": {
                "type": "bucket",
                "sorting_keys": [["tokens", "num_tokens"]],
                "padding_noise": 0.0,
                "batch_size": 2
            },
            "ccg": {
                "type": "basic",
                "batch_size": 1
            }
        },
        "names_to_index": ["ner", "ccg"],
    })

    multi_iterator = DataIterator.from_params(iterator_params)

    # make the vocab
    vocab = Vocabulary.from_params(Params({}),
                                   (instance for instance in dataset))
    multi_iterator.index_with(vocab)

    all_batches = []
    for epoch in range(2):
        all_batches.append([])
        for batch in multi_iterator(dataset, shuffle=True, num_epochs=1):
            all_batches[-1].append(batch)

    # 3 batches per epoch
    self.assertEqual([len(b) for b in all_batches], [3, 3])

    ner_batches = []
    ccg_batches = []
    for epoch_batches in all_batches:
        ner_batches.append(0)
        ccg_batches.append(0)
        for batch in epoch_batches:
            if 'original_pos_tags' not in batch:
                ner_batches[-1] += 1
            if 'original_pos_tags' in batch:
                ccg_batches[-1] += 1

    # 1 NER batch per epoch, 2 CCG per epoch
    self.assertEqual(ner_batches, [1, 1])
    self.assertEqual(ccg_batches, [2, 2])
def setUp(self):
    super().setUp()
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {"type": "embedding", "embedding_dim": 5}
                }
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"}
    })
    all_datasets = datasets_from_params(params)
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for dataset in all_datasets.values() for instance in dataset)
    )
    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    train_data = all_datasets['train']
    trainer_params = params.pop("trainer")
    serialization_dir = os.path.join(self.TEST_DIR, 'test_search_learning_rate')

    self.trainer = Trainer.from_params(model,
                                       serialization_dir,
                                       iterator,
                                       train_data,
                                       params=trainer_params,
                                       validation_data=None,
                                       validation_iterator=None)
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir) is not None:
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)

    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
def setUp(self) -> None:
    super().setUp()
    param_file = FIXTURES_ROOT / "pointer_rewrite" / "bert_transformer_pointer_rewrite.jsonnet"
    dataset_file = FIXTURES_ROOT / "test_pointer_rewrite.txt"
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    # Build a vocab usable by the BERT model, kept consistent with the `vocabulary` vocab
    vocab_path = params["dataset_reader"]["vocab_path"]
    # Path for the newly generated BERT vocab
    bert_temp_dir = tempfile.mkdtemp(suffix="bert")
    with open(Path(vocab_path) / "tokens.txt", 'r', encoding="utf-8") as f, \
            open(Path(bert_temp_dir) / "vocab.txt", 'w', encoding="utf-8") as fp:
        fp.write("[PAD]" + "\n")
        for line in f:
            line = line.strip()
            fp.write(line)
            fp.write("\n")

    # Override some of the parameters in the config
    overrides_config = {
        "dataset_reader.model_name": bert_temp_dir,
        "model.model_name": params["model"]["model_name"] + "/config.json"
    }
    self.overrides_config = json.dumps(overrides_config)
    params = Params.from_file(self.param_file, params_overrides=self.overrides_config)

    # Get the dataset reader
    reader = DatasetReader.from_params(params["dataset_reader"])
    instances = reader.read(str(dataset_file))

    # If vocabulary parameters are present, build the vocabulary from them
    if "vocabulary" in params:
        vocab_params = params["vocabulary"]
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.instances.index_with(vocab)

    # Load the model, with its model_name pointing at the corresponding config file
    self.model = Model.from_params(params=params["model"], vocab=self.vocab)

    self.dataset = Batch(list(self.instances))
    self.dataset.index_instances(self.vocab)
    self.TEST_DIR = Path(tempfile.mkdtemp(prefix="allennlp_tests"))
def make_vocab_from_params(
    params: Params, serialization_dir: str, print_statistics: bool = False
) -> Vocabulary:
    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir) is not None:
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    instances: Iterable[Instance] = (
        instance
        for key, dataset in all_datasets.items()
        if key in datasets_for_vocab_creation
        for instance in dataset
    )

    if print_statistics:
        instances = list(instances)

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")

    if print_statistics:
        dataset = Batch(instances)
        dataset.index_instances(vocab)
        dataset.print_statistics()
        vocab.print_statistics()

    return vocab
def get_knowbert_model():
    vocab = Vocabulary.from_params(Params({
        "directory_path": "tests/fixtures/kg_embeddings/tucker_wordnet/vocabulary",
    }))

    params = Params({
        "type": "knowbert",
        "soldered_kgs": {
            "wordnet": {
                "type": "soldered_kg",
                "entity_linker": {
                    "type": "entity_linking_with_candidate_mentions",
                    "kg_model": {
                        "type": "from_archive",
                        "archive_file": "tests/fixtures/kg_embeddings/tucker_wordnet/model.tar.gz",
                    },
                    "contextual_embedding_dim": 12,
                    "max_sequence_length": 64,
                    "span_encoder_config": {
                        "hidden_size": 24,
                        "num_hidden_layers": 1,
                        "num_attention_heads": 3,
                        "intermediate_size": 37
                    },
                },
                "span_attention_config": {
                    "hidden_size": 24,
                    "num_hidden_layers": 2,
                    "num_attention_heads": 4,
                    "intermediate_size": 55
                }
            },
        },
        "soldered_layers": {"wordnet": 1},
        "bert_model_name": "tests/fixtures/bert/bert_test_fixture.tar.gz",
    })

    model = Model.from_params(params, vocab=vocab)

    return model, vocab
def setup_model(params_file, dataset_file):
    params = Params.from_file(params_file)

    # reader = DatasetReader.from_params(params['dataset_reader'])
    reader = ToxicReader()
    instances = reader.read(str(dataset_file))
    Vocabulary.from_instances(instances)

    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    vocab.save_to_files("new_vocab2")

    dataset = Batch(instances)
    dataset.index_instances(vocab)
    print(dataset.as_tensor_dict())
def __init__(self, model_archive, batch_size=32,
             masking_strategy=None,
             wordnet_entity_file=None, vocab_dir=None):

    # get bert_tokenizer_and_candidate_generator
    if os.path.isdir(model_archive):
        config = Params.from_file(os.path.join(model_archive, 'config.json'))
    else:
        config = _extract_config_from_archive(cached_path(model_archive))

    # look for the bert_tokenizers and candidate_generator
    candidate_generator_params = _find_key(
        config['dataset_reader'].as_dict(), 'tokenizer_and_candidate_generator')

    if wordnet_entity_file is not None:
        candidate_generator_params['entity_candidate_generators']['wordnet']['entity_file'] = \
            wordnet_entity_file

    self.tokenizer_and_candidate_generator = TokenizerAndCandidateGenerator.\
        from_params(Params(candidate_generator_params))
    self.tokenizer_and_candidate_generator.whitespace_tokenize = False

    assert masking_strategy is None or masking_strategy == 'full_mask'
    self.masking_strategy = masking_strategy

    # need bert_tokenizer_and_candidate_generator
    if vocab_dir is not None:
        vocab_params = Params({"directory_path": vocab_dir})
    else:
        vocab_params = config['vocabulary']
    self.vocab = Vocabulary.from_params(vocab_params)

    self.iterator = DataIterator.from_params(
        Params({"type": "basic", "batch_size": batch_size}))
    self.iterator.index_with(self.vocab)
def get_wsd_reader(is_training, use_bert_indexer=False, wordnet_entity_file=None):
    if wordnet_entity_file is None:
        wordnet_entity_file = "tests/fixtures/wordnet/entities_cat_hat.jsonl"

    if use_bert_indexer:
        bert_fixtures = get_bert_test_fixture()
        indexer_params = bert_fixtures["indexer_params"]
    else:
        indexer_params = {"type": "single_id", "lowercase_tokens": True}

    reader_params = {
        "type": "wordnet_fine_grained",
        "wordnet_entity_file": wordnet_entity_file,
        "token_indexers": {
            "tokens": indexer_params,
        },
        "entity_indexer": {
            "type": "characters_tokenizer",
            "tokenizer": {
                "type": "word",
                "word_splitter": {"type": "just_spaces"},
            },
            "namespace": "entity"
        },
        "is_training": is_training,
        "use_surface_form": False
    }
    reader = DatasetReader.from_params(Params(reader_params))

    vocab_params = {"directory_path": "tests/fixtures/wordnet/cat_hat_vocabdir"}
    vocab = Vocabulary.from_params(Params(vocab_params))

    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(vocab)

    return reader, vocab, iterator
def test_bert_transformer_predictor(self):
    param_file = FIXTURES_ROOT / "pointer_rewrite" / "bert_transformer_pointer_rewrite.jsonnet"
    params = Params.from_file(param_file)

    # Build a vocab usable by the BERT model, kept consistent with the `vocabulary` vocab
    vocab_path = params["dataset_reader"]["vocab_path"]
    # Path for the newly generated BERT vocab
    bert_temp_dir = tempfile.mkdtemp(suffix="bert")
    with open(Path(vocab_path) / "tokens.txt", 'r', encoding="utf-8") as f, \
            open(Path(bert_temp_dir) / "vocab.txt", 'w', encoding="utf-8") as fp:
        fp.write("[PAD]" + "\n")
        for line in f:
            line = line.strip()
            fp.write(line)
            fp.write("\n")

    # Override some of the parameters in the config
    overrides_config = {
        "dataset_reader.model_name": bert_temp_dir,
        "model.model_name": params["model"]["model_name"] + "/config.json"
    }
    overrides_config = json.dumps(overrides_config)

    # Reload the parameters with the overrides applied
    params = Params.from_file(param_file, params_overrides=overrides_config)

    # Get the dataset reader
    reader = DatasetReader.from_params(params["dataset_reader"])

    # If vocabulary parameters are present, build the vocabulary from them
    if "vocabulary" in params:
        vocab_params = params["vocabulary"]
        vocab = Vocabulary.from_params(params=vocab_params)
    else:
        vocab = Vocabulary()

    # Load the model, with its model_name pointing at the corresponding config file
    model = Model.from_params(params=params["model"], vocab=vocab)

    predictor = PointerRewritePredictor(dataset_reader=reader, model=model)
    result = predictor.predict(self.context, self.query)
    self.assertTrue("rewrite_results" in result)
    assert isinstance(result["rewrite_results"], str)
def test_lstm_lstm_predictor(self):
    param_file = FIXTURES_ROOT / "pointer_rewrite" / "lstm_lstm_pointer_rewrite.jsonnet"
    params = Params.from_file(param_file)

    # Get the dataset reader
    reader = DatasetReader.from_params(params["dataset_reader"])

    # Get the model
    # If vocabulary parameters are present, build the vocabulary from them
    if "vocabulary" in params:
        vocab_params = params["vocabulary"]
        vocab = Vocabulary.from_params(params=vocab_params)
    else:
        vocab = Vocabulary()

    # Load the model
    model = Model.from_params(params=params["model"], vocab=vocab)

    predictor = PointerRewritePredictor(dataset_reader=reader, model=model)
    result = predictor.predict(self.context, self.query)
    self.assertTrue("rewrite_results" in result)
    assert isinstance(result["rewrite_results"], str)
def test_kg_reader_with_eval(self):
    train_file = 'tests/fixtures/kg_embeddings/wn18rr_train.txt'
    dev_file = 'tests/fixtures/kg_embeddings/wn18rr_dev.txt'

    train_instances = KGTupleReader().read(train_file)

    reader = KGTupleReader(extra_files_for_gold_pairs=[train_file])
    instances = reader.read(dev_file)
    self.assertEqual(len(instances), 2)

    vocab = Vocabulary.from_params(Params({}), train_instances + instances)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        pass

    expected_entity = [1, 5]
    expected_relation = ['_hypernym', '_hypernym_reverse']
    expected_entity2 = [[5, 2, 3], [1, 4]]
    self._check_batch(batch, vocab, expected_entity, expected_relation, expected_entity2)
def set_up_model(self, param_file, dataset_file):
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params['dataset_reader'])
    instances = reader.read(dataset_file)
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(vocab=self.vocab, params=params['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
def knowbert_fill2(sentences, model, batcher, vocab, mask_start=0, mask_end=0,
                   config_file=None, top=10):
    iterator = DataIterator.from_params(Params({"type": "basic", "batch_size": 32}))
    config = Params.from_file(config_file)
    vocab_params = config['vocabulary']
    iterator.index_with(Vocabulary.from_params(vocab_params))

    instances = []
    for sent in sentences:
        token_candidates = batcher.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(
            sent.replace('[MASK]', ' [MASK] '))
        masked_tokens = token_candidates['tokens'].copy()
        for i in range(mask_start, mask_end):
            masked_tokens[i] = '[MASK]'
        token_candidates['tokens'] = masked_tokens

        # mask out the entity candidates
        candidates = token_candidates['candidates']
        for candidate_key in candidates.keys():
            indices_to_mask = []
            for k, candidate_span in enumerate(candidates[candidate_key]['candidate_spans']):
                if (candidate_span[0] >= mask_start and candidate_span[0] <= mask_end - 1) or \
                        (candidate_span[1] >= mask_start and candidate_span[1] <= mask_end - 1):
                    indices_to_mask.append(k)
            for ind in indices_to_mask:
                candidates[candidate_key]['candidate_entities'][ind] = ['@@MASK@@']
                candidates[candidate_key]['candidate_entity_priors'][ind] = [1.0]
            if len(indices_to_mask) == 0:
                candidates[candidate_key]['candidate_spans'].append([mask_start, mask_end - 1])
                candidates[candidate_key]['candidate_entities'].append(['@@MASK@@'])
                candidates[candidate_key]['candidate_entity_priors'].append([1.0])
                candidates[candidate_key]['candidate_segment_ids'].append(0)

        fields = batcher.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(
            token_candidates)
        instances.append(Instance(fields))

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        print(batch['tokens']['tokens'])
        model_output = model(**batch)
        print([vocab[w] for w in batch['tokens']['tokens'][0].numpy()])
        logits, _ = model.pretraining_heads(
            model_output['contextual_embeddings'], model_output['pooled_output'])
        log_probs = F.log_softmax(logits, dim=-1).cpu()
        for mask_ind in range(mask_start, mask_end):
            topk = torch.topk(log_probs[0, mask_ind], top, -1)[1]
            print([vocab[t.item()] for t in topk])
def set_up_model(self, param_file, dataset_file):
    self.param_file = param_file
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(params['dataset_reader'])
    instances = reader.read(dataset_file)
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if 'vocabulary' in params:
        vocab_params = params['vocabulary']
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(vocab=self.vocab, params=params['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
def set_up_model(
    self,
    param_file: PathLike,
    dataset_file: PathLike,
    serialization_dir: PathLike = None,
    seed: int = None,
):
    if seed is not None:
        random.seed(seed)
        numpy.random.seed(seed)
        torch.manual_seed(seed)

    self.param_file = str(param_file)
    params = Params.from_file(self.param_file)

    reader = DatasetReader.from_params(
        params["dataset_reader"], serialization_dir=serialization_dir
    )
    # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
    instances = list(reader.read(str(dataset_file)))
    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if "vocabulary" in params:
        vocab_params = params["vocabulary"]
        vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
    else:
        vocab = Vocabulary.from_instances(instances)
    self.vocab = vocab
    self.instances = instances
    self.model = Model.from_params(
        vocab=self.vocab, params=params["model"], serialization_dir=serialization_dir
    )

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    self.dataset = Batch(self.instances)
    self.dataset.index_instances(self.vocab)
def test_iterator():
    indexer = StaticFasttextTokenIndexer(
        model_path="./data/fasttext_embedding.model",
        model_params_path="./data/fasttext_embedding.model.params")

    loader = MenionsLoader(
        category_mapping_file='./data/test_category_mapping.json',
        token_indexers={"tokens": indexer},
        tokenizer=WordTokenizer(word_splitter=FastSplitter()))

    vocab = Vocabulary.from_params(Params({"directory_path": "./data/vocab2/"}))

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)

    limit = 50
    for _ in tqdm.tqdm(iterator(loader.read('./data/train_data_aa.tsv'), num_epochs=1),
                       mininterval=2):
        limit -= 1
        if limit <= 0:
            break
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during
        the middle of a run. For continuing training a model on new data, see the
        ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
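# NOTE: Illustrative only, not part of the original sources. `train_model` above is normally
# driven from a parsed experiment config; the config path and serialization directory below
# are placeholders.
from allennlp.common import Params

params = Params.from_file("experiments/my_experiment.jsonnet")
best_model = train_model(params, serialization_dir="output/my_run", recover=False)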
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs a learning rate search for the given ``num_batches`` and saves the results in
    ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate at which to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by a multiple of
        the stopping factor. If ``None``, the search proceeds up to ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
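# NOTE: Illustrative only, not part of the original sources. A minimal invocation of the
# learning-rate finder above; the config path and output directory are placeholders.
from allennlp.common import Params

params = Params.from_file("experiments/my_experiment.jsonnet")
find_learning_rate_model(
    params,
    serialization_dir="output/lr_search",
    start_lr=1e-5,
    end_lr=1.0,
    num_batches=100,
    force=True,
)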
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during
        the middle of a run. For continuing training a model on new data, see the
        ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.params.get('trainer').get('cuda_device', -1))

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params,
                                  validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return best_model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during
        the middle of a run. For continuing training a model on new data, see the
        ``fine-tune`` command.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model