def test_mismatched_dimensions_raise_configuration_errors(self): params = Params.from_file(self.param_file) # Make the input_dim to the first feedforward_layer wrong - it should be 2. params["model"]["attend_feedforward"]["input_dim"] = 10 with pytest.raises(ConfigurationError): Model.from_params(vocab=self.vocab, params=params.pop("model")) params = Params.from_file(self.param_file) # Make the projection output_dim of the last layer wrong - it should be # 3, equal to the number of classes. params["model"]["aggregate_feedforward"]["output_dim"] = 10 with pytest.raises(ConfigurationError): Model.from_params(vocab=self.vocab, params=params.pop("model"))
def create_serialization_dir(params: Params, serialization_dir: str, recover: bool) -> None:
    """
    This function creates the serialization directory if it doesn't exist.  If it already exists,
    then it verifies that we're recovering from a training with an identical configuration.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: ``str``
        The directory in which to save results and logs.
    recover: ``bool``
        If ``True``, we will try to recover from an existing serialization directory, and crash if
        the directory doesn't exist, or doesn't match the configuration we're given.
    """
    if os.path.exists(serialization_dir):
        if serialization_dir == '/output':
            # Special-casing the beaker output directory, which will already exist when training
            # starts.
            return

        if not recover:
            raise ConfigurationError(f"Serialization directory ({serialization_dir}) already exists.  "
                                     f"Specify --recover to recover training from existing output.")

        logger.info(f"Recovering from prior training at {serialization_dir}.")

        recovered_config_file = os.path.join(serialization_dir, CONFIG_NAME)
        if not os.path.exists(recovered_config_file):
            raise ConfigurationError("The serialization directory already exists but doesn't "
                                     "contain a config.json. You probably gave the wrong directory.")
        else:
            loaded_params = Params.from_file(recovered_config_file)

            # Check whether any of the training configuration differs from the configuration we are
            # resuming.  If so, warn the user that training may fail.
            fail = False
            flat_params = params.as_flat_dict()
            flat_loaded = loaded_params.as_flat_dict()
            for key in flat_params.keys() - flat_loaded.keys():
                logger.error(f"Key '{key}' found in training configuration but not in the serialization "
                             f"directory we're recovering from.")
                fail = True
            for key in flat_loaded.keys() - flat_params.keys():
                logger.error(f"Key '{key}' found in the serialization directory we're recovering from "
                             f"but not in the training config.")
                fail = True
            for key in flat_params.keys():
                if flat_params.get(key, None) != flat_loaded.get(key, None):
                    logger.error(f"Value for '{key}' in training configuration does not match the value in "
                                 f"the serialization directory we're recovering from: "
                                 f"{flat_params[key]} != {flat_loaded[key]}")
                    fail = True
            if fail:
                raise ConfigurationError("Training configuration does not match the configuration we're "
                                         "recovering from.")
    else:
        if recover:
            raise ConfigurationError(f"--recover specified but serialization_dir ({serialization_dir}) "
                                     "does not exist.  There is nothing to recover from.")
        os.makedirs(serialization_dir)
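# Usage sketch for create_serialization_dir above (not from the original source);
# the config filename and run directory are illustrative assumptions.
params = Params.from_file("experiments/my_experiment.json")
# Fresh run: the directory must not yet exist (apart from the special-cased
# beaker /output directory), otherwise a ConfigurationError is raised.
create_serialization_dir(params, "runs/my_experiment", recover=False)
# Resuming a crashed run (after training has written config.json): recover=True
# makes the function verify the stored config matches `params` key-for-key.
create_serialization_dir(params, "runs/my_experiment", recover=True)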
def fine_tune_model_from_file_paths(model_archive_path: str,
                                    config_file: str,
                                    serialization_dir: str,
                                    overrides: str = "",
                                    file_friendly_logging: bool = False) -> Model:
    """
    A wrapper around :func:`fine_tune_model` which loads the model archive from a file.

    Parameters
    ----------
    model_archive_path : ``str``
        Path to a saved model archive that is the result of running the ``train`` command.
    config_file : ``str``
        A configuration file specifying how to continue training.  The format is identical to the
        configuration file for the ``train`` command, but any contents in the ``model`` section are
        ignored (as we are using the provided model archive instead).
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`fine_tune_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`fine_tune_model`.
    """
    # We don't need to pass in `cuda_device` here, because the trainer will call `model.cuda()` if
    # necessary.
    archive = load_archive(model_archive_path)
    params = Params.from_file(config_file, overrides)
    return fine_tune_model(model=archive.model,
                           params=params,
                           serialization_dir=serialization_dir,
                           file_friendly_logging=file_friendly_logging)
def train_model_from_file(parameter_filename: str,
                          serialization_dir: str,
                          overrides: str = "",
                          file_friendly_logging: bool = False,
                          recover: bool = False) -> Model:
    """
    A wrapper around :func:`train_model` which loads the params from a file.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`train_model`.
    overrides : ``str``
        A HOCON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`train_model`.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during
        the middle of a run.  For continuing training a model on new data, see the
        ``fine-tune`` command.
    """
    # Load the experiment config from a file and pass it to ``train_model``.
    params = Params.from_file(parameter_filename, overrides)
    return train_model(params, serialization_dir, file_friendly_logging, recover)
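# Hedged example of calling the train_model_from_file wrapper above; the file
# names are illustrative assumptions.  This is the HOCON-era API, so the
# overrides string is parsed as HOCON (plain JSON is also valid HOCON).
model = train_model_from_file(parameter_filename="experiments/bidaf.json",
                              serialization_dir="runs/bidaf",
                              overrides='{"trainer": {"num_epochs": 1}}',
                              recover=False)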
def test_file_archiving(self): # This happens to be a good place to test auxiliary file archiving. # Train the model params = Params.from_file(self.FIXTURES_ROOT / 'elmo' / 'config' / 'characters_token_embedder.json') serialization_dir = os.path.join(self.TEST_DIR, 'serialization') train_model(params, serialization_dir) # Inspect the archive archive_file = os.path.join(serialization_dir, 'model.tar.gz') unarchive_dir = os.path.join(self.TEST_DIR, 'unarchive') with tarfile.open(archive_file, 'r:gz') as archive: archive.extractall(unarchive_dir) # It should contain `files_to_archive.json` fta_file = os.path.join(unarchive_dir, 'files_to_archive.json') assert os.path.exists(fta_file) # Which should properly contain { flattened_key -> original_filename } with open(fta_file) as fta: files_to_archive = json.loads(fta.read()) assert files_to_archive == { 'model.text_field_embedder.token_embedders.elmo.options_file': str(pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo' / 'options.json'), 'model.text_field_embedder.token_embedders.elmo.weight_file': str(pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo' / 'lm_weights.hdf5'), } # Check that the unarchived contents of those files match the original contents. for key, original_filename in files_to_archive.items(): new_filename = os.path.join(unarchive_dir, "fta", key) assert filecmp.cmp(original_filename, new_filename)
def setUp(self): super(TestCopyNetReader, self).setUp() params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json") self.reader = DatasetReader.from_params(params["dataset_reader"]) instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv") self.instances = ensure_list(instances) self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
def test_batch_predictions_are_consistent(self): # The CNN encoder has problems with this kind of test - it's not properly masked yet, so # changing the amount of padding in the batch will result in small differences in the # output of the encoder. Because BiDAF is so deep, these differences get magnified through # the network and make this test impossible. So, we'll remove the CNN encoder entirely # from the model for this test. If/when we fix the CNN encoder to work correctly with # masking, we can change this back to how the other models run this test, with just a # single line. # pylint: disable=protected-access,attribute-defined-outside-init # Save some state. saved_model = self.model saved_instances = self.instances # Modify the state, run the test with modified state. params = Params.from_file(self.param_file) reader = DatasetReader.from_params(params['dataset_reader']) reader._token_indexers = {'tokens': reader._token_indexers['tokens']} self.instances = reader.read('tests/fixtures/data/squad.json') vocab = Vocabulary.from_instances(self.instances) for instance in self.instances: instance.index_fields(vocab) del params['model']['text_field_embedder']['token_characters'] params['model']['phrase_layer']['input_size'] = 2 self.model = Model.from_params(vocab, params['model']) self.ensure_batch_predictions_are_consistent() # Restore the state. self.model = saved_model self.instances = saved_instances
def test_load_from_file(self): filename = 'tests/fixtures/bidaf/experiment.json' params = Params.from_file(filename) assert "dataset_reader" in params assert "trainer" in params model_params = params.pop("model") assert model_params.pop("type") == "bidaf"
def find_learning_rate_from_args(args: argparse.Namespace) -> None: """ Start learning rate finder for given args """ params = Params.from_file(args.param_path, args.overrides) find_learning_rate_model(params, args.serialization_dir, start_lr=args.start_lr, end_lr=args.end_lr, num_batches=args.num_batches, linear_steps=args.linear, stopping_factor=args.stopping_factor, force=args.force)
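# Hedged sketch of driving find_learning_rate_from_args above without a CLI
# parser: argparse.Namespace accepts arbitrary keyword attributes, so a test
# can build the args object directly.  The attribute names mirror the ones the
# function reads; the concrete values and paths are assumptions.
import argparse

args = argparse.Namespace(param_path="experiments/model.json",
                          overrides="",
                          serialization_dir="runs/lr_find",
                          start_lr=1e-5,
                          end_lr=10.0,
                          num_batches=100,
                          linear=False,
                          stopping_factor=None,
                          force=True)
find_learning_rate_from_args(args)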
def test_mismatching_dimensions_throws_configuration_error(self):
    params = Params.from_file(self.param_file)
    # Make the phrase layer wrong - it should be 10 to match
    # the embedding + char cnn dimensions.
    params["model"]["phrase_layer"]["input_size"] = 12
    with pytest.raises(ConfigurationError):
        Model.from_params(self.vocab, params.pop("model"))

    params = Params.from_file(self.param_file)
    # Make the modeling layer input_dimension wrong - it should be 40 to match
    # 4 * output_dim of the phrase_layer.
    params["model"]["modeling_layer"]["input_size"] = 30
    with pytest.raises(ConfigurationError):
        Model.from_params(self.vocab, params.pop("model"))

    params = Params.from_file(self.param_file)
    # Make the span end encoder input_dimension wrong - it should be 70 to match
    # 4 * phrase_layer.output_dim + 3 * modeling_layer.output_dim.
    params["model"]["span_end_encoder"]["input_size"] = 50
    with pytest.raises(ConfigurationError):
        Model.from_params(self.vocab, params.pop("model"))
def train_fixture_gpu(config_prefix: str) -> None:
    config_file = config_prefix + 'experiment.json'
    serialization_dir = config_prefix + 'serialization'
    params = Params.from_file(config_file)
    params["trainer"]["cuda_device"] = 0

    # train this one to a tempdir
    tempdir = tempfile.gettempdir()
    train_model(params, tempdir)

    # now copy back the weights and the archived model
    shutil.copy(os.path.join(tempdir, "best.th"), os.path.join(serialization_dir, "best_gpu.th"))
    shutil.copy(os.path.join(tempdir, "model.tar.gz"), os.path.join(serialization_dir, "model_gpu.tar.gz"))
def test_overrides(self): filename = 'tests/fixtures/bidaf/experiment.json' overrides = '{ "train_data_path": "FOO", "model": { "type": "BAR" },'\ 'model.text_field_embedder.tokens.type: "BAZ" }' params = Params.from_file(filename, overrides) assert "dataset_reader" in params assert "trainer" in params assert params["train_data_path"] == "FOO" model_params = params.pop("model") assert model_params.pop("type") == "BAR" assert model_params["text_field_embedder.tokens.type"] == "BAZ"
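# Hedged standalone illustration of the override semantics exercised by
# test_overrides above: overrides are merged into (not substituted for) the
# file's params, and a dotted HOCON path can target a single nested value.
# The fixture path is the same one the test uses; the overridden key is an
# assumption about that fixture's contents.
params = Params.from_file('tests/fixtures/bidaf/experiment.json',
                          '{ "trainer": { "num_epochs": 1 } }')
assert "dataset_reader" in params            # untouched keys survive the merge
assert params["trainer"]["num_epochs"] == 1  # the overridden value wins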
def test_forward_with_epoch_num_changes_cost_weight(self):
    # Redefining model. We do not want this to change the state of ``self.model``.
    params = Params.from_file(self.param_file)
    model = Model.from_params(vocab=self.vocab, params=params['model'])
    # Initial cost weight, before forward is called.
    assert model._checklist_cost_weight == 0.8
    iterator = EpochTrackingBucketIterator(sorting_keys=[['sentence', 'num_tokens']])
    cost_weights = []
    for epoch_data in iterator(self.dataset, num_epochs=4):
        model.forward(**epoch_data)
        cost_weights.append(model._checklist_cost_weight)
    # The config file has ``wait_num_epochs`` set to 0, so the model starts decreasing the cost
    # weight at epoch 0 itself, multiplying by a decay rate of 0.9 each epoch: 0.8 * 0.9 ** (n + 1).
    assert_almost_equal(cost_weights, [0.72, 0.648, 0.5832, 0.52488])
def set_up_model(self, param_file, dataset_file): # pylint: disable=attribute-defined-outside-init self.param_file = param_file params = Params.from_file(self.param_file) reader = DatasetReader.from_params(params['dataset_reader']) instances = reader.read(dataset_file) vocab = Vocabulary.from_instances(instances) self.vocab = vocab self.instances = instances self.model = Model.from_params(self.vocab, params['model']) # TODO(joelgrus) get rid of these # (a lot of the model tests use them, so they'll have to be changed) self.dataset = Batch(self.instances) self.dataset.index_instances(self.vocab)
def main(serialization_directory, device): """ serialization_directory : str, required. The directory containing the serialized weights. device: int, default = -1 The device to run the evaluation on. """ config = Params.from_file(os.path.join(serialization_directory, "config.json")) dataset_reader = DatasetReader.from_params(config['dataset_reader']) evaluation_data_path = config['validation_data_path'] model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device) prediction_file_path = os.path.join(serialization_directory, "predictions.txt") gold_file_path = os.path.join(serialization_directory, "gold.txt") prediction_file = open(prediction_file_path, "w+") gold_file = open(gold_file_path, "w+") # Load the evaluation data and index it. print("Reading evaluation data from {}".format(evaluation_data_path)) instances = dataset_reader.read(evaluation_data_path) iterator = BasicIterator(batch_size=32) iterator.index_with(model.vocab) model_predictions = [] batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device, for_training=False) for batch in Tqdm.tqdm(batches): result = model(**batch) predictions = model.decode(result) model_predictions.extend(predictions["tags"]) for instance, prediction in zip(instances, model_predictions): fields = instance.fields try: # Most sentences have a verbal predicate, but not all. verb_index = fields["verb_indicator"].labels.index(1) except ValueError: verb_index = None gold_tags = fields["tags"].labels sentence = fields["tokens"].tokens write_to_conll_eval_file(prediction_file, gold_file, verb_index, sentence, prediction, gold_tags) prediction_file.close() gold_file.close()
def set_up_model(self, param_file, dataset_file): # pylint: disable=attribute-defined-outside-init self.param_file = param_file params = Params.from_file(self.param_file) reader = DatasetReader.from_params(params['dataset_reader']) instances = reader.read(dataset_file) # Use parameters for vocabulary if they are present in the config file, so that choices like # "non_padded_namespaces", "min_count" etc. can be set if needed. if 'vocabulary' in params: vocab_params = params['vocabulary'] vocab = Vocabulary.from_params(params=vocab_params, instances=instances) else: vocab = Vocabulary.from_instances(instances) self.vocab = vocab self.instances = instances self.model = Model.from_params(vocab=self.vocab, params=params['model']) # TODO(joelgrus) get rid of these # (a lot of the model tests use them, so they'll have to be changed) self.dataset = Batch(self.instances) self.dataset.index_instances(self.vocab)
def test_embed_actions_works_with_batched_and_padded_input(self): params = Params.from_file(self.param_file) model = Model.from_params(vocab=self.vocab, params=params['model']) action_embedding_weights = model._action_embedder.weight rule1 = model.vocab.get_token_from_index(1, 'rule_labels') rule1_tensor = torch.LongTensor([1]) rule2 = model.vocab.get_token_from_index(2, 'rule_labels') rule2_tensor = torch.LongTensor([2]) rule3 = model.vocab.get_token_from_index(3, 'rule_labels') rule3_tensor = torch.LongTensor([3]) actions = [[(rule1, True, rule1_tensor), (rule2, True, rule2_tensor), # This one is padding; the tensors shouldn't matter here. ('', False, None)], [(rule3, True, rule3_tensor), ('instance_action', False, None), (rule1, True, rule1_tensor)]] embedded_actions, _, _, action_indices = model._embed_actions(actions) assert action_indices[(0, 0)] == action_indices[(1, 2)] assert action_indices[(1, 1)] == -1 assert len(set(action_indices.values())) == 4 # Now we'll go through all three unique actions and make sure the embedding is as we expect. action_embedding = embedded_actions[action_indices[(0, 0)]] expected_action_embedding = action_embedding_weights[action_indices[(0, 0)]] assert_almost_equal(action_embedding.cpu().data.numpy(), expected_action_embedding.cpu().data.numpy()) action_embedding = embedded_actions[action_indices[(0, 1)]] expected_action_embedding = action_embedding_weights[action_indices[(0, 1)]] assert_almost_equal(action_embedding.cpu().data.numpy(), expected_action_embedding.cpu().data.numpy()) action_embedding = embedded_actions[action_indices[(1, 0)]] expected_action_embedding = action_embedding_weights[action_indices[(1, 0)]] assert_almost_equal(action_embedding.cpu().data.numpy(), expected_action_embedding.cpu().data.numpy())
def train_model_from_file(parameter_filename: str,
                          serialization_dir: str,
                          overrides: str = "",
                          file_friendly_logging: bool = False,
                          recover: bool = False,
                          force: bool = False,
                          cache_directory: str = None,
                          cache_prefix: str = None) -> Model:
    """
    A wrapper around :func:`train_model` which loads the params from a file.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`train_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`train_model`.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during
        the middle of a run.  For continuing training a model on new data, see the
        ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    """
    # Load the experiment config from a file and pass it to ``train_model``.
    params = Params.from_file(parameter_filename, overrides)
    return train_model(params,
                       serialization_dir,
                       file_friendly_logging,
                       recover,
                       force,
                       cache_directory,
                       cache_prefix)
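# Hedged usage of the newer train_model_from_file signature above (JSON-overrides
# era, with dataset caching); all paths are illustrative assumptions.
model = train_model_from_file(parameter_filename="experiments/my_model.jsonnet",
                              serialization_dir="runs/my_model",
                              overrides='{"trainer": {"cuda_device": -1}}',
                              force=True,                 # overwrite runs/my_model if it exists
                              cache_directory="caches/",  # reuse preprocessed datasets across runs
                              cache_prefix="my_model")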
def load_bert_reader_model_experiment_dir(experiment_dir: str, cuda_device: int = -1): # check values of existing config config_file = os.path.join(experiment_dir, 'config.json') config = Params.from_file(config_file) # instantiate dataset reader print(config['dataset_reader']) reader = DatasetReader.from_params(config["dataset_reader"]) # instantiate model w/ pretrained weights model = Model.load( config.duplicate(), weights_file=os.path.join(experiment_dir, 'best.th'), serialization_dir=experiment_dir, cuda_device=cuda_device, ) # set training=false for prediction model.eval() return reader, model
def train_model_from_file(parameter_filename: str,
                          serialization_dir: str,
                          overrides: str = "",
                          file_friendly_logging: bool = False) -> Model:
    """
    A wrapper around :func:`train_model` which loads the params from a file.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`train_model`.
    overrides : ``str``
        A HOCON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`train_model`.
    """
    # Load the experiment config from a file and pass it to ``train_model``.
    params = Params.from_file(parameter_filename, overrides)
    return train_model(params, serialization_dir, file_friendly_logging)
def set_up_model(self, param_file, dataset_file): self.param_file = param_file params = Params.from_file(self.param_file) reader = DatasetReader.from_params(params["dataset_reader"]) # The dataset reader might be lazy, but a lazy list here breaks some of our tests. instances = list(reader.read(str(dataset_file))) # Use parameters for vocabulary if they are present in the config file, so that choices like # "non_padded_namespaces", "min_count" etc. can be set if needed. if "vocabulary" in params: vocab_params = params["vocabulary"] vocab = Vocabulary.from_params(params=vocab_params, instances=instances) else: vocab = Vocabulary.from_instances(instances) self.vocab = vocab self.instances = instances self.model = Model.from_params(vocab=self.vocab, params=params["model"]) # TODO(joelgrus) get rid of these # (a lot of the model tests use them, so they'll have to be changed) self.dataset = Batch(self.instances) self.dataset.index_instances(self.vocab)
def set_up_model(self, param_file, dataset_file): self.param_file = param_file params = Params.from_file(self.param_file) reader = DatasetReader.from_params(params['dataset_reader']) instances = reader.read(dataset_file) # Use parameters for vocabulary if they are present in the config file, so that choices like # "non_padded_namespaces", "min_count" etc. can be set if needed. if 'vocabulary' in params: vocab_params = params['vocabulary'] vocab = Vocabulary.from_params(params=vocab_params, instances=instances) else: vocab = Vocabulary.from_instances(instances) self.vocab = vocab self.instances = instances self.model = Model.from_params(vocab=self.vocab, params=params['model']) # TODO(joelgrus) get rid of these # (a lot of the model tests use them, so they'll have to be changed) self.dataset = Batch(self.instances) self.dataset.index_instances(self.vocab)
def cache_vocab(params: Params): """ Caches the vocabulary given in the Params to the filesystem. Useful for large datasets that are run repeatedly. :param params: the AllenNLP Params """ if "vocabulary" not in params or "directory_path" not in params["vocabulary"]: return vocab_path = params["vocabulary"]["directory_path"] if os.path.exists(vocab_path): if os.listdir(vocab_path): return # Remove empty vocabulary directory to make AllenNLP happy try: os.rmdir(vocab_path) except OSError: pass params = merge_configs([params, Params.from_file(VOCAB_CONFIG_PATH)]) params["vocabulary"].pop("directory_path", None) make_vocab_from_params(params, os.path.split(vocab_path)[0])
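# Hedged sketch of a Params layout that cache_vocab above operates on; the
# directory name is an assumption.  With an empty or missing directory the
# vocabulary is built once via make_vocab_from_params; on later runs the
# populated directory makes the function return immediately.
params = Params({
    "vocabulary": {"directory_path": "caches/my_run/vocabulary"},
    # ... dataset_reader, train_data_path, etc.
})
cache_vocab(params)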
def fine_tune_model_from_file_paths(model_archive_path: str,
                                    config_file: str,
                                    serialization_dir: str,
                                    overrides: str = "",
                                    extend_vocab: bool = False,
                                    file_friendly_logging: bool = False,
                                    batch_weight_key: str = "") -> Model:
    """
    A wrapper around :func:`fine_tune_model` which loads the model archive from a file.

    Parameters
    ----------
    model_archive_path : ``str``
        Path to a saved model archive that is the result of running the ``train`` command.
    config_file : ``str``
        A configuration file specifying how to continue training.  The format is identical to the
        configuration file for the ``train`` command, but any contents in the ``model`` section are
        ignored (as we are using the provided model archive instead).
    serialization_dir : ``str``
        The directory in which to save results and logs. We just pass this along to
        :func:`fine_tune_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    extend_vocab : ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend the model's vocabulary.  We just pass this
        along to :func:`fine_tune_model`.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`fine_tune_model`.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, the name of the key used to weight the loss on a per-batch basis.  We just
        pass this along to :func:`fine_tune_model`.
    """
    # We don't need to pass in `cuda_device` here, because the trainer will call `model.cuda()` if
    # necessary.
    archive = load_archive(model_archive_path)
    params = Params.from_file(config_file, overrides)
    return fine_tune_model(model=archive.model,
                           params=params,
                           serialization_dir=serialization_dir,
                           extend_vocab=extend_vocab,
                           file_friendly_logging=file_friendly_logging,
                           batch_weight_key=batch_weight_key)
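# Hedged usage of fine_tune_model_from_file_paths above; the paths are
# illustrative assumptions.
model = fine_tune_model_from_file_paths(model_archive_path="runs/base/model.tar.gz",
                                        config_file="experiments/fine_tune.json",
                                        serialization_dir="runs/fine_tuned",
                                        extend_vocab=True)  # grow the vocabulary from the new data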
def test_file_archiving(self): # This happens to be a good place to test auxiliary file archiving. # Train the model params = Params.from_file(self.FIXTURES_ROOT / "elmo" / "config" / "characters_token_embedder.json") serialization_dir = os.path.join(self.TEST_DIR, "serialization") train_model(params, serialization_dir) # Inspect the archive archive_file = os.path.join(serialization_dir, "model.tar.gz") unarchive_dir = os.path.join(self.TEST_DIR, "unarchive") with tarfile.open(archive_file, "r:gz") as archive: archive.extractall(unarchive_dir) # It should contain `files_to_archive.json` fta_file = os.path.join(unarchive_dir, "files_to_archive.json") assert os.path.exists(fta_file) # Which should properly contain { flattened_key -> original_filename } with open(fta_file) as fta: files_to_archive = json.loads(fta.read()) assert files_to_archive == { "model.text_field_embedder.token_embedders.elmo.options_file": str( pathlib.Path("allennlp") / "tests" / "fixtures" / "elmo" / "options.json"), "model.text_field_embedder.token_embedders.elmo.weight_file": str( pathlib.Path("allennlp") / "tests" / "fixtures" / "elmo" / "lm_weights.hdf5"), } # Check that the unarchived contents of those files match the original contents. for key, original_filename in files_to_archive.items(): new_filename = os.path.join(unarchive_dir, "fta", key) assert filecmp.cmp(original_filename, new_filename)
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "") -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        Overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    archive_file = cached_path(archive_file)

    # Extract archive to temp dir
    tempdir = tempfile.mkdtemp()
    logger.info("extracting archive file %s to temp dir %s", archive_file, tempdir)
    with tarfile.open(archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)

    # Load config
    config = Params.from_file(os.path.join(tempdir, _CONFIG_NAME), overrides)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=os.path.join(tempdir, _WEIGHTS_NAME),
                       serialization_dir=tempdir,
                       cuda_device=cuda_device)

    # Clean up temp dir
    shutil.rmtree(tempdir)

    return Archive(model=model, config=config)
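# Hedged usage of load_archive above; the archive path is an assumption.
archive = load_archive("runs/bidaf/model.tar.gz", cuda_device=-1)
model = archive.model     # the trained model, ready for evaluation
config = archive.config   # the Params the model was trained with
model.eval()              # disable dropout etc. before predicting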
def test_file_archiving(self): # This happens to be a good place to test auxiliary file archiving. # Train the model params = Params.from_file(self.FIXTURES_ROOT / u'elmo' / u'config' / u'characters_token_embedder.json') serialization_dir = os.path.join(self.TEST_DIR, u'serialization') train_model(params, serialization_dir) # Inspect the archive archive_file = os.path.join(serialization_dir, u'model.tar.gz') unarchive_dir = os.path.join(self.TEST_DIR, u'unarchive') with tarfile.open(archive_file, u'r:gz') as archive: archive.extractall(unarchive_dir) # It should contain `files_to_archive.json` fta_file = os.path.join(unarchive_dir, u'files_to_archive.json') assert os.path.exists(fta_file) # Which should properly contain { flattened_key -> original_filename } with open(fta_file) as fta: files_to_archive = json.loads(fta.read()) assert files_to_archive == { u'model.text_field_embedder.token_embedders.elmo.options_file': unicode( pathlib.Path(u'allennlp') / u'tests' / u'fixtures' / u'elmo' / u'options.json'), u'model.text_field_embedder.token_embedders.elmo.weight_file': unicode( pathlib.Path(u'allennlp') / u'tests' / u'fixtures' / u'elmo' / u'lm_weights.hdf5'), } # Check that the unarchived contents of those files match the original contents. for key, original_filename in list(files_to_archive.items()): new_filename = os.path.join(unarchive_dir, u"fta", key) assert filecmp.cmp(original_filename, new_filename)
def run_uncertainty_experiment_from_file( experiment_filename: PathLike, serialization_dir: Optional[Union[str, PathLike]] = None, recover: Optional[bool] = False, force: Optional[bool] = False, train_only: Optional[bool] = False): """ A wrapper around `run_uncertainty_experiment` which loads the params from a file. """ experiment_name = os.path.splitext( os.path.basename(experiment_filename))[0] if not serialization_dir: serialization_dir = os.path.join(Config.serialization_base_dir, experiment_name) params = Params.from_file(experiment_filename) run_uncertainty_experiment(params=params, name=experiment_name, serialization_dir=serialization_dir, recover=recover, force=force, train_only=train_only)
def test_mapper_iris(self, iris_dataframe) -> None: params = Params.from_file(self.FIXTURES_ROOT / "mapper" / "mapper_iris.jsonnet") mapper = DataFrameMapper.from_params(params=params) features = mapper.features assert features[0] == (["sepal length (cm)"], None, {}) assert features[1] == (["sepal width (cm)"], None, {}) assert features[2] == (["petal length (cm)"], None, {}) assert features[3] == (["petal width (cm)"], None, {}) assert features[4][0] == ["species"] assert isinstance(features[4][1][0], FlattenTransformer) assert isinstance(features[4][1][1], LabelEncoder) assert features[4][2] == {} mapper.fit_transform(iris_dataframe) assert mapper.transformed_names_ == [ "sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)", "species", ]
def test_load_uds_to_ud(setup): # load ud states uds_checkpoint_path = os.path.join(test_path, "checkpoints", "overfit_interface_encoder_side.ckpt", "best.th") uds_state_dict = torch.load(uds_checkpoint_path) # create a ud model decomp_checkpoint_path = os.path.join(test_path, "checkpoints", "overfit_syntax_only.ckpt") config_path = os.path.join(test_path, "configs", "load_weights_uds_to_ud.jsonnet") params = Params.from_file(config_path) params["model"]["pretrained_weights"] = uds_checkpoint_path model = Model.load(params, decomp_checkpoint_path) model.load_partial(uds_checkpoint_path) # compare state dicts key = "biaffine_parser.edge_type_query_linear.weight" loaded_weights = model.state_dict()[key].data.numpy() saved_weights = uds_state_dict[key].data.numpy() # assert same assert (np.abs(np.sum(loaded_weights - saved_weights)) < TOL)
def ensure_model_can_train_save_and_load(self, param_file: str): save_dir = os.path.join(self.TEST_DIR, "save_and_load_test") archive_file = os.path.join(save_dir, "model.tar.gz") model = train_model_from_file(param_file, save_dir) loaded_model = load_archive(archive_file).model state_keys = model.state_dict().keys() loaded_state_keys = loaded_model.state_dict().keys() assert state_keys == loaded_state_keys # First we make sure that the state dict (the parameters) are the same for both models. for key in state_keys: assert_allclose(model.state_dict()[key].numpy(), loaded_model.state_dict()[key].numpy(), err_msg=key) params = Params.from_file(self.param_file) reader = DatasetReader.from_params(params['dataset_reader']) iterator = DataIterator.from_params(params['iterator']) # We'll check that even if we index the dataset with each model separately, we still get # the same result out. model_dataset = reader.read(params['validation_data_path']) model_dataset.index_instances(model.vocab) model_batch_arrays = next(iterator(model_dataset, shuffle=False)) model_batch = arrays_to_variables(model_batch_arrays, for_training=False) loaded_dataset = reader.read(params['validation_data_path']) loaded_dataset.index_instances(loaded_model.vocab) loaded_batch_arrays = next(iterator(loaded_dataset, shuffle=False)) loaded_batch = arrays_to_variables(loaded_batch_arrays, for_training=False) # The datasets themselves should be identical. for key in model_batch.keys(): field = model_batch[key] if isinstance(field, dict): for subfield in field: self.assert_fields_equal(model_batch[key][subfield], loaded_batch[key][subfield], tolerance=1e-6, name=key + '.' + subfield) else: self.assert_fields_equal(model_batch[key], loaded_batch[key], 1e-6, key) # Set eval mode, to turn off things like dropout, then get predictions. model.eval() loaded_model.eval() # Models with stateful RNNs need their states reset to have consistent # behavior after loading. for model_ in [model, loaded_model]: for module in model_.modules(): if hasattr(module, 'stateful') and module.stateful: module.reset_states() model_predictions = model.forward(**model_batch) loaded_model_predictions = loaded_model.forward(**loaded_batch) # Check loaded model's loss exists and we can compute gradients, for continuing training. loaded_model_loss = loaded_model_predictions["loss"] assert loaded_model_loss is not None loaded_model_loss.backward() # Both outputs should have the same keys and the values for these keys should be close. for key in model_predictions.keys(): self.assert_fields_equal(model_predictions[key], loaded_model_predictions[key], tolerance=1e-4, name=key) return model, loaded_model
def ensure_model_can_train_save_and_load(
    self,
    param_file: str,
    tolerance: float = 1e-4,
    cuda_device: int = -1,
    gradients_to_ignore: Set[str] = None,
    overrides: str = "",
    metric_to_check: str = None,
    metric_terminal_value: float = None,
    metric_tolerance: float = 1e-4,
    disable_dropout: bool = True,
):
    """
    # Parameters

    param_file : `str`
        Path to a training configuration file that we will use to train the model for this
        test.
    tolerance : `float`, optional (default=1e-4)
        When comparing model predictions between the originally-trained model and the model
        after saving and loading, we will use this tolerance value (passed as `rtol` to
        `numpy.testing.assert_allclose`).
    cuda_device : `int`, optional (default=-1)
        The device to run the test on.
    gradients_to_ignore : `Set[str]`, optional (default=None)
        This test runs a gradient check to make sure that we're actually computing gradients
        for all of the parameters in the model.  If you really want to ignore certain
        parameters when doing that check, you can pass their names here.  This is not
        recommended unless you're `really` sure you don't need to have non-zero gradients for
        those parameters (e.g., some of the beam search / state machine models have
        infrequently-used parameters that are hard to force the model to use in a small test).
    overrides : `str`, optional (default = "")
        A JSON string that we will use to override values in the input parameter file.
    metric_to_check : `str`, optional (default = None)
        We may want to automatically perform a check that the model reaches a given metric
        when training (on the validation set, if it is specified).  It may be useful in CI,
        for example.  You can pass any metric that your model returns.
    metric_terminal_value : `float`, optional (default = None)
        When you set `metric_to_check`, you need to set the value this metric must converge to.
    metric_tolerance : `float`, optional (default=1e-4)
        Tolerance to check your model metric against the metric terminal value.  One can expect
        some variance in model metrics when the training process is highly stochastic.
    disable_dropout : `bool`, optional (default = True)
        If True we will set all dropout to 0 before checking gradients.  (Otherwise, with small
        datasets, you may get zero gradients because of unlucky dropout.)
    """
    save_dir = self.TEST_DIR / "save_and_load_test"
    archive_file = save_dir / "model.tar.gz"
    model = train_model_from_file(param_file, save_dir, overrides=overrides)
    metrics_file = save_dir / "metrics.json"
    if metric_to_check is not None:
        metrics = json.loads(metrics_file.read_text())
        metric_value = metrics.get(f"best_validation_{metric_to_check}") or metrics.get(
            f"training_{metric_to_check}"
        )
        assert metric_value is not None, f"Cannot find {metric_to_check} in metrics.json file"
        assert metric_terminal_value is not None, "Please specify metric terminal value"
        assert abs(metric_value - metric_terminal_value) < metric_tolerance
    loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
    state_keys = model.state_dict().keys()
    loaded_state_keys = loaded_model.state_dict().keys()
    assert state_keys == loaded_state_keys
    # First we make sure that the state dict (the parameters) are the same for both models.
    for key in state_keys:
        assert_allclose(
            model.state_dict()[key].cpu().numpy(),
            loaded_model.state_dict()[key].cpu().numpy(),
            err_msg=key,
        )
    params = Params.from_file(param_file, params_overrides=overrides)
    reader = DatasetReader.from_params(params["dataset_reader"])

    print("Reading with original model")
    model_dataset = reader.read(params["validation_data_path"])
    model_dataset.index_with(model.vocab)

    print("Reading with loaded model")
    loaded_dataset = reader.read(params["validation_data_path"])
    loaded_dataset.index_with(loaded_model.vocab)

    # Need to duplicate params because DataLoader.from_params will consume.
    data_loader_params = params["data_loader"]
    data_loader_params["shuffle"] = False
    data_loader_params2 = Params(copy.deepcopy(data_loader_params.as_dict()))

    data_loader = DataLoader.from_params(dataset=model_dataset, params=data_loader_params)
    data_loader2 = DataLoader.from_params(dataset=loaded_dataset, params=data_loader_params2)

    # We'll check that even if we index the dataset with each model separately, we still get
    # the same result out.
    model_batch = next(iter(data_loader))
    loaded_batch = next(iter(data_loader2))

    # Check gradients are None for non-trainable parameters and check that
    # trainable parameters receive some gradient if they are trainable.
    self.check_model_computes_gradients_correctly(
        model, model_batch, gradients_to_ignore, disable_dropout
    )

    # The datasets themselves should be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for key in model_batch.keys():
        self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()

    # Models with stateful RNNs need their states reset to have consistent
    # behavior after loading.
    for model_ in [model, loaded_model]:
        for module in model_.modules():
            if hasattr(module, "stateful") and module.stateful:
                module.reset_states()
    print("Predicting with original model")
    model_predictions = model(**model_batch)
    print("Predicting with loaded model")
    loaded_model_predictions = loaded_model(**loaded_batch)

    # Check loaded model's loss exists and we can compute gradients, for continuing training.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values for these keys should be close.
    for key in model_predictions.keys():
        self.assert_fields_equal(
            model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance
        )

    return model, loaded_model
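# Hedged sketch of how a concrete model test typically drives the helper above,
# following the ModelTestCase pattern visible elsewhere in this file; the
# fixture paths and class name are assumptions.
class MyModelTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        self.set_up_model(self.FIXTURES_ROOT / "my_model" / "experiment.json",
                          self.FIXTURES_ROOT / "data" / "sample.json")

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)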
def _load_archive(archive_file: str, adapters_dir: str, cuda_device: int = -1, overrides: str = "", weights_file: str = None): """ Instantiates an Archive from an archived `tar.gz` file. Parameters ---------- archive_file: ``str`` The archive file to load the model from. weights_file: ``str``, optional (default = None) The weights file to use. If unspecified, weights.th in the archive_file will be used. cuda_device: ``int``, optional (default = -1) If `cuda_device` is >= 0, the model will be loaded onto the corresponding GPU. Otherwise it will be loaded onto the CPU. overrides: ``str``, optional (default = "") JSON overrides to apply to the unarchived ``Params`` object. """ # redirect to the cache, if necessary resolved_archive_file = cached_path(archive_file) if resolved_archive_file == archive_file: logger.info(f"loading archive file {archive_file}") else: logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}") if os.path.isdir(resolved_archive_file): serialization_dir = resolved_archive_file else: # Extract archive to temp dir tempdir = tempfile.mkdtemp() logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}") with tarfile.open(resolved_archive_file, 'r:gz') as archive: archive.extractall(tempdir) # Postpone cleanup until exit in case the unarchived contents are needed outside # this function. atexit.register(_cleanup_archive_dir, tempdir) serialization_dir = tempdir # Check for supplemental files in archive fta_filename = os.path.join(serialization_dir, "files_to_archive.json") if os.path.exists(fta_filename): with open(fta_filename, 'r') as fta_file: files_to_archive = json.loads(fta_file.read()) # Add these replacements to overrides replacements_dict: Dict[str, Any] = {} for key, original_filename in files_to_archive.items(): replacement_filename = os.path.join(serialization_dir, f"fta/{key}") if os.path.exists(replacement_filename): replacements_dict[key] = replacement_filename else: logger.warning(f"Archived file {replacement_filename} not found! At train time " f"this file was located at {original_filename}. This may be " "because you are loading a serialization directory. Attempting to " "load the file from its train-time location.") overrides_dict = parse_overrides(overrides) combined_dict = with_fallback(preferred=overrides_dict, fallback=unflatten(replacements_dict)) overrides = json.dumps(combined_dict) # Load config config = Params.from_file(os.path.join(serialization_dir, "config.json"), overrides) config.loading_from_archive = True if weights_file: weights_path = weights_file else: weights_path = os.path.join(serialization_dir, "weights.th") # Fallback for serialization directories. if not os.path.exists(weights_path): weights_path = os.path.join(serialization_dir, "best.th") # Instantiate model. Use a duplicate of the config, as it will get consumed. model = _load(config.duplicate(), adapters_dir=adapters_dir, weights_file=weights_path, serialization_dir=serialization_dir, cuda_device=cuda_device) return Archive(model=model, config=config)
trainer.train() # Now tar up results archive_model(serialization_dir) return model if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( 'param_path', type=str, help='path to parameter file describing the model to be trained') parser.add_argument("logdir", type=str) parser.add_argument("--filtering", type=str, default=None) parser.add_argument("--cuda-device", type=int, default=None, help='id of GPU to use (if any)') args = parser.parse_args() params = Params.from_file(args.param_path) train_model(params, args.cuda_device, args.logdir, args.filtering)
import os
import tempfile
from typing import Any, Dict, List, Optional, Tuple

from allennlp.commands.evaluate import evaluate
from allennlp.commands.train import train_model
from allennlp.common import Params
from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.iterators import DataIterator
from allennlp.models.archival import load_archive
from allennlp.models.model import Model
from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder
from allennlp.nn import InitializerApplicator, util, RegularizerApplicator


@Model.register('Seq2IdxSumInit')
class Seq2IdxSum(Model):
    def __init__(
            self,
            vocab: Vocabulary,
            text_field_embedder: TextFieldEmbedder,
            encoder: Seq2SeqEncoder,
            initializer: InitializerApplicator = InitializerApplicator(),
            regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super(Seq2IdxSum, self).__init__(vocab, regularizer)


if __name__ == '__main__':
    root = "/scratch/cluster/jcxu/exComp/"
    dataset_dir = "/scratch/cluster/jcxu/data/SegSum/abc/"
    jsonnet_file = os.path.join(root, 'neusum/remade/model.jsonnet')
    params = Params.from_file(jsonnet_file)

    serialization_dir = tempfile.mkdtemp(prefix=os.path.join(root, 'tmp_exps'))
    model = train_model(params, serialization_dir)
configs = [] if not args.resume: serialization_dir = os.path.join( "logs", log_dir_name, datetime.datetime.now().strftime("%Y.%m.%d_%H.%M.%S")) overrides = {} if args.device is not None: overrides["trainer"] = {"cuda_device": args.device} if args.lazy is not None: overrides["dataset_reader"] = {"lazy": args.lazy} configs.append(Params(overrides)) for config_file in args.config: configs.append(Params.from_file(config_file)) configs.append(Params.from_file(args.base_config)) else: serialization_dir = args.resume configs.append( Params.from_file(os.path.join(serialization_dir, "config.json"))) train_params = util.merge_configs(configs) if "vocabulary" in train_params: # Remove this key to make AllenNLP happy train_params["vocabulary"].pop("non_padded_namespaces", None) predict_params = train_params.duplicate() import_submodules("udify")
def ensure_model_can_train_save_and_load(self, param_file: str, tolerance: float = 1e-4, cuda_device: int = -1, gradients_to_ignore: Set[str] = None, overrides: str = ""): """ Parameters ---------- param_file : ``str`` Path to a training configuration file that we will use to train the model for this test. tolerance : ``float``, optional (default=1e-4) When comparing model predictions between the originally-trained model and the model after saving and loading, we will use this tolerance value (passed as ``rtol`` to ``numpy.testing.assert_allclose``). cuda_device : ``int``, optional (default=-1) The device to run the test on. gradients_to_ignore : ``Set[str]``, optional (default=None) This test runs a gradient check to make sure that we're actually computing gradients for all of the parameters in the model. If you really want to ignore certain parameters when doing that check, you can pass their names here. This is not recommended unless you're `really` sure you don't need to have non-zero gradients for those parameters (e.g., some of the beam search / state machine models have infrequently-used parameters that are hard to force the model to use in a small test). overrides : ``str``, optional (default = "") A JSON string that we will use to override values in the input parameter file. """ save_dir = self.TEST_DIR / "save_and_load_test" archive_file = save_dir / "model.tar.gz" model = train_model_from_file(param_file, save_dir, overrides=overrides) loaded_model = load_archive(archive_file, cuda_device=cuda_device).model state_keys = model.state_dict().keys() loaded_state_keys = loaded_model.state_dict().keys() assert state_keys == loaded_state_keys # First we make sure that the state dict (the parameters) are the same for both models. for key in state_keys: assert_allclose(model.state_dict()[key].cpu().numpy(), loaded_model.state_dict()[key].cpu().numpy(), err_msg=key) params = Params.from_file(param_file) reader = DatasetReader.from_params(params['dataset_reader']) # Need to duplicate params because Iterator.from_params will consume. iterator_params = params['iterator'] iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict())) iterator = DataIterator.from_params(iterator_params) iterator2 = DataIterator.from_params(iterator_params2) # We'll check that even if we index the dataset with each model separately, we still get # the same result out. model_dataset = reader.read(params['validation_data_path']) iterator.index_with(model.vocab) model_batch = next(iterator(model_dataset, shuffle=False)) loaded_dataset = reader.read(params['validation_data_path']) iterator2.index_with(loaded_model.vocab) loaded_batch = next(iterator2(loaded_dataset, shuffle=False)) # Check gradients are None for non-trainable parameters and check that # trainable parameters receive some gradient if they are trainable. self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore) # The datasets themselves should be identical. assert model_batch.keys() == loaded_batch.keys() for key in model_batch.keys(): self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6) # Set eval mode, to turn off things like dropout, then get predictions. model.eval() loaded_model.eval() # Models with stateful RNNs need their states reset to have consistent # behavior after loading. 
for model_ in [model, loaded_model]: for module in model_.modules(): if hasattr(module, 'stateful') and module.stateful: module.reset_states() model_predictions = model(**model_batch) loaded_model_predictions = loaded_model(**loaded_batch) # Check loaded model's loss exists and we can compute gradients, for continuing training. loaded_model_loss = loaded_model_predictions["loss"] assert loaded_model_loss is not None loaded_model_loss.backward() # Both outputs should have the same keys and the values for these keys should be close. for key in model_predictions.keys(): self.assert_fields_equal(model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance) return model, loaded_model
def load_archive(archive_file: str, cuda_device: int = -1, overrides: str = "", weights_file: str = None) -> Archive: """ Instantiates an Archive from an archived `tar.gz` file. Parameters ---------- archive_file: ``str`` The archive file to load the model from. weights_file: ``str``, optional (default = None) The weights file to use. If unspecified, weights.th in the archive_file will be used. cuda_device: ``int``, optional (default = -1) If `cuda_device` is >= 0, the model will be loaded onto the corresponding GPU. Otherwise it will be loaded onto the CPU. overrides: ``str``, optional (default = "") HOCON overrides to apply to the unarchived ``Params`` object. """ # redirect to the cache, if necessary archive_file = cached_path(archive_file) tempdir = None if os.path.isdir(archive_file): serialization_dir = archive_file else: # Extract archive to temp dir tempdir = tempfile.mkdtemp() logger.info("extracting archive file %s to temp dir %s", archive_file, tempdir) with tarfile.open(archive_file, 'r:gz') as archive: archive.extractall(tempdir) serialization_dir = tempdir # Check for supplemental files in archive fta_filename = os.path.join(serialization_dir, _FTA_NAME) if os.path.exists(fta_filename): with open(fta_filename, 'r') as fta_file: files_to_archive = json.loads(fta_file.read()) # Add these replacements to overrides replacement_hocon = pyhocon.ConfigTree(root=True) for key, _ in files_to_archive.items(): replacement_filename = os.path.join(serialization_dir, f"fta/{key}") replacement_hocon.put(key, replacement_filename) overrides_hocon = pyhocon.ConfigFactory.parse_string(overrides) combined_hocon = replacement_hocon.with_fallback(overrides_hocon) overrides = json.dumps(combined_hocon) # Load config config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides) config.loading_from_archive = True if weights_file: weights_path = weights_file else: weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME) # Instantiate model. Use a duplicate of the config, as it will get consumed. model = Model.load(config.duplicate(), weights_file=weights_path, serialization_dir=serialization_dir, cuda_device=cuda_device) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) return Archive(model=model, config=config)
def ensure_model_can_train_save_and_load(
    self,
    param_file: Union[PathLike, str],
    tolerance: float = 1e-4,
    cuda_device: int = -1,
    gradients_to_ignore: Set[str] = None,
    overrides: str = "",
    metric_to_check: str = None,
    metric_terminal_value: float = None,
    metric_tolerance: float = 1e-4,
    disable_dropout: bool = True,
):
    save_dir = self.TEST_DIR / "save_and_load_test"
    archive_file = save_dir / "model.tar.gz"
    model = train_model_from_file(param_file, save_dir, overrides=overrides)
    metrics_file = save_dir / "metrics.json"
    if metric_to_check is not None:
        metrics = json.loads(metrics_file.read_text())
        metric_value = metrics.get(f"best_validation_{metric_to_check}") or metrics.get(
            f"training_{metric_to_check}"
        )
        assert metric_value is not None, f"Cannot find {metric_to_check} in metrics.json file"
        assert metric_terminal_value is not None, "Please specify metric terminal value"
        assert abs(metric_value - metric_terminal_value) < metric_tolerance
    loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
    state_keys = model.state_dict().keys()
    loaded_state_keys = loaded_model.state_dict().keys()
    assert state_keys == loaded_state_keys
    # The state dict (the parameters) should be the same for both models.
    for key in state_keys:
        assert_allclose(
            model.state_dict()[key].cpu().numpy(),
            loaded_model.state_dict()[key].cpu().numpy(),
            err_msg=key,
        )
    params = Params.from_file(param_file, params_overrides=overrides)
    reader = DatasetReader.from_params(params["dataset_reader"])

    print("Reading with original model")
    model_dataset = reader.read(params["validation_data_path"])
    model_dataset.index_with(model.vocab)

    print("Reading with loaded model")
    loaded_dataset = reader.read(params["validation_data_path"])
    loaded_dataset.index_with(loaded_model.vocab)

    # Need to duplicate params because DataLoader.from_params will consume.
    data_loader_params = params["data_loader"]
    data_loader_params["shuffle"] = False
    data_loader_params2 = Params(copy.deepcopy(data_loader_params.as_dict()))

    data_loader = DataLoader.from_params(dataset=model_dataset, params=data_loader_params)
    data_loader2 = DataLoader.from_params(dataset=loaded_dataset, params=data_loader_params2)

    model_batch = next(iter(data_loader))
    loaded_batch = next(iter(data_loader2))

    self.check_model_computes_gradients_correctly(
        model, model_batch, gradients_to_ignore, disable_dropout
    )

    # The datasets themselves should be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for key in model_batch.keys():
        self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()

    # Models with stateful RNNs need their states reset to have consistent
    # behavior after loading.
    for model_ in [model, loaded_model]:
        for module in model_.modules():
            if hasattr(module, "stateful") and module.stateful:
                module.reset_states()
    print("Predicting with original model")
    model_predictions = model(**model_batch)
    print("Predicting with loaded model")
    loaded_model_predictions = loaded_model(**loaded_batch)

    # Check loaded model's loss exists and we can compute gradients, for continuing training.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values for these keys should be close.
    for key in model_predictions.keys():
        self.assert_fields_equal(
            model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance
        )

    return model, loaded_model
def ensure_model_can_train_save_and_load(self, param_file: str, tolerance: float = 1e-4, cuda_device: int = -1): save_dir = os.path.join(self.TEST_DIR, "save_and_load_test") archive_file = os.path.join(save_dir, "model.tar.gz") model = train_model_from_file(param_file, save_dir) loaded_model = load_archive(archive_file, cuda_device=cuda_device).model state_keys = model.state_dict().keys() loaded_state_keys = loaded_model.state_dict().keys() assert state_keys == loaded_state_keys # First we make sure that the state dict (the parameters) are the same for both models. for key in state_keys: assert_allclose(model.state_dict()[key].cpu().numpy(), loaded_model.state_dict()[key].cpu().numpy(), err_msg=key) params = Params.from_file(self.param_file) reader = DatasetReader.from_params(params['dataset_reader']) # Need to duplicate params because Iterator.from_params will consume. iterator_params = params['iterator'] iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict())) iterator = DataIterator.from_params(iterator_params) iterator2 = DataIterator.from_params(iterator_params2) # We'll check that even if we index the dataset with each model separately, we still get # the same result out. model_dataset = reader.read(params['validation_data_path']) iterator.index_with(model.vocab) model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device)) loaded_dataset = reader.read(params['validation_data_path']) iterator2.index_with(loaded_model.vocab) loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device)) # Check gradients are None for non-trainable parameters and check that # trainable parameters receive some gradient if they are trainable. self.check_model_computes_gradients_correctly(model, model_batch) # The datasets themselves should be identical. assert model_batch.keys() == loaded_batch.keys() for key in model_batch.keys(): self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6) # Set eval mode, to turn off things like dropout, then get predictions. model.eval() loaded_model.eval() # Models with stateful RNNs need their states reset to have consistent # behavior after loading. for model_ in [model, loaded_model]: for module in model_.modules(): if hasattr(module, 'stateful') and module.stateful: module.reset_states() model_predictions = model(**model_batch) loaded_model_predictions = loaded_model(**loaded_batch) # Check loaded model's loss exists and we can compute gradients, for continuing training. loaded_model_loss = loaded_model_predictions["loss"] assert loaded_model_loss is not None loaded_model_loss.backward() # Both outputs should have the same keys and the values for these keys should be close. for key in model_predictions.keys(): self.assert_fields_equal(model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance) return model, loaded_model
LogHelper.get_logger("allennlp.training.trainer") LogHelper.get_logger(__name__) parser = argparse.ArgumentParser() parser.add_argument('db', type=str, help='/path/to/saved/db.db') parser.add_argument( 'param_path', type=str, help='path to parameter file describing the model to be trained') parser.add_argument("logdir", type=str) parser.add_argument("--filtering", type=str, default=None) parser.add_argument("--cuda-device", type=int, default=None, help='id of GPU to use (if any)') parser.add_argument( '-o', '--overrides', type=str, default="", help='a HOCON structure used to override the experiment configuration') args = parser.parse_args() db = FeverDocDB(args.db) params = Params.from_file(args.param_path, args.overrides) train_model(db, params, args.cuda_device, args.logdir, args.filtering)
def merge_configs(params_config_path, datasets_config_path):
    params_config = Params.from_file(params_config_path)
    datasets_config = Params.from_file(datasets_config_path)

    # To support reading from multiple files we pass the datasets to the dataset reader
    # constructor instead; the two placeholder paths below are just there to make
    # AllenNLP happy.
    params_config['train_data_path'] = 'TRAINPLACEHOLDER'
    params_config['validation_data_path'] = 'DEVPLACEHOLDER'

    params_config['dataset_reader']['datasets'] = datasets_config.as_dict()

    ordered_stuff = {}
    new_decoders = {}
    for dataset in datasets_config:
        for task in datasets_config[dataset]['tasks']:
            # Start out with the default decoder.
            task_decoder = copy.deepcopy(params_config['model']['decoders']['default'].as_dict())

            # Add task_type defaults.
            task_type = datasets_config[dataset]['tasks'][task]['task_type']
            if task_type not in params_config['model']['decoders']:
                tasks_list = [task_str for task_str in params_config['model']['decoders']]
                tasks_list.remove('default')
                logger.error('Task type ' + task_type + " is not supported, please use one of " +
                             str(tasks_list))
            task_decoder.update(params_config['model']['decoders'][task_type].as_dict())

            # Add anything that is defined in dataset_config.
            task_decoder.update(datasets_config[dataset]['tasks'][task].as_dict())

            # Add the name of the task to the task itself (used to log metrics).
            task_decoder['task'] = task

            # Used to create an ordered list later.
            ordered_stuff[task] = [task_decoder['order'], task_type]

            # Remove items only used in the data reader, and items saved in ordered_stuff.
            for item in ['column_idx', 'task_type', 'order']:
                if item in task_decoder:
                    del task_decoder[item]

            new_decoders[task] = task_decoder

        if 'max_sents' not in datasets_config[dataset] and params_config['model']['default_max_sents'] != 0:
            params_config['dataset_reader']['datasets'][dataset]['max_sents'] = \
                params_config['model']['default_max_sents']

    if 'default_max_sents' in params_config['model']:
        del params_config['model']['default_max_sents']

    params_config['model']['decoders'] = new_decoders

    # Generate ordered lists of tasks and task types, which the machamp model uses to
    # decide which order to process tasks in.
    ordered_tasks = []
    ordered_task_types = []
    no_padding = []
    for label, _ in sorted(ordered_stuff.items(), key=lambda item: item[1]):
        ordered_tasks.append(label)
        ordered_task_types.append(ordered_stuff[label][1])
        if ordered_stuff[label][1] == 'dependency':
            no_padding.append(label + '_rels')
            no_padding.append(label + '_head_indices')
        else:
            no_padding.append(label)
    # TODO: might want to add seq2seq here as well?
    params_config['model']['tasks'] = ordered_tasks
    params_config['model']['task_types'] = ordered_task_types
    params_config['vocabulary'] = {'non_padded_namespaces': ['dataset']}
    # params_config['vocabulary'] = {'non_padded_namespaces': no_padding + ['dataset', 'src_tokens']}

    return params_config
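# Hypothetical call to merge_configs (both config paths are placeholders); Params.to_file
# can then serialize the merged configuration for inspection or for the train command.
merged = merge_configs("configs/params.json", "configs/ewt_datasets.json")
merged.to_file("logs/full_config.json")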
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    evaluation_data_path = data if data else config["validation_data_path"]

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"),
                           cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    dataset = list(dataset_reader.read(evaluation_data_path))

    with torch.autograd.no_grad():
        loader = SimpleDataLoader(dataset, 32)
        # Index with the model's vocabulary so the loader can tensorize the instances.
        loader.index_with(model.vocab)
        model_predictions: List[List[str]] = []
        for batch in Tqdm.tqdm(loader):
            batch = move_to_device(batch, device)
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

    for instance, prediction in zip(dataset, model_predictions):
        fields = instance.fields
        verb_index = fields["metadata"]["verb_index"]
        gold_tags = fields["metadata"]["gold_tags"]
        sentence = fields["metadata"]["words"]
        write_to_conll_eval_file(prediction_file, gold_file, verb_index, sentence,
                                 prediction, gold_tags)
    prediction_file.close()
    gold_file.close()
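# Hypothetical driver for the evaluation entry point above (placeholder directory);
# passing data=None falls back to the validation set recorded in config.json.
if __name__ == "__main__":
    main(serialization_directory="runs/srl_model",
         device=-1,
         data=None,
         prefix="dev")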
def test_model_load(self):
    params = Params.from_file('tests/fixtures/decomposable_attention/experiment.json')
    model = Model.load(params,
                       serialization_dir='tests/fixtures/decomposable_attention/serialization')

    assert isinstance(model, DecomposableAttention)
        targets = []
        for target_sequence in instance.fields['target_action_sequences'].field_list:
            targets.append([])
            for target_index_field in target_sequence.field_list:
                targets[-1].append(action_map[target_index_field.sequence_index])
        json_obj['target_action_sequences'] = targets

    json_obj['example_lisp_string'] = instance.fields['example_lisp_string'].metadata

    entity_texts = []
    for entity_text in instance.fields['table'].entity_texts:
        tokens = [{'text': token.text, 'lemma': token.lemma_} for token in entity_text]
        entity_texts.append(tokens)
    json_obj['entity_texts'] = entity_texts
    json_obj['linking_features'] = instance.fields['table'].linking_features
    return json.dumps(json_obj)


if __name__ == '__main__':
    param_file = sys.argv[1]
    outdir = 'wikitables_preprocessed_data/'
    params = Params.from_file(param_file)
    main(params, outdir)
def ensure_model_can_train_save_and_load(self,
                                         param_file: str,
                                         tolerance: float = 1e-4,
                                         cuda_device: int = -1,
                                         gradients_to_ignore: Set[str] = None,
                                         overrides: str = ""):
    """
    Parameters
    ----------
    param_file : ``str``
        Path to a training configuration file that we will use to train the model for this
        test.
    tolerance : ``float``, optional (default=1e-4)
        When comparing model predictions between the originally-trained model and the model
        after saving and loading, we will use this tolerance value (passed as ``rtol`` to
        ``numpy.testing.assert_allclose``).
    cuda_device : ``int``, optional (default=-1)
        The device to run the test on.
    gradients_to_ignore : ``Set[str]``, optional (default=None)
        This test runs a gradient check to make sure that we're actually computing gradients
        for all of the parameters in the model.  If you really want to ignore certain
        parameters when doing that check, you can pass their names here.  This is not
        recommended unless you're `really` sure you don't need to have non-zero gradients for
        those parameters (e.g., some of the beam search / state machine models have
        infrequently-used parameters that are hard to force the model to use in a small test).
    overrides : ``str``, optional (default = "")
        A JSON string that we will use to override values in the input parameter file.
    """
    save_dir = self.TEST_DIR / "save_and_load_test"
    archive_file = save_dir / "model.tar.gz"
    model = train_model_from_file(param_file, save_dir, overrides=overrides)
    loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
    state_keys = model.state_dict().keys()
    loaded_state_keys = loaded_model.state_dict().keys()
    assert state_keys == loaded_state_keys
    # First we make sure that the state dict (the parameters) are the same for both models.
    for key in state_keys:
        assert_allclose(model.state_dict()[key].cpu().numpy(),
                        loaded_model.state_dict()[key].cpu().numpy(),
                        err_msg=key)
    params = Params.from_file(param_file)
    # Need to duplicate params because DatasetReader.from_params will consume them.
    reader_params = params['dataset_reader']
    reader_params2 = Params(copy.deepcopy(reader_params.as_dict()))
    reader = DatasetReader.from_params(reader_params)
    reader2 = DatasetReader.from_params(reader_params2)
    # Need to duplicate params because DataIterator.from_params will consume them.
    iterator_params = params['iterator']
    iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))
    iterator = DataIterator.from_params(iterator_params)
    iterator2 = DataIterator.from_params(iterator_params2)

    # We'll check that even if we index the dataset with each model separately, we still get
    # the same result out.  We fix the seeds before each read so both readers see the same
    # data order.
    seed_params = Params({"random_seed": 5, "numpy_seed": 5, "pytorch_seed": 5})
    prepare_environment(seed_params)
    model_dataset = reader.read(params['validation_data_path'])
    iterator.index_with(model.vocab)
    model_batch = next(iterator(model_dataset, shuffle=False))

    seed_params = Params({"random_seed": 5, "numpy_seed": 5, "pytorch_seed": 5})
    prepare_environment(seed_params)
    loaded_dataset = reader2.read(params['validation_data_path'])
    iterator2.index_with(loaded_model.vocab)
    loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

    # Check that gradients are None for non-trainable parameters and that trainable
    # parameters receive some gradient.
    self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

    # The datasets themselves should be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for key in model_batch.keys():
        self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()
    # Models with stateful RNNs need their states reset to have consistent
    # behavior after loading.
    for model_ in [model, loaded_model]:
        for module in model_.modules():
            if hasattr(module, 'stateful') and module.stateful:
                module.reset_states()
    model_predictions = model(**model_batch)
    loaded_model_predictions = loaded_model(**loaded_batch)

    # Check that the loaded model's loss exists and that we can compute gradients from it,
    # for continuing training.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values for these keys should be close.
    for key in model_predictions.keys():
        self.assert_fields_equal(model_predictions[key],
                                 loaded_model_predictions[key],
                                 name=key,
                                 tolerance=tolerance)

    return model, loaded_model
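# A hedged sketch of the optional arguments in use (hypothetical test class, fixture
# paths, and parameter name): one hard-to-reach parameter is excluded from the gradient
# check, and training is shortened through an overrides string.
from allennlp.common.testing import ModelTestCase

class BeamSearchModelTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        self.set_up_model('tests/fixtures/beam_search_model/experiment.json',
                          'tests/fixtures/data/beam_search_data.json')

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(
            self.param_file,
            gradients_to_ignore={'_decoder_cell.bias_hh'},   # placeholder parameter name
            overrides='{"trainer": {"num_epochs": 1}}')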
cuda_device = -1
library = 'library'
# files = ['./Final Task/Test/SemEval2017-task3-English-test-input.xml']
files = ['./Final Task/dev/SemEval2016-Task3-CQA-QL-dev.xml']
attn = "_cos_hyper"
calculate_map = True
write_file = False
Wfile_name = "out.txt"

import_submodules(library)
model_config = "config/%s_eval.jsonnet" % model_name
overrides = json.dumps({"trainer": {"cuda_device": cuda_device}})
params = Params.from_file(model_config, overrides)
model_file = 'checkpoint/%s%s/' % (model_name, attn)

iterator = DataIterator.from_params(params.pop("iterator"))

torch.manual_seed(0)
numpy.random.seed(0)

if write_file:
    wf = Write_outfile(Wfile_name)

print("Loading vocabulary")
vocab = Vocabulary.from_files(model_file + 'vocabulary')
print('Initializing model')
model = Model.from_params(vocab=vocab, params=params.pop('model'))
print("Loading Model file from %s" % (model_file + 'best.th'))
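# A plausible continuation of the loading step above (hedged sketch): restore the trained
# weights into the freshly constructed model and switch to eval mode for prediction.
with open(model_file + 'best.th', 'rb') as f:
    model.load_state_dict(torch.load(f, map_location='cpu'))
model.eval()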
def create_serialization_dir(params: Params, serialization_dir: str, recover: bool) -> None:
    """
    This function creates the serialization directory if it doesn't exist.  If it already
    exists and is non-empty, then it verifies that we're recovering from a training with an
    identical configuration.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: ``str``
        The directory in which to save results and logs.
    recover: ``bool``
        If ``True``, we will try to recover from an existing serialization directory, and crash
        if the directory doesn't exist, or doesn't match the configuration we're given.
    """
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        if not recover:
            raise ConfigurationError(
                f"Serialization directory ({serialization_dir}) already exists and is "
                f"not empty. Specify --recover to recover training from existing output.")

        logger.info(f"Recovering from prior training at {serialization_dir}.")

        recovered_config_file = os.path.join(serialization_dir, CONFIG_NAME)
        if not os.path.exists(recovered_config_file):
            raise ConfigurationError(
                "The serialization directory already exists but doesn't "
                "contain a config.json. You probably gave the wrong directory.")
        else:
            loaded_params = Params.from_file(recovered_config_file)

            # Check whether any of the training configuration differs from the configuration
            # we are resuming.  If so, warn the user that training may fail.
            fail = False
            flat_params = params.as_flat_dict()
            flat_loaded = loaded_params.as_flat_dict()
            for key in flat_params.keys() - flat_loaded.keys():
                logger.error(f"Key '{key}' found in training configuration but not in the "
                             f"serialization directory we're recovering from.")
                fail = True
            for key in flat_loaded.keys() - flat_params.keys():
                logger.error(f"Key '{key}' found in the serialization directory we're "
                             f"recovering from but not in the training config.")
                fail = True
            for key in flat_params.keys():
                if flat_params.get(key, None) != flat_loaded.get(key, None):
                    logger.error(f"Value for '{key}' in training configuration does not match "
                                 f"the value in the serialization directory we're recovering "
                                 f"from: {flat_params[key]} != {flat_loaded[key]}")
                    fail = True
            if fail:
                raise ConfigurationError("Training configuration does not match the "
                                         "configuration we're recovering from.")
    else:
        if recover:
            raise ConfigurationError(
                f"--recover specified but serialization_dir ({serialization_dir}) "
                "does not exist. There is nothing to recover from.")
        os.makedirs(serialization_dir, exist_ok=True)
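# A minimal sketch (hypothetical paths) of the two paths through the guard above: a fresh
# directory is created silently, while re-running on a non-empty directory without
# recover=True raises ConfigurationError.
params = Params.from_file("experiments/my_model.jsonnet")       # placeholder config
create_serialization_dir(params, "output/run1", recover=False)  # fresh: creates the dir
# Once training has written config.json into output/run1, resuming would look like:
#     create_serialization_dir(params, "output/run1", recover=True)
# which succeeds only if `params` matches the recovered config exactly.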
def test_model_load(self):
    params = Params.from_file(self.FIXTURES_ROOT / 'decomposable_attention' / 'experiment.json')
    model = Model.load(params,
                       serialization_dir=self.FIXTURES_ROOT / 'decomposable_attention' / 'serialization')

    assert isinstance(model, DecomposableAttention)
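# Related hedged sketch: Model.load also accepts an explicit weights file and a target
# device (both optional). The paths below are hypothetical placeholders, and a fresh
# Params object is loaded because Model.load consumes its input params.
params = Params.from_file('tests/fixtures/decomposable_attention/experiment.json')
model = Model.load(params,
                   serialization_dir='tests/fixtures/decomposable_attention/serialization',
                   weights_file='tests/fixtures/decomposable_attention/serialization/best.th',
                   cuda_device=-1)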
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = data if data else config['validation_data_path']

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"),
                           cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)

    with torch.autograd.no_grad():
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(model.vocab)

        model_predictions = []
        batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device)
        for batch in Tqdm.tqdm(batches):
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

    for instance, prediction in zip(instances, model_predictions):
        fields = instance.fields
        try:
            # Most sentences have a verbal predicate, but not all.
            verb_index = fields["verb_indicator"].labels.index(1)
        except ValueError:
            verb_index = None
        gold_tags = fields["tags"].labels
        sentence = [x.text for x in fields["tokens"].tokens]
        write_to_conll_eval_file(prediction_file, gold_file, verb_index, sentence,
                                 prediction, gold_tags)
    prediction_file.close()
    gold_file.close()
import_submodules(args.include_package)
archive_dir = Path(args.archive)
config_file = archive_dir / "config.json"

overrides = {
    "dataset_reader": {
        "read_dependencies": False,
        "max_len": 10000
    },
    "validation_dataset_reader": {
        "read_dependencies": False,
        "max_len": 10000,
    }
}
configs = [Params(overrides), Params.from_file(config_file)]
params = util.merge_configs(configs)

if params["model"]["type"] == "from_archive":
    model_config_file = str(Path(params["model"]["archive_file"]).parent.joinpath("config.json"))
    model_config = Params.from_file(model_config_file)["model"]
    params['model'] = model_config.as_dict(quiet=True)

try:
    lm_name = f"_{os.environ['LM']}" if os.environ.get("LM") else ""
    if os.environ["SHIFT"] == "1":
        params['model']["ft_lang_mean_dir"] = f"ckpts/{os.environ['FT_LANG']}_mean{lm_name}"
except (AttributeError, KeyError):
    pass

try:
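# A hedged sketch of the environment variables the block above consumes; the language
# code and LM tag are hypothetical placeholders. With these set, the model would be
# given a ft_lang_mean_dir of "ckpts/ar_mean_xlmr".
import os
os.environ["SHIFT"] = "1"
os.environ["FT_LANG"] = "ar"
os.environ["LM"] = "xlmr"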
                                      for token in question_tokens]
    json_obj['table_lines'] = instance.fields['table_metadata'].metadata

    action_map = {i: action.rule
                  for i, action in enumerate(instance.fields['actions'].field_list)}

    if 'target_action_sequences' in instance.fields:
        targets = []
        for target_sequence in instance.fields['target_action_sequences'].field_list:
            targets.append([])
            for target_index_field in target_sequence.field_list:
                targets[-1].append(action_map[target_index_field.sequence_index])
        json_obj['target_action_sequences'] = targets

    json_obj['example_lisp_string'] = instance.fields['example_lisp_string'].metadata

    entity_texts = []
    for entity_text in instance.fields['table'].entity_texts:
        tokens = [{'text': token.text, 'lemma': token.lemma_} for token in entity_text]
        entity_texts.append(tokens)
    json_obj['entity_texts'] = entity_texts
    json_obj['linking_features'] = instance.fields['table'].linking_features
    return json.dumps(json_obj)


if __name__ == '__main__':
    param_file = sys.argv[1]
    outdir = 'wikitables_preprocessed_data/'
    params = Params.from_file(param_file)
    main(params, outdir)
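# Hypothetical consumer of the preprocessed output above (the file name is a
# placeholder), assuming main wrote one serialized JSON object per line.
import json

with open('wikitables_preprocessed_data/train.jsonl') as f:
    for line in f:
        record = json.loads(line)
        print(len(record['table_lines']), len(record['entity_texts']))
        break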