def test_extra_files(self):
    serialization_dir = self.TEST_DIR / 'serialization'

    # Train a model
    train_model(self.params, serialization_dir=serialization_dir)

    # Archive model, and also archive the training data
    files_to_archive = {
            "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    }
    archive_model(serialization_dir=serialization_dir, files_to_archive=files_to_archive)

    archive = load_archive(serialization_dir / 'model.tar.gz')
    params = archive.config

    # The train_data_path param should have been replaced with a temporary path
    # (which we don't know, but we know what it ends with).
    assert params.get('train_data_path').endswith('/fta/train_data_path')

    # The validation data path should be the same though.
    assert params.get('validation_data_path') == str(
            self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
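# A minimal sketch of the `files_to_archive` contract this test exercises
# (keys are flattened config keys; "/path/to/train.tsv" is a placeholder, and
# the behavior shown simply mirrors the assertions above):
#
#     files_to_archive = {"train_data_path": "/path/to/train.tsv"}
#     archive_model(serialization_dir=serialization_dir,
#                   files_to_archive=files_to_archive)
#     archive = load_archive(serialization_dir / 'model.tar.gz')
#     # The archived param now points at the extracted copy under fta/:
#     archive.config.get('train_data_path')  # ends with '/fta/train_data_path'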
def test_error_is_thrown_when_cuda_device_is_not_available(self):
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            # Valid device ids run from 0 to device_count() - 1, so this id
            # is guaranteed to be unavailable.
            "cuda_device": torch.cuda.device_count(),
            "optimizer": "adam"
        }
    })

    with pytest.raises(ConfigurationError,
                       message="Experiment specified a GPU but none is available;"
                               " if you want to run on CPU use the override"
                               " 'trainer.cuda_device=-1' in the json config file."):
        train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))
def test_train_with_test_set(self):
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "lazy-test"},
        "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "test_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "evaluate_on_test": True,
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })

    train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'lazy_test_set'))
def test_train_nograd_regex(self):
    params_get = lambda: Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })

    serialization_dir = os.path.join(self.TEST_DIR, 'test_train_nograd')
    regex_lists = [[],
                   [".*text_field_embedder.*"],
                   [".*text_field_embedder.*", ".*encoder.*"]]
    for regex_list in regex_lists:
        params = params_get()
        params["trainer"]["no_grad"] = regex_list
        shutil.rmtree(serialization_dir, ignore_errors=True)
        model = train_model(params, serialization_dir=serialization_dir)
        # If a parameter's name matches one of the regexes, its requires_grad
        # should be False; otherwise it should be True.
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in regex_list):
                assert not parameter.requires_grad
            else:
                assert parameter.requires_grad

    # If all parameters have requires_grad=False, training should error out.
    # (".*" matches every parameter name.)
    params = params_get()
    params["trainer"]["no_grad"] = [".*"]
    shutil.rmtree(serialization_dir, ignore_errors=True)
    with pytest.raises(Exception):
        train_model(params, serialization_dir=serialization_dir)
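# For reference, the freezing behavior this test asserts amounts to the
# following (a sketch mirroring the assertions above, not the trainer
# internals):
#
#     for name, parameter in model.named_parameters():
#         if any(re.search(regex, name) for regex in no_grad_regexes):
#             parameter.requires_grad_(False)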
def test_archiving(self):
    # copy params, since they'll get consumed during training
    params_copy = copy.deepcopy(self.params.as_dict())

    # `train_model` should create an archive
    serialization_dir = self.TEST_DIR / 'archive_test'
    model = train_model(self.params, serialization_dir=serialization_dir)

    archive_path = serialization_dir / "model.tar.gz"

    # load from the archive
    archive = load_archive(archive_path)
    model2 = archive.model

    # check that model weights are the same
    keys = set(model.state_dict().keys())
    keys2 = set(model2.state_dict().keys())
    assert keys == keys2

    for key in keys:
        assert torch.equal(model.state_dict()[key], model2.state_dict()[key])

    # check that vocabularies are the same
    vocab = model.vocab
    vocab2 = model2.vocab
    assert vocab._token_to_index == vocab2._token_to_index  # pylint: disable=protected-access
    assert vocab._index_to_token == vocab2._index_to_token  # pylint: disable=protected-access

    # check that params are the same
    params2 = archive.config
    assert params2.as_dict() == params_copy
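# The round trip exercised above, in miniature (all names as used in this
# test; nothing here is new API):
#
#     model = train_model(params, serialization_dir=serialization_dir)  # writes model.tar.gz
#     archive = load_archive(serialization_dir / 'model.tar.gz')
#     archive.model   # a Model with the same weights and vocabulary as `model`
#     archive.config  # the Params the model was trained with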
def test_train_model(self):
    params = lambda: Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })

    train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))

    # It's OK if the serialization dir exists but is empty:
    serialization_dir2 = os.path.join(self.TEST_DIR, 'empty_directory')
    assert not os.path.exists(serialization_dir2)
    os.makedirs(serialization_dir2)
    train_model(params(), serialization_dir=serialization_dir2)

    # It's not OK if the serialization dir exists and is non-empty:
    serialization_dir3 = os.path.join(self.TEST_DIR, 'non_empty_directory')
    assert not os.path.exists(serialization_dir3)
    os.makedirs(serialization_dir3)
    with open(os.path.join(serialization_dir3, 'README.md'), 'w') as f:
        f.write("TEST")
    with pytest.raises(ConfigurationError):
        train_model(params(), serialization_dir=serialization_dir3)

    # It's also not OK if the serialization dir is a real, already-used serialization dir:
    with pytest.raises(ConfigurationError):
        train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))
def test_file_archiving(self):
    # This happens to be a good place to test auxiliary file archiving.
    # Train the model
    params = Params.from_file(self.FIXTURES_ROOT / 'elmo' / 'config' / 'characters_token_embedder.json')
    serialization_dir = os.path.join(self.TEST_DIR, 'serialization')
    train_model(params, serialization_dir)

    # Inspect the archive
    archive_file = os.path.join(serialization_dir, 'model.tar.gz')
    unarchive_dir = os.path.join(self.TEST_DIR, 'unarchive')
    with tarfile.open(archive_file, 'r:gz') as archive:
        archive.extractall(unarchive_dir)

    # It should contain `files_to_archive.json`
    fta_file = os.path.join(unarchive_dir, 'files_to_archive.json')
    assert os.path.exists(fta_file)

    # Which should properly contain { flattened_key -> original_filename }
    with open(fta_file) as fta:
        files_to_archive = json.loads(fta.read())

    assert files_to_archive == {
            'model.text_field_embedder.token_embedders.elmo.options_file': str(
                    pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo' / 'options.json'),
            'model.text_field_embedder.token_embedders.elmo.weight_file': str(
                    pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo' / 'lm_weights.hdf5'),
    }

    # Check that the unarchived contents of those files match the original contents.
    for key, original_filename in files_to_archive.items():
        new_filename = os.path.join(unarchive_dir, "fta", key)
        assert filecmp.cmp(original_filename, new_filename)
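# Archive layout implied by this test. Entries marked (?) are standard
# AllenNLP archive contents assumed here rather than asserted above:
#
#     model.tar.gz
#     ├── config.json               (?)
#     ├── weights.th                (?)
#     ├── vocabulary/               (?)
#     ├── files_to_archive.json     # {flattened_key: original_filename}
#     └── fta/<flattened_key>       # archived copies of the referenced files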