def test_train_model(self):
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "stacked_encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })
    train_model(params, serialization_dir=self.TEST_DIR)
def train_model_from_args(args: argparse.Namespace):
    params = Params.from_file(args.param_path, args.overrides)
    params_dict = params.as_flat_dict()
    params_dict.update({"args": vars(args)})
    flattened_params = flatten_dict_for_mlflow_log(params_dict)
    with mlflow.start_run():
        mlflow.log_params(flattened_params)
        serialization_dir = get_serialization_dir(args)
        try:
            train_model(
                params=params,
                serialization_dir=serialization_dir,
                file_friendly_logging=args.file_friendly_logging,
                recover=args.recover,
                force=args.force,
                node_rank=args.node_rank,
                include_package=args.include_package,
                dry_run=args.dry_run,
            )
        finally:
            if not args.dry_run:
                mlflow.log_artifacts(serialization_dir)
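For reference, a minimal sketch of the `flatten_dict_for_mlflow_log` helper the function above assumes: MLflow's `log_params` wants a flat, string-valued mapping, so nested keys (such as the `args` entry added above) need to be joined with dots. The actual helper in the source repository may differ.

from typing import Any, Dict

def flatten_dict_for_mlflow_log(params: Dict[str, Any], parent_key: str = "") -> Dict[str, str]:
    """Hypothetical helper: recursively flatten a nested dict into {"a.b.c": "value"} form."""
    items: Dict[str, str] = {}
    for key, value in params.items():
        new_key = f"{parent_key}.{key}" if parent_key else str(key)
        if isinstance(value, dict):
            # Recurse into nested dicts, accumulating dotted keys.
            items.update(flatten_dict_for_mlflow_log(value, new_key))
        else:
            items[new_key] = str(value)
    return items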
def test_train_with_test_set(self):
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "type": "embedding",
                        "embedding_dim": 5
                    }
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "lazy-test"},
        "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "test_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "evaluate_on_test": True,
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })
    train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'lazy_test_set'))
def train_func(config, reporter):
    logger.debug(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")

    for package_name in getattr(args, "include_package", ()):
        import_submodules(package_name)

    search_space = HyperparameterSearch(**config)
    sample = search_space.sample()
    for k, v in sample.items():
        config[k] = str(v)

    params_dict = json.loads(
        _jsonnet.evaluate_snippet(
            "config", parameter_file_snippet, tla_codes={}, ext_vars=config
        )
    )
    if args.num_gpus == 0:
        logger.warning("No GPU specified, using CPU.")
        params_dict["trainer"]["cuda_device"] = -1

    if args.cpus_per_trial > 0:
        torch.set_num_threads(args.cpus_per_trial)

    params = Params(params_dict)
    logger.debug(f"AllenNLP Configuration: {params.as_dict()}")

    train_model(params=params, serialization_dir="trial")

    reporter(done=True)
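A hedged sketch of how a reporter-style trainable like `train_func` might be launched; the `reporter` argument implies Ray Tune's older function API. The experiment name, search space, and resource counts below are illustrative assumptions, not taken from the source.

import ray
from ray import tune

ray.init()
# Each trial receives a config dict; train_func samples concrete values from
# it via HyperparameterSearch and trains into ./trial.
tune.run(
    train_func,
    name="tagger_search",   # assumed experiment name
    config={},              # search space consumed by HyperparameterSearch goes here
    num_samples=10,
    resources_per_trial={"cpu": 1, "gpu": 1},
)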
def cmd_train(
    source: pathlib.Path,
    destination: pathlib.Path,
    can_overwrite: bool,
):
    # =========================
    # Prepare the output folder
    # =========================
    source_path_name = pathlib.Path(source).name

    dest_folder_root: pathlib.Path
    if destination:
        dest_folder_root = destination
        # TODO: check if it's empty
    else:
        dest_folder_root = ct.create_folder_time(f"ml_{source_path_name}_", to_make=True)
    # === END IF ===

    import os
    os.chdir(source)

    import allennlp.common.params as allp
    import allennlp.common.util as allu
    import allennlp.commands.train as allct

    allu.import_submodules("depccg.models.my_allennlp")
    allct.train_model(
        params=allp.Params.from_file(core.FILES["trainer_settings"]),
        serialization_dir=dest_folder_root,
    )
# === END ===
def test_train_model(self): params = lambda: Params({ "model": { "type": "constant" }, "dataset_reader": { "type": "sequence_tagging" }, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, "data_loader": { "batch_size": 2 }, "trainer": { "type": "no_op" }, }) serialization_dir = self.TEST_DIR / "serialization_directory" train_model(params(), serialization_dir=serialization_dir) archive = load_archive(str(serialization_dir / "model.tar.gz")) model = archive.model assert model.forward(torch.tensor([1, 2, 3]))["class"] == torch.tensor(98) assert model.vocab.get_vocab_size() == 9
def test_file_archiving(self):
    # This happens to be a good place to test auxiliary file archiving.
    # Train the model
    params = Params.from_file('tests/fixtures/elmo/config/characters_token_embedder.json')
    serialization_dir = os.path.join(self.TEST_DIR, 'serialization')
    train_model(params, serialization_dir)

    # Inspect the archive
    archive_file = os.path.join(serialization_dir, 'model.tar.gz')
    unarchive_dir = os.path.join(self.TEST_DIR, 'unarchive')
    with tarfile.open(archive_file, 'r:gz') as archive:
        archive.extractall(unarchive_dir)

    # It should contain `files_to_archive.json`
    fta_file = os.path.join(unarchive_dir, 'files_to_archive.json')
    assert os.path.exists(fta_file)

    # Which should properly contain { hocon_key -> original_filename }
    with open(fta_file) as fta:
        files_to_archive = json.loads(fta.read())
    assert files_to_archive == {
        'model.text_field_embedder.elmo.options_file': 'tests/fixtures/elmo/options.json',
        'model.text_field_embedder.elmo.weight_file': 'tests/fixtures/elmo/lm_weights.hdf5'
    }

    # Check that the unarchived contents of those files match the original contents.
    for key, original_filename in files_to_archive.items():
        new_filename = os.path.join(unarchive_dir, "fta", key)
        assert filecmp.cmp(original_filename, new_filename)
def test_file_archiving(self):
    # This happens to be a good place to test auxiliary file archiving.
    # Train the model
    params = Params.from_file(self.FIXTURES_ROOT / 'elmo' / 'config' / 'characters_token_embedder.json')
    serialization_dir = os.path.join(self.TEST_DIR, 'serialization')
    train_model(params, serialization_dir)

    # Inspect the archive
    archive_file = os.path.join(serialization_dir, 'model.tar.gz')
    unarchive_dir = os.path.join(self.TEST_DIR, 'unarchive')
    with tarfile.open(archive_file, 'r:gz') as archive:
        archive.extractall(unarchive_dir)

    # It should contain `files_to_archive.json`
    fta_file = os.path.join(unarchive_dir, 'files_to_archive.json')
    assert os.path.exists(fta_file)

    # Which should properly contain { flattened_key -> original_filename }
    with open(fta_file) as fta:
        files_to_archive = json.loads(fta.read())
    assert files_to_archive == {
        'model.text_field_embedder.token_embedders.elmo.options_file':
            str(pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo' / 'options.json'),
        'model.text_field_embedder.token_embedders.elmo.weight_file':
            str(pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo' / 'lm_weights.hdf5'),
    }

    # Check that the unarchived contents of those files match the original contents.
    for key, original_filename in files_to_archive.items():
        new_filename = os.path.join(unarchive_dir, "fta", key)
        assert filecmp.cmp(original_filename, new_filename)
def test_fine_tune_nograd_regex(self):
    original_model = load_archive(self.model_archive).model
    name_parameters_original = dict(original_model.named_parameters())
    regex_lists = [
        [],
        [".*attend_feedforward.*", ".*token_embedder.*"],
        [".*compare_feedforward.*"],
    ]
    for regex_list in regex_lists:
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = regex_list
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        tuned_model = train_model(
            model=original_model, params=params, serialization_dir=self.serialization_dir
        )
        # If a regex matches, the parameter should have requires_grad False;
        # otherwise it should have the same requires_grad as in the
        # originally loaded model.
        for name, parameter in tuned_model.named_parameters():
            if any(re.search(regex, name) for regex in regex_list):
                assert not parameter.requires_grad
            else:
                assert parameter.requires_grad == name_parameters_original[name].requires_grad
    # If all parameters have requires_grad=False, training should error.
    with pytest.raises(Exception) as _:
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = ["*"]
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        train_model(
            model=original_model, params=params, serialization_dir=self.serialization_dir
        )
def test_error_is_thrown_when_cuda_device_is_not_available(self):
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "cuda_device": torch.cuda.device_count(),
            "optimizer": "adam"
        }
    })

    with pytest.raises(ConfigurationError,
                       message="Experiment specified a GPU but none is available;"
                               " if you want to run on CPU use the override"
                               " 'trainer.cuda_device=-1' in the json config file."):
        train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))
def test_dry_run_makes_vocab(self):
    vocab_path = self.TEST_DIR / "vocabulary"

    train_model(self.params, self.TEST_DIR, dry_run=True)

    vocab_files = os.listdir(vocab_path)
    assert set(vocab_files) == {
        ".lock",
        "labels.txt",
        "non_padded_namespaces.txt",
        "tokens.txt",
    }

    with open(vocab_path / "tokens.txt") as f:
        tokens = [line.strip() for line in f]
    tokens.sort()
    assert tokens == [".", "@@UNKNOWN@@", "animals", "are", "birds", "cats", "dogs", "snakes"]

    with open(vocab_path / "labels.txt") as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ["N", "V"]
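For context, a plausible shape for the `self.params` fixture that this and the other dry-run tests below consume: a minimal `simple_tagger` experiment over the sequence-tagging fixture data. The dimensions and keys are illustrative assumptions; the actual fixture in the source repository may differ.

self.params = Params({
    "model": {
        "type": "simple_tagger",
        "text_field_embedder": {
            "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
        },
        "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
    },
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
    "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
    "data_loader": {"batch_size": 2},
    "trainer": {"num_epochs": 2, "optimizer": "adam"},
})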
def test_error_is_thrown_when_cuda_device_is_not_available(self):
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "cuda_device": torch.cuda.device_count(),
            "optimizer": "adam"
        }
    })

    with pytest.raises(ConfigurationError, match="Experiment specified"):
        train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))
def test_train_model_distributed(self):
    params = lambda: Params(
        {
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            },
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "iterator": {"type": "basic", "batch_size": 2},
            "trainer": {"num_epochs": 2, "optimizer": "adam"},
            "distributed": {"cuda_devices": [0, 1]},
        }
    )

    out_dir = os.path.join(self.TEST_DIR, "test_distributed_train")
    train_model(params(), serialization_dir=out_dir)

    # Check that some logs specific to distributed training are where we expect.
    serialized_files = os.listdir(out_dir)
    assert "stderr_worker0.log" in serialized_files
    assert "stdout_worker0.log" in serialized_files
    assert "stderr_worker1.log" in serialized_files
    assert "stdout_worker1.log" in serialized_files
    assert "model.tar.gz" in serialized_files

    # Check we can load the serialized model
    assert load_archive(out_dir).model
def test_dry_run_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / "existing"
    extended_serialization_dir = self.TEST_DIR / "extended"
    existing_vocab_path = existing_serialization_dir / "vocabulary"
    extended_vocab_path = extended_serialization_dir / "vocabulary"

    vocab = Vocabulary()
    # If extend is False, it is the user's responsibility to make sure that dataset
    # instances will be indexable by the provided vocabulary. At least @@UNKNOWN@@
    # should be present in any namespace for which there could be OOV entries seen
    # in the dataset during indexing. For the `tokens` namespace, new words will be
    # seen, but `tokens` has the @@UNKNOWN@@ token. For the `labels` namespace there
    # is no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
    vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
    vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
    vocab.add_token_to_namespace("N", namespace="labels")
    vocab.add_token_to_namespace("V", namespace="labels")
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params["vocabulary"] = {}
    self.params["vocabulary"]["type"] = "from_files"
    self.params["vocabulary"]["directory"] = str(existing_vocab_path)
    train_model(self.params, extended_serialization_dir, dry_run=True)

    with open(extended_vocab_path / "tokens.txt") as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == "@@UNKNOWN@@"
    assert tokens[1] == "some_weird_token_1"
    assert tokens[2] == "some_weird_token_2"
    assert len(tokens) == 3
def test_train_model(self): params = Params({ u"model": { u"type": u"simple_tagger", u"text_field_embedder": { u"tokens": { u"type": u"embedding", u"embedding_dim": 5 } }, u"encoder": { u"type": u"lstm", u"input_size": 5, u"hidden_size": 7, u"num_layers": 2 } }, u"dataset_reader": { u"type": u"lazy-test" }, u"train_data_path": SEQUENCE_TAGGING_DATA_PATH, u"validation_data_path": SEQUENCE_TAGGING_DATA_PATH, u"iterator": { u"type": u"basic", u"batch_size": 2 }, u"trainer": { u"num_epochs": 2, u"optimizer": u"adam" } }) train_model(params, serialization_dir=os.path.join(self.TEST_DIR, u'train_lazy_model'))
def test_train_saves_all_keys_in_config(self):
    params = Params(
        {
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            },
            "pytorch_seed": 42,
            "numpy_seed": 42,
            "random_seed": 42,
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "iterator": {"type": "basic", "batch_size": 2},
            "trainer": {"num_epochs": 2, "optimizer": "adam"},
        }
    )

    serialization_dir = os.path.join(self.TEST_DIR, "test_train_model")
    params_as_dict = params.as_ordered_dict()  # Do it here as train_model will pop all the values.
    train_model(params, serialization_dir=serialization_dir)

    config_path = os.path.join(serialization_dir, CONFIG_NAME)
    with open(config_path, "r") as config:
        saved_config_as_dict = OrderedDict(json.load(config))
    assert params_as_dict == saved_config_as_dict
def test_extra_files(self):
    serialization_dir = self.TEST_DIR / 'serialization'

    # Train a model
    train_model(self.params, serialization_dir=serialization_dir)

    # Archive model, and also archive the training data
    files_to_archive = {
        "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    }
    archive_model(serialization_dir=serialization_dir, files_to_archive=files_to_archive)

    archive = load_archive(serialization_dir / 'model.tar.gz')
    params = archive.config

    # The param in the data should have been replaced with a temporary path
    # (which we don't know, but we know what it ends with).
    assert params.get('train_data_path').endswith('/fta/train_data_path')

    # The temporary path should be accessible even after the load_archive
    # function returns.
    assert os.path.exists(params.get('train_data_path'))

    # The validation data path should be the same though.
    assert params.get('validation_data_path') == str(
        self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
def test_train_with_test_set(self):
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "test_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
        "evaluate_on_test": True,
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })
    train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'train_with_test_set'))
def test_invalid_include_in_archive(self):
    self.params["include_in_archive"] = [CONFIG_NAME]

    serialization_dir = self.TEST_DIR / "serialization"

    with pytest.raises(ConfigurationError) as exc:
        train_model(self.params, serialization_dir=serialization_dir)
    assert "are saved names and cannot be used" in str(exc.value)
def test_dry_run_doesnt_overwrite_vocab(self):
    vocab_path = self.TEST_DIR / "vocabulary"
    os.mkdir(vocab_path)
    # Put something in the vocab directory
    with open(vocab_path / "test.txt", "a+") as open_file:
        open_file.write("test")
    # It should raise an error if the vocab dir is non-empty
    with pytest.raises(ConfigurationError):
        train_model(self.params, self.TEST_DIR, dry_run=True)
def test_fine_tune_does_not_expand_vocab_by_default(self):
    params = Params.from_file(self.config_file)
    # snli2 has a new token in it
    params["train_data_path"] = str(self.FIXTURES_ROOT / "data" / "snli2.jsonl")

    model = load_archive(self.model_archive).model

    # By default, no vocab expansion.
    train_model(params, self.serialization_dir, model=model)
def test_archive_model_uses_archive_path(self):
    serialization_dir = self.TEST_DIR / 'serialization'
    # Train a model
    train_model(self.params, serialization_dir=serialization_dir)
    # Use a new path.
    archive_model(serialization_dir=serialization_dir,
                  archive_path=serialization_dir / "new_path.tar.gz")
    archive = load_archive(serialization_dir / 'new_path.tar.gz')
    assert archive
def test_fine_tune_extended_model_is_loadable(self):
    params = Params.from_file(self.config_file)
    # snli2 has a new token (seahorse) in it
    params["train_data_path"] = str(self.FIXTURES_ROOT / "data" / "snli2.jsonl")

    trained_model = load_archive(self.model_archive).model
    shutil.rmtree(self.serialization_dir, ignore_errors=True)
    train_model(
        params.duplicate(), self.serialization_dir, model=trained_model, extend_vocab=True
    )
    # self.serialization_dir = str(self.TEST_DIR / 'fine_tune')
    load_archive(str(self.TEST_DIR / "fine_tune" / "model.tar.gz"))
def train_fixture_gpu(config_file: str, serialization_dir: str) -> None:
    params = Params.from_file(config_file)
    params["trainer"]["cuda_device"] = 0

    # train this one to a tempdir
    tempdir = tempfile.gettempdir()
    train_model(params, tempdir)

    # now copy back the weights and archived model
    shutil.copy(os.path.join(tempdir, "best.th"),
                os.path.join(serialization_dir, "best_gpu.th"))
    shutil.copy(os.path.join(tempdir, "model.tar.gz"),
                os.path.join(serialization_dir, "model_gpu.tar.gz"))
def test_train_nograd_regex(self):
    params_get = lambda: Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "type": "embedding",
                        "embedding_dim": 5
                    }
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })
    serialization_dir = os.path.join(self.TEST_DIR, 'test_train_nograd')
    regex_lists = [[],
                   [".*text_field_embedder.*"],
                   [".*text_field_embedder.*", ".*encoder.*"]]
    for regex_list in regex_lists:
        params = params_get()
        params["trainer"]["no_grad"] = regex_list
        shutil.rmtree(serialization_dir, ignore_errors=True)
        model = train_model(params, serialization_dir=serialization_dir)
        # If regex is matched, parameter name should have requires_grad False
        # Or else True
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in regex_list):
                assert not parameter.requires_grad
            else:
                assert parameter.requires_grad
    # If all parameters have requires_grad=False, then error.
    params = params_get()
    params["trainer"]["no_grad"] = ["*"]
    shutil.rmtree(serialization_dir, ignore_errors=True)
    with pytest.raises(Exception) as _:
        model = train_model(params, serialization_dir=serialization_dir)
def test_trainer_can_run_from_params(self):
    # pylint: disable=bad-continuation
    from allennlp.commands.train import train_model

    params = Params({
        "trainer": {
            "type": "callback",
            "optimizer": {"type": "sgd", "lr": 0.01, "momentum": 0.9},
            "num_epochs": 2,
            "callbacks": [
                "generate_training_batches",
                "train_supervised",
                "checkpoint",
                "track_metrics",
                "validate",
                {"type": "log_to_tensorboard", "log_batch_size_period": 10}
            ]
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "type": "embedding",
                        "embedding_dim": 5
                    }
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        },
        "iterator": {"type": "basic", "batch_size": 2}
    })
    train_model(params, self.TEST_DIR)

    with open(self.TEST_DIR / 'metrics.json') as f:
        metrics = json.load(f)
    assert 'best_validation_loss' in metrics
    assert isinstance(metrics['best_validation_loss'], float)
    assert 'best_validation_accuracy' in metrics
    assert isinstance(metrics['best_validation_accuracy'], float)
    assert 'best_validation_accuracy3' in metrics
    assert isinstance(metrics['best_validation_accuracy3'], float)
    assert 'best_epoch' in metrics
    assert isinstance(metrics['best_epoch'], int)
def test_force_cpu(self):
    import copy

    params = copy.deepcopy(self.DEFAULT_PARAMS)
    params["trainer"]["batch_callbacks"] = ["training_device_logger"]
    params["trainer"]["cuda_device"] = -1

    global _seen_training_devices
    _seen_training_devices.clear()
    train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_force_cpu"))

    assert len(_seen_training_devices) == 1
    seen_training_device = next(iter(_seen_training_devices))
    assert seen_training_device.type == "cpu"
def train_fixture_gpu(config_prefix: str) -> None:
    config_file = config_prefix + 'experiment.json'
    serialization_dir = config_prefix + 'serialization'
    params = Params.from_file(config_file)
    params["trainer"]["cuda_device"] = 0

    # train this one to a tempdir
    tempdir = tempfile.gettempdir()
    train_model(params, tempdir)

    # now copy back the weights and archived model
    shutil.copy(os.path.join(tempdir, "best.th"),
                os.path.join(serialization_dir, "best_gpu.th"))
    shutil.copy(os.path.join(tempdir, "model.tar.gz"),
                os.path.join(serialization_dir, "model_gpu.tar.gz"))
def test_include_in_archive(self):
    self.params["include_in_archive"] = ["metrics_epoch_*.json"]

    serialization_dir = self.TEST_DIR / "serialization"
    # Train a model
    train_model(self.params, serialization_dir=serialization_dir)

    # Assert that the additional targets were archived
    with tempfile.TemporaryDirectory() as tempdir:
        with tarfile.open(serialization_dir / "model.tar.gz", "r:gz") as archive:
            archive.extractall(tempdir)
        assert os.path.isfile(os.path.join(tempdir, "metrics_epoch_0.json"))
        assert os.path.isfile(os.path.join(tempdir, "metrics_epoch_1.json"))
        assert not os.path.isfile(os.path.join(tempdir, "metrics.json"))
def main():
    parser = argparse.ArgumentParser(description="Train QA-SRL model variants.")
    parser.add_argument("models_root", metavar="path", type=str,
                        help="Path to root of model variants")
    parser.add_argument("models_branch", metavar="path", type=str,
                        help="Path to config file")
    parser.add_argument("initial_batch_size", metavar="n", type=int,
                        help="Batch size to start with before cutting as necessary")
    args = parser.parse_args()

    serialization_directory = "/gscratch/cse/julianjm/qasrl-models/" + args.models_branch
    current_batch_size = args.initial_batch_size

    done = False
    while not done:
        try:
            if not os.path.exists(serialization_directory + "/current"):
                print("Starting new training round", flush=True)
                if not os.path.exists(serialization_directory):
                    os.makedirs(serialization_directory)
                config_path = args.models_root + "/" + args.models_branch
                params = Params.from_file(config_path, "")
                params["trainer"]["num_serialized_models_to_keep"] = 2
                params["trainer"]["should_log_parameter_statistics"] = False
                params["iterator"]["biggest_batch_first"] = True
                params["iterator"]["batch_size"] = current_batch_size
                torch.cuda.empty_cache()
                train_model(params, serialization_directory + "/current",
                            file_friendly_logging=True)
                done = True
            else:
                print("Recovering from a previously preempted run", flush=True)
                config_path = serialization_directory + "/current/config.json"
                params = Params.from_file(config_path, "")
                current_batch_size = params["iterator"]["batch_size"]
                torch.cuda.empty_cache()
                train_model(params, serialization_directory + "/current",
                            file_friendly_logging=True, recover=True)
                done = True
        except RuntimeError as e:
            if ('out of memory' in str(e)
                    or "CUDNN_STATUS_NOT_SUPPORTED" in str(e)
                    or "an illegal memory access was encountered" in str(e)):
                print(str(e), flush=True)
                # Integer division keeps the batch size an int for the config.
                print('Reducing batch size to %s and retrying' % (current_batch_size // 2),
                      flush=True)
                subprocess.call(["mv",
                                 serialization_directory + "/current",
                                 serialization_directory + "/" + str(int(current_batch_size))])
                current_batch_size = current_batch_size // 2
                torch.cuda.empty_cache()
            else:
                raise
    print("Finished training.", flush=True)
def test_detect_gpu(self):
    import copy

    params = copy.deepcopy(self.DEFAULT_PARAMS)
    params["trainer"]["callbacks"] = ["training_device_logger"]

    global _seen_training_devices
    _seen_training_devices.clear()
    train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "test_detect_gpu"))

    assert len(_seen_training_devices) == 1
    seen_training_device = next(iter(_seen_training_devices))
    if torch.cuda.device_count() == 0:
        assert seen_training_device.type == "cpu"
    else:
        assert seen_training_device.type == "cuda"
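A sketch of the `training_device_logger` callback that the two device tests above rely on, assuming AllenNLP's `TrainerCallback` registry: it records the device of each model parameter into the module-level `_seen_training_devices` set that the assertions inspect. The original registration in the source repository may differ in detail.

import torch
from allennlp.training import TrainerCallback

_seen_training_devices = set()

@TrainerCallback.register("training_device_logger")
class TrainingDeviceLogger(TrainerCallback):
    def on_batch(self, trainer, *args, **kwargs) -> None:
        global _seen_training_devices
        # Record the device of every model parameter seen during training.
        for tensor in trainer.model.parameters():
            _seen_training_devices.add(tensor.device)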
def run_trial(trial_params: Params,
              train_params: Params,
              serialization_dir: PathLike,
              seed: int,
              recover: Optional[bool] = False,
              force: Optional[bool] = False,
              train_only: Optional[bool] = False,
              re_calculate: Optional[str] = None) -> AttentionCorrelationTrial:
    _trial_params = deepcopy(trial_params)
    _train_params = deepcopy(train_params)

    test_data_path = _train_params['test_data_path']
    trial_dir = os.path.join(serialization_dir, f"seed_{seed}")

    should_train = any([force, not utils.model_already_trained(trial_dir), recover])
    if should_train:
        _train_params['random_seed'] = seed
        _train_params['numpy_seed'] = seed
        _train_params['pytorch_seed'] = seed
        train_model(params=_train_params,
                    serialization_dir=trial_dir,
                    recover=recover,
                    force=force)

    if train_only:
        logger.info("'train-only' was specified. Finishing without calculating measures.")
        return

    attention_trial = AttentionCorrelationTrial.from_params(
        params=_trial_params,
        seed=seed,
        serialization_dir=trial_dir,
        test_data_path=test_data_path)

    recalc_fi = re_calculate == "both"
    recalc_corr = recalc_fi or re_calculate == "correlation"

    attention_trial.calculate_feature_importance(force=recalc_fi)
    attention_trial.calculate_correlation(force=recalc_corr)

    return attention_trial
def test_train_model(self): params = lambda: Params({ u"model": { u"type": u"simple_tagger", u"text_field_embedder": { u"tokens": { u"type": u"embedding", u"embedding_dim": 5 } }, u"encoder": { u"type": u"lstm", u"input_size": 5, u"hidden_size": 7, u"num_layers": 2 } }, u"dataset_reader": { u"type": u"sequence_tagging" }, u"train_data_path": SEQUENCE_TAGGING_DATA_PATH, u"validation_data_path": SEQUENCE_TAGGING_DATA_PATH, u"iterator": { u"type": u"basic", u"batch_size": 2 }, u"trainer": { u"num_epochs": 2, u"optimizer": u"adam" } }) train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, u'test_train_model')) # It's OK if serialization dir exists but is empty: serialization_dir2 = os.path.join(self.TEST_DIR, u'empty_directory') assert not os.path.exists(serialization_dir2) os.makedirs(serialization_dir2) train_model(params(), serialization_dir=serialization_dir2) # It's not OK if serialization dir exists and has junk in it non-empty: serialization_dir3 = os.path.join(self.TEST_DIR, u'non_empty_directory') assert not os.path.exists(serialization_dir3) os.makedirs(serialization_dir3) with open(os.path.join(serialization_dir3, u'README.md'), u'w') as f: f.write(u"TEST") with pytest.raises(ConfigurationError): train_model(params(), serialization_dir=serialization_dir3) # It's also not OK if serialization dir is a real serialization dir: with pytest.raises(ConfigurationError): train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, u'test_train_model'))
def test_dry_run_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / "existing"
    extended_serialization_dir = self.TEST_DIR / "extended"
    existing_vocab_path = existing_serialization_dir / "vocabulary"
    extended_vocab_path = extended_serialization_dir / "vocabulary"

    vocab = Vocabulary()
    vocab.add_token_to_namespace("some_weird_token_1", namespace="tokens")
    vocab.add_token_to_namespace("some_weird_token_2", namespace="tokens")
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params["vocabulary"] = {}
    self.params["vocabulary"]["type"] = "extend"
    self.params["vocabulary"]["directory"] = str(existing_vocab_path)
    self.params["vocabulary"]["min_count"] = {"tokens": 3}
    train_model(self.params, extended_serialization_dir, dry_run=True)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {
        ".lock",
        "labels.txt",
        "non_padded_namespaces.txt",
        "tokens.txt",
    }

    with open(extended_vocab_path / "tokens.txt") as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == "@@UNKNOWN@@"
    assert tokens[1] == "some_weird_token_1"
    assert tokens[2] == "some_weird_token_2"

    tokens.sort()
    assert tokens == [
        ".",
        "@@UNKNOWN@@",
        "animals",
        "are",
        "some_weird_token_1",
        "some_weird_token_2",
    ]

    with open(extended_vocab_path / "labels.txt") as f:
        labels = [line.strip() for line in f]

    labels.sort()
    assert labels == ["N", "V"]
def test_archiving(self):
    # copy params, since they'll get consumed during training
    params_copy = copy.deepcopy(self.params.as_dict())

    # `train_model` should create an archive
    serialization_dir = self.TEST_DIR / 'archive_test'
    model = train_model(self.params, serialization_dir=serialization_dir)

    archive_path = serialization_dir / "model.tar.gz"

    # load from the archive
    archive = load_archive(archive_path)
    model2 = archive.model

    # check that model weights are the same
    keys = set(model.state_dict().keys())
    keys2 = set(model2.state_dict().keys())
    assert keys == keys2

    for key in keys:
        assert torch.equal(model.state_dict()[key], model2.state_dict()[key])

    # check that vocabularies are the same
    vocab = model.vocab
    vocab2 = model2.vocab
    assert vocab._token_to_index == vocab2._token_to_index  # pylint: disable=protected-access
    assert vocab._index_to_token == vocab2._index_to_token  # pylint: disable=protected-access

    # check that params are the same
    params2 = archive.config
    assert params2.as_dict() == params_copy
def test_extra_files(self):
    serialization_dir = self.TEST_DIR / 'serialization'

    # Train a model
    train_model(self.params, serialization_dir=serialization_dir)

    # Archive model, and also archive the training data
    files_to_archive = {"train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')}
    archive_model(serialization_dir=serialization_dir, files_to_archive=files_to_archive)

    archive = load_archive(serialization_dir / 'model.tar.gz')
    params = archive.config

    # The param in the data should have been replaced with a temporary path
    # (which we don't know, but we know what it ends with).
    assert params.get('train_data_path').endswith('/fta/train_data_path')

    # The validation data path should be the same though.
    assert params.get('validation_data_path') == str(
        self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
def test_train_model(self): params = lambda: Params({ "model": { "type": "simple_tagger", "text_field_embedder": { "tokens": { "type": "embedding", "embedding_dim": 5 } }, "encoder": { "type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2 } }, "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, "iterator": {"type": "basic", "batch_size": 2}, "trainer": { "num_epochs": 2, "optimizer": "adam" } }) train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model')) # It's OK if serialization dir exists but is empty: serialization_dir2 = os.path.join(self.TEST_DIR, 'empty_directory') assert not os.path.exists(serialization_dir2) os.makedirs(serialization_dir2) train_model(params(), serialization_dir=serialization_dir2) # It's not OK if serialization dir exists and has junk in it non-empty: serialization_dir3 = os.path.join(self.TEST_DIR, 'non_empty_directory') assert not os.path.exists(serialization_dir3) os.makedirs(serialization_dir3) with open(os.path.join(serialization_dir3, 'README.md'), 'w') as f: f.write("TEST") with pytest.raises(ConfigurationError): train_model(params(), serialization_dir=serialization_dir3) # It's also not OK if serialization dir is a real serialization dir: with pytest.raises(ConfigurationError): train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}
        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

# In practice you'd probably do this from the command line:
#   $ allennlp train tutorials/tagger/experiment.jsonnet -s /tmp/serialization_dir
#
if __name__ == "__main__":
    params = Params.from_file('tutorials/tagger/experiment.jsonnet')
    serialization_dir = tempfile.mkdtemp()
    model = train_model(params, serialization_dir)

    # Make predictions
    predictor = SentenceTaggerPredictor(model, dataset_reader=PosDatasetReader())
    tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
    print(tag_logits)
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

    shutil.rmtree(serialization_dir)