Example #1
0
    def test_train_model(self):
        """Run ``train_model`` end to end on a tiny ``simple_tagger`` config.

        There are no explicit assertions: the test passes as long as training
        into ``self.TEST_DIR`` completes without raising.
        """
        fixture_path = 'tests/fixtures/data/sequence_tagging.tsv'

        # The embedding dimension and the LSTM input size are both 5.
        tagger_config = {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {"type": "embedding", "embedding_dim": 5},
            },
            "stacked_encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2,
            },
        }
        experiment = Params({
            "model": tagger_config,
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": fixture_path,
            "validation_data_path": fixture_path,
            "iterator": {"type": "basic", "batch_size": 2},
            "trainer": {"num_epochs": 2, "optimizer": "adam"},
        })

        train_model(experiment, serialization_dir=self.TEST_DIR)
Example #2
0
def train_model_from_args(args: argparse.Namespace):
    """Train a model from parsed command-line arguments inside an MLflow run.

    The configuration is read from ``args.param_path`` with ``args.overrides``
    applied.  Its flattened form, together with the raw arguments under the
    ``"args"`` key, is logged to MLflow before ``train_model`` is invoked.
    Unless ``args.dry_run`` is set, the serialization directory is logged as
    MLflow artifacts afterwards, even when training raises.
    """
    params = Params.from_file(args.param_path, args.overrides)

    # Merge the CLI arguments into the flattened config before logging.
    loggable = params.as_flat_dict()
    loggable["args"] = vars(args)
    mlflow_params = flatten_dict_for_mlflow_log(loggable)

    with mlflow.start_run():
        mlflow.log_params(mlflow_params)

        output_dir = get_serialization_dir(args)

        try:
            training_kwargs = dict(
                params=params,
                serialization_dir=output_dir,
                file_friendly_logging=args.file_friendly_logging,
                recover=args.recover,
                force=args.force,
                node_rank=args.node_rank,
                include_package=args.include_package,
                dry_run=args.dry_run,
            )
            train_model(**training_kwargs)
        finally:
            # Runs on success and on failure alike.
            if not args.dry_run:
                mlflow.log_artifacts(output_dir)
Example #3
0
    def test_train_with_test_set(self):
        """Train with the ``lazy-test`` reader and ``evaluate_on_test`` enabled.

        The same fixture file is used for the train, validation and test
        splits.  The test passes if ``train_model`` finishes without raising.
        """
        embedder_config = {
            "token_embedders": {
                "tokens": {"type": "embedding", "embedding_dim": 5},
            },
        }
        encoder_config = {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2,
        }
        experiment = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": embedder_config,
                "encoder": encoder_config,
            },
            "dataset_reader": {"type": "lazy-test"},
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "test_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "evaluate_on_test": True,
            "iterator": {"type": "basic", "batch_size": 2},
            "trainer": {"num_epochs": 2, "optimizer": "adam"},
        })

        output_dir = os.path.join(self.TEST_DIR, 'lazy_test_set')
        train_model(experiment, serialization_dir=output_dir)
Example #4
0
        def train_func(config, reporter):
            """Run a single hyperparameter-search trial.

            Concrete values are sampled from the search space described by
            ``config`` and written back into ``config`` as strings.  The
            Jsonnet snippet ``parameter_file_snippet`` is then evaluated with
            ``config`` as its external variables, the resulting parameters are
            trained into the ``"trial"`` directory, and ``reporter`` is called
            with ``done=True``.

            Args:
                config: Mapping describing the search space; mutated in place
                    with the sampled values.
                reporter: Callable invoked as ``reporter(done=True)`` once
                    training has finished.
            """
            # Debug output only: use ``.get`` so that an unset variable is
            # logged as ``None`` instead of aborting the trial with KeyError.
            logger.debug(
                f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")

            # ``or ()`` also covers an attribute that exists but is None.
            for package_name in getattr(args, "include_package", ()) or ():
                import_submodules(package_name)

            search_space = HyperparameterSearch(**config)
            sample = search_space.sample()
            # Sampled values are stringified before being handed to Jsonnet as
            # external variables.
            for name, value in sample.items():
                config[name] = str(value)

            params_dict = json.loads(
                _jsonnet.evaluate_snippet("config",
                                          parameter_file_snippet,
                                          tla_codes={},
                                          ext_vars=config))
            if args.num_gpus == 0:
                logger.warning("No GPU specified, using CPU.")
                params_dict["trainer"]["cuda_device"] = -1

            if args.cpus_per_trial > 0:
                torch.set_num_threads(args.cpus_per_trial)

            params = Params(params_dict)

            logger.debug(f"AllenNLP Configuration: {params.as_dict()}")

            train_model(params=params, serialization_dir="trial")

            reporter(done=True)
Example #5
0
    def test_train_with_test_set(self):
        """Train with the ``lazy-test`` reader and ``evaluate_on_test`` enabled.

        The same fixture file is used for the train, validation and test
        splits.  The test passes if ``train_model`` finishes without raising.
        """
        embedder_config = {
            "token_embedders": {
                "tokens": {"type": "embedding", "embedding_dim": 5},
            },
        }
        encoder_config = {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2,
        }
        experiment = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": embedder_config,
                "encoder": encoder_config,
            },
            "dataset_reader": {"type": "lazy-test"},
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "test_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "evaluate_on_test": True,
            "iterator": {"type": "basic", "batch_size": 2},
            "trainer": {"num_epochs": 2, "optimizer": "adam"},
        })

        output_dir = os.path.join(self.TEST_DIR, 'lazy_test_set')
        train_model(experiment, serialization_dir=output_dir)
Example #6
0
def cmd_train(
    source: pathlib.Path,
    destination: pathlib.Path,
    can_overwrite: bool,
):
    """Train the AllenNLP model configured under ``source``.

    Output goes to ``destination`` when it is truthy; otherwise a folder is
    created through ``ct.create_folder_time`` with a name prefix derived from
    the last component of ``source``.  The working directory is switched to
    ``source`` before the trainer settings are loaded.

    ``can_overwrite`` is accepted but not used by this function.
    """
    import os

    # =========================
    # Prepare the output folder
    # =========================
    source_name = pathlib.Path(source).name
    # TODO: check if it's empty (when an explicit destination is given)
    dest_folder_root: pathlib.Path = (
        destination
        if destination
        else ct.create_folder_time(f"ml_{source_name}_", to_make=True)
    )

    # NOTE(review): after this chdir a relative output folder would resolve
    # against ``source`` -- confirm callers pass absolute paths.
    os.chdir(source)

    # Imported only after the chdir, and only when the command is run.
    import allennlp.common.params as allp
    import allennlp.common.util as allu
    import allennlp.commands.train as allct
    allu.import_submodules("depccg.models.my_allennlp")

    trainer_params = allp.Params.from_file(core.FILES["trainer_settings"])
    allct.train_model(params=trainer_params,
                      serialization_dir=dest_folder_root)


# === END ===
Example #7
0
    def test_train_model(self):
        """Train the ``constant`` model with a ``no_op`` trainer and reload it.

        After training, the archive is loaded and checked for the expected
        ``"class"`` output and vocabulary size.
        """
        def build_params():
            # A fresh Params object on every call.
            return Params({
                "model": {"type": "constant"},
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "data_loader": {"batch_size": 2},
                "trainer": {"type": "no_op"},
            })

        output_dir = self.TEST_DIR / "serialization_directory"
        train_model(build_params(), serialization_dir=output_dir)

        reloaded = load_archive(str(output_dir / "model.tar.gz")).model
        prediction = reloaded.forward(torch.tensor([1, 2, 3]))
        assert prediction["class"] == torch.tensor(98)
        assert reloaded.vocab.get_vocab_size() == 9
    def test_file_archiving(self):
        """Check that auxiliary files are bundled into ``model.tar.gz``.

        Trains from the ELMo characters-token-embedder fixture config, unpacks
        the resulting archive and verifies both the ``files_to_archive.json``
        manifest and the contents of the archived copies.
        """
        # This happens to be a good place to test auxiliary file archiving.

        # Train the model
        config = Params.from_file('tests/fixtures/elmo/config/characters_token_embedder.json')
        output_dir = os.path.join(self.TEST_DIR, 'serialization')
        train_model(config, output_dir)

        # Unpack the archive next to it
        extracted_dir = os.path.join(self.TEST_DIR, 'unarchive')
        with tarfile.open(os.path.join(output_dir, 'model.tar.gz'), 'r:gz') as tar:
            tar.extractall(extracted_dir)

        # The manifest `files_to_archive.json` must be present ...
        manifest_path = os.path.join(extracted_dir, 'files_to_archive.json')
        assert os.path.exists(manifest_path)

        # ... and map { hocon_key -> original_filename }
        with open(manifest_path) as manifest:
            files_to_archive = json.load(manifest)

        expected_manifest = {
            'model.text_field_embedder.elmo.options_file': 'tests/fixtures/elmo/options.json',
            'model.text_field_embedder.elmo.weight_file': 'tests/fixtures/elmo/lm_weights.hdf5',
        }
        assert files_to_archive == expected_manifest

        # Each archived copy (stored under "fta/<key>") must match its source file.
        for key, source_file in files_to_archive.items():
            archived_copy = os.path.join(extracted_dir, "fta", key)
            assert filecmp.cmp(source_file, archived_copy)
    def test_file_archiving(self):
        """Check that auxiliary files are bundled into ``model.tar.gz``.

        Trains from the ELMo characters-token-embedder fixture config, unpacks
        the resulting archive and verifies both the ``files_to_archive.json``
        manifest and the contents of the archived copies.
        """
        # This happens to be a good place to test auxiliary file archiving.
        # Train the model
        config_path = self.FIXTURES_ROOT / 'elmo' / 'config' / 'characters_token_embedder.json'
        config = Params.from_file(config_path)
        output_dir = os.path.join(self.TEST_DIR, 'serialization')
        train_model(config, output_dir)

        # Unpack the archive next to it
        extracted_dir = os.path.join(self.TEST_DIR, 'unarchive')
        with tarfile.open(os.path.join(output_dir, 'model.tar.gz'), 'r:gz') as tar:
            tar.extractall(extracted_dir)

        # The manifest `files_to_archive.json` must be present ...
        manifest_path = os.path.join(extracted_dir, 'files_to_archive.json')
        assert os.path.exists(manifest_path)

        # ... and map { flattened_key -> original_filename }
        with open(manifest_path) as manifest:
            files_to_archive = json.load(manifest)

        elmo_fixtures = pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo'
        key_prefix = 'model.text_field_embedder.token_embedders.elmo.'
        expected_manifest = {
            key_prefix + 'options_file': str(elmo_fixtures / 'options.json'),
            key_prefix + 'weight_file': str(elmo_fixtures / 'lm_weights.hdf5'),
        }
        assert files_to_archive == expected_manifest

        # Each archived copy (stored under "fta/<key>") must match its source file.
        for key, source_file in files_to_archive.items():
            archived_copy = os.path.join(extracted_dir, "fta", key)
            assert filecmp.cmp(source_file, archived_copy)
Example #10
0
 def test_fine_tune_nograd_regex(self):
     """Check that ``trainer.no_grad`` regexes freeze parameters when fine-tuning.

     For each regex list the archived model is fine-tuned and every parameter
     is checked: names matching a regex must have ``requires_grad`` False,
     all others must keep the value recorded from the loaded model.  Finally
     a ``no_grad`` value of ``["*"]`` must make ``train_model`` raise.
     """
     original_model = load_archive(self.model_archive).model
     name_parameters_original = dict(original_model.named_parameters())
     regex_lists = [
         [],
         [".*attend_feedforward.*", ".*token_embedder.*"],
         [".*compare_feedforward.*"],
     ]
     for regex_list in regex_lists:
         params = Params.from_file(self.config_file)
         params["trainer"]["no_grad"] = regex_list
         shutil.rmtree(self.serialization_dir, ignore_errors=True)
         tuned_model = train_model(
             model=original_model, params=params, serialization_dir=self.serialization_dir
         )
         # If a regex matches, the parameter must have requires_grad False.
         # If no regex matches, the parameter must have the same
         # requires_grad as in the originally loaded model.
         for name, parameter in tuned_model.named_parameters():
             if any(re.search(regex, name) for regex in regex_list):
                 assert not parameter.requires_grad
             else:
                 assert parameter.requires_grad == name_parameters_original[name].requires_grad
     # If all parameters have requires_grad=False, then error.
     # The set-up stays outside ``pytest.raises`` so that a failure while
     # loading the config cannot be mistaken for the expected error.
     # NOTE(review): "*" on its own is not a valid regular expression, so the
     # exception may come from regex compilation -- confirm whether ".*" was
     # intended.
     params = Params.from_file(self.config_file)
     params["trainer"]["no_grad"] = ["*"]
     shutil.rmtree(self.serialization_dir, ignore_errors=True)
     with pytest.raises(Exception):
         train_model(
             model=original_model, params=params, serialization_dir=self.serialization_dir
         )
Example #11
0
    def test_error_is_throw_when_cuda_device_is_not_available(self):
        """Expect ``ConfigurationError`` when an unavailable CUDA device is requested.

        ``cuda_device`` is set to ``torch.cuda.device_count()``, which is one
        past the highest valid device index.
        """
        params = Params({
                "model": {
                        "type": "simple_tagger",
                        "text_field_embedder": {
                                "tokens": {
                                        "type": "embedding",
                                        "embedding_dim": 5
                                }
                        },
                        "encoder": {
                                "type": "lstm",
                                "input_size": 5,
                                "hidden_size": 7,
                                "num_layers": 2
                        }
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
                "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                        "num_epochs": 2,
                        "cuda_device": torch.cuda.device_count(),
                        "optimizer": "adam"
                }
        })

        # ``match`` checks the exception text with ``re.search``.  The old
        # ``message`` keyword only set the failure message shown when nothing
        # was raised, and it was removed in pytest 5.  Only the stable prefix
        # is matched, so no regex metacharacters need escaping.
        with pytest.raises(ConfigurationError,
                           match="Experiment specified a GPU but none is available"):
            train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))
Example #12
0
    def test_dry_run_makes_vocab(self):
        """A dry run must write the vocabulary files into ``TEST_DIR/vocabulary``."""
        vocab_dir = self.TEST_DIR / "vocabulary"

        train_model(self.params, self.TEST_DIR, dry_run=True)

        # Exactly these files are expected in the vocabulary directory.
        expected_files = {
            ".lock",
            "labels.txt",
            "non_padded_namespaces.txt",
            "tokens.txt",
        }
        assert set(os.listdir(vocab_dir)) == expected_files

        # Token file contents, compared in sorted order.
        with open(vocab_dir / "tokens.txt") as token_file:
            tokens = sorted(line.strip() for line in token_file)
        assert tokens == [
            ".", "@@UNKNOWN@@", "animals", "are", "birds", "cats", "dogs",
            "snakes"
        ]

        # Label file contents, compared in sorted order.
        with open(vocab_dir / "labels.txt") as label_file:
            labels = sorted(line.strip() for line in label_file)
        assert labels == ["N", "V"]
Example #13
0
    def test_error_is_throw_when_cuda_device_is_not_available(self):
        """Expect ``ConfigurationError`` when an unavailable CUDA device is requested.

        ``cuda_device`` is set to ``torch.cuda.device_count()``, which is one
        past the highest valid device index.
        """
        fixture_path = 'tests/fixtures/data/sequence_tagging.tsv'
        tagger_config = {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {"type": "embedding", "embedding_dim": 5},
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2,
            },
        }
        experiment = Params({
            "model": tagger_config,
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": fixture_path,
            "validation_data_path": fixture_path,
            "iterator": {"type": "basic", "batch_size": 2},
            "trainer": {
                "num_epochs": 2,
                "cuda_device": torch.cuda.device_count(),
                "optimizer": "adam",
            },
        })

        output_dir = os.path.join(self.TEST_DIR, 'test_train_model')
        with pytest.raises(ConfigurationError, match="Experiment specified"):
            train_model(experiment, serialization_dir=output_dir)
Example #14
0
    def test_train_model_distributed(self):
        """Train with a ``distributed`` section listing CUDA devices 0 and 1.

        Verifies that per-worker log files and the model archive appear in the
        output directory and that the archive can be loaded.
        """
        def build_params():
            # A fresh Params object on every call.
            tagger_config = {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            }
            return Params(
                {
                    "model": tagger_config,
                    "dataset_reader": {"type": "sequence_tagging"},
                    "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
                    "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
                    "iterator": {"type": "basic", "batch_size": 2},
                    "trainer": {"num_epochs": 2, "optimizer": "adam"},
                    "distributed": {"cuda_devices": [0, 1]},
                }
            )

        out_dir = os.path.join(self.TEST_DIR, "test_distributed_train")
        train_model(build_params(), serialization_dir=out_dir)

        # Logs specific to distributed training, plus the archive, must be
        # where we expect them.
        produced_files = os.listdir(out_dir)
        expected_files = (
            "stderr_worker0.log",
            "stdout_worker0.log",
            "stderr_worker1.log",
            "stdout_worker1.log",
            "model.tar.gz",
        )
        for expected_file in expected_files:
            assert expected_file in produced_files

        # Check we can load the serialized model
        assert load_archive(out_dir).model
Example #15
0
    def test_dry_run_without_extension(self):
        """A dry run with a ``from_files`` vocabulary must not add new tokens.

        A vocabulary is saved to disk, referenced from ``self.params``, and a
        dry run is made into a second directory.  The ``tokens.txt`` written
        there must hold only the unknown token and the two pre-seeded tokens.
        """
        existing_serialization_dir = self.TEST_DIR / "existing"
        extended_serialization_dir = self.TEST_DIR / "extended"
        existing_vocab_path = existing_serialization_dir / "vocabulary"
        extended_vocab_path = extended_serialization_dir / "vocabulary"

        # if extend is False, its users responsibility to make sure that dataset instances
        # will be indexible by provided vocabulary. At least @@UNKNOWN@@ should be present in
        # namespace for which there could be OOV entries seen in dataset during indexing.
        # For `tokens` ns, new words will be seen but `tokens` has @@UNKNOWN@@ token.
        # but for 'labels' ns, there is no @@UNKNOWN@@ so required to add 'N', 'V' upfront.
        seed_entries = [
            ("some_weird_token_1", "tokens"),
            ("some_weird_token_2", "tokens"),
            ("N", "labels"),
            ("V", "labels"),
        ]
        vocab = Vocabulary()
        for token, namespace in seed_entries:
            vocab.add_token_to_namespace(token, namespace=namespace)
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        # Point the experiment at the saved vocabulary.
        self.params["vocabulary"] = {
            "type": "from_files",
            "directory": str(existing_vocab_path),
        }
        train_model(self.params, extended_serialization_dir, dry_run=True)

        with open(extended_vocab_path / "tokens.txt") as token_file:
            tokens = [line.strip() for line in token_file]

        assert tokens[0] == "@@UNKNOWN@@"
        assert tokens[1] == "some_weird_token_1"
        assert tokens[2] == "some_weird_token_2"
        assert len(tokens) == 3
Example #16
0
    def test_train_model(self):
        """Train a ``simple_tagger`` with the ``lazy-test`` dataset reader.

        There are no explicit assertions: the test passes as long as
        ``train_model`` completes without raising.
        """
        tagger_config = {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {"type": "embedding", "embedding_dim": 5},
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2,
            },
        }
        experiment = Params({
            "model": tagger_config,
            "dataset_reader": {"type": "lazy-test"},
            "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
            "iterator": {"type": "basic", "batch_size": 2},
            "trainer": {"num_epochs": 2, "optimizer": "adam"},
        })

        output_dir = os.path.join(self.TEST_DIR, 'train_lazy_model')
        train_model(experiment, serialization_dir=output_dir)
Example #17
0
    def test_train_saves_all_keys_in_config(self):
        """The config file written by training must equal the input config.

        The input includes the three seed keys in addition to the usual model,
        data and trainer sections.
        """
        tagger_config = {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        }
        params = Params(
            {
                "model": tagger_config,
                "pytorch_seed": 42,
                "numpy_seed": 42,
                "random_seed": 42,
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {"num_epochs": 2, "optimizer": "adam"},
            }
        )

        output_dir = os.path.join(self.TEST_DIR, "test_train_model")
        # Snapshot first, as train_model will pop all the values.
        expected_config = params.as_ordered_dict()
        train_model(params, serialization_dir=output_dir)

        with open(os.path.join(output_dir, CONFIG_NAME), "r") as config_file:
            saved_config = OrderedDict(json.load(config_file))
        assert expected_config == saved_config
Example #18
0
    def test_extra_files(self):
        """Archiving with ``files_to_archive`` must rewrite the matching config path.

        After training, the model is re-archived together with the training
        data file.  On loading, ``train_data_path`` must point at an existing
        path ending in ``/fta/train_data_path``, while
        ``validation_data_path`` must still be the fixture path.
        """
        output_dir = self.TEST_DIR / 'serialization'
        fixture_path = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')

        # Train a model
        train_model(self.params, serialization_dir=output_dir)

        # Archive model, and also archive the training data
        archive_model(serialization_dir=output_dir,
                      files_to_archive={"train_data_path": fixture_path})

        loaded_config = load_archive(output_dir / 'model.tar.gz').config

        # The param in the data should have been replaced with a temporary path
        # (which we don't know, but we know what it ends with).
        assert loaded_config.get('train_data_path').endswith('/fta/train_data_path')

        # The temporary path should be accessible even after the load_archive
        # function returns.
        assert os.path.exists(loaded_config.get('train_data_path'))

        # The validation data path should be the same though.
        assert loaded_config.get('validation_data_path') == str(
            self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
Example #19
0
    def test_train_with_test_set(self):
        """Train a ``simple_tagger`` with ``evaluate_on_test`` enabled.

        The same fixture file is used for the train, validation and test
        splits.  The test passes if ``train_model`` finishes without raising.
        """
        fixture_path = 'tests/fixtures/data/sequence_tagging.tsv'
        tagger_config = {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {"type": "embedding", "embedding_dim": 5},
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2,
            },
        }
        experiment = Params({
            "model": tagger_config,
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": fixture_path,
            "test_data_path": fixture_path,
            "validation_data_path": fixture_path,
            "evaluate_on_test": True,
            "iterator": {"type": "basic", "batch_size": 2},
            "trainer": {"num_epochs": 2, "optimizer": "adam"},
        })

        output_dir = os.path.join(self.TEST_DIR, 'train_with_test_set')
        train_model(experiment, serialization_dir=output_dir)
Example #20
0
    def test_invalid_include_in_archive(self):
        """Listing ``CONFIG_NAME`` in ``include_in_archive`` must be rejected.

        ``train_model`` is expected to raise ``ConfigurationError`` whose
        message contains "are saved names and cannot be used".
        """
        self.params["include_in_archive"] = [CONFIG_NAME]

        serialization_dir = self.TEST_DIR / "serialization"

        with pytest.raises(ConfigurationError) as exc:
            train_model(self.params, serialization_dir=serialization_dir)
        # Checked after the ``with`` block: a statement placed inside it,
        # following the call that raises, would never be executed.
        assert "are saved names and cannot be used" in str(exc.value)
Example #21
0
 def test_dry_run_doesnt_overwrite_vocab(self):
     """A dry run must raise ``ConfigurationError`` if the vocabulary directory is not empty."""
     vocab_dir = self.TEST_DIR / "vocabulary"
     os.mkdir(vocab_dir)
     # Seed the vocabulary directory with an unrelated file.
     marker_path = vocab_dir / "test.txt"
     with open(marker_path, "a+") as marker_file:
         marker_file.write("test")
     # A non-empty vocab dir must make the dry run fail.
     with pytest.raises(ConfigurationError):
         train_model(self.params, self.TEST_DIR, dry_run=True)
Example #22
0
    def test_fine_tune_does_not_expand_vocab_by_default(self):
        """Fine-tune an archived model on data containing a new token.

        There are no explicit assertions: the test passes as long as
        ``train_model`` completes without raising.
        """
        config = Params.from_file(self.config_file)
        # snli2 has a new token in it
        new_data_path = self.FIXTURES_ROOT / "data" / "snli2.jsonl"
        config["train_data_path"] = str(new_data_path)

        pretrained = load_archive(self.model_archive).model

        # By default, no vocab expansion.
        train_model(config, self.serialization_dir, model=pretrained)
Example #23
0
    def test_archive_model_uses_archive_path(self):
        """``archive_model`` must write the archive to an explicit ``archive_path``."""
        output_dir = self.TEST_DIR / 'serialization'
        custom_archive = output_dir / "new_path.tar.gz"

        # Train a model
        train_model(self.params, serialization_dir=output_dir)

        # Archive it under the new path, then load it back from that path.
        archive_model(serialization_dir=output_dir,
                      archive_path=custom_archive)
        loaded = load_archive(output_dir / 'new_path.tar.gz')
        assert loaded
Example #24
0
 def test_fine_tune_extended_model_is_loadable(self):
     """A model fine-tuned with ``extend_vocab=True`` must still load from its archive."""
     config = Params.from_file(self.config_file)
     # snli2 has a new token (seahorse) in it
     new_data_path = self.FIXTURES_ROOT / "data" / "snli2.jsonl"
     config["train_data_path"] = str(new_data_path)

     pretrained = load_archive(self.model_archive).model
     shutil.rmtree(self.serialization_dir, ignore_errors=True)
     train_model(
         config.duplicate(), self.serialization_dir, model=pretrained, extend_vocab=True
     )

     # self.serialization_dir = str(self.TEST_DIR / 'fine_tune')
     archive_path = self.TEST_DIR / "fine_tune" / "model.tar.gz"
     load_archive(str(archive_path))
Example #25
0
def train_fixture_gpu(config_file: str, serialization_dir: str) -> None:
    """Train a fixture on CUDA device 0 and copy the results next to the CPU fixture.

    Args:
        config_file: Path of the experiment configuration to train.
        serialization_dir: Existing directory that receives ``best_gpu.th``
            and ``model_gpu.tar.gz``.
    """
    params = Params.from_file(config_file)
    params["trainer"]["cuda_device"] = 0

    # Train into a fresh private scratch directory instead of the shared
    # system temp root.  Stale files from other runs cannot interfere, and
    # the scratch space is removed once the results have been copied.
    with tempfile.TemporaryDirectory() as tempdir:
        train_model(params, tempdir)

        # now copy back the weights and the archived model
        shutil.copy(os.path.join(tempdir, "best.th"), os.path.join(serialization_dir, "best_gpu.th"))
        shutil.copy(os.path.join(tempdir, "model.tar.gz"), os.path.join(serialization_dir, "model_gpu.tar.gz"))
Example #26
0
 def test_train_nograd_regex(self):
     """Check that ``trainer.no_grad`` regexes freeze the matching parameters.

     For each regex list a fresh model is trained and every parameter is
     checked: names matching a regex must have ``requires_grad`` False, all
     others True.  Finally a ``no_grad`` value of ``["*"]`` must make
     ``train_model`` raise.
     """
     def fresh_params():
         # A new Params object for each training run.
         tagger_config = {
             "type": "simple_tagger",
             "text_field_embedder": {
                 "token_embedders": {
                     "tokens": {"type": "embedding", "embedding_dim": 5},
                 },
             },
             "encoder": {
                 "type": "lstm",
                 "input_size": 5,
                 "hidden_size": 7,
                 "num_layers": 2,
             },
         }
         return Params({
             "model": tagger_config,
             "dataset_reader": {"type": "sequence_tagging"},
             "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
             "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
             "iterator": {"type": "basic", "batch_size": 2},
             "trainer": {"num_epochs": 2, "optimizer": "adam"},
         })

     output_dir = os.path.join(self.TEST_DIR, 'test_train_nograd')
     regex_lists = [
         [],
         [".*text_field_embedder.*"],
         [".*text_field_embedder.*", ".*encoder.*"],
     ]
     for regex_list in regex_lists:
         config = fresh_params()
         config["trainer"]["no_grad"] = regex_list
         shutil.rmtree(output_dir, ignore_errors=True)
         trained = train_model(config, serialization_dir=output_dir)
         # A parameter is frozen exactly when one of the regexes matches its name.
         for name, parameter in trained.named_parameters():
             matched = any(re.search(regex, name) for regex in regex_list)
             if not matched:
                 assert parameter.requires_grad
             else:
                 assert not parameter.requires_grad

     # If all parameters have requires_grad=False, then error.
     config = fresh_params()
     config["trainer"]["no_grad"] = ["*"]
     shutil.rmtree(output_dir, ignore_errors=True)
     with pytest.raises(Exception):
         train_model(config, serialization_dir=output_dir)
    def test_trainer_can_run_from_params(self):
        """Run the ``callback`` trainer from a ``Params`` config and inspect ``metrics.json``.

        After training, the metrics file in ``self.TEST_DIR`` must contain the
        best-validation entries with the expected Python types.
        """
        # pylint: disable=bad-continuation
        from allennlp.commands.train import train_model

        fixture_path = self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'
        trainer_config = {
            "type": "callback",
            "optimizer": {"type": "sgd", "lr": 0.01, "momentum": 0.9},
            "num_epochs": 2,
            "callbacks": [
                "generate_training_batches",
                "train_supervised",
                "checkpoint",
                "track_metrics",
                "validate",
                {"type": "log_to_tensorboard", "log_batch_size_period": 10},
            ],
        }
        tagger_config = {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {"type": "embedding", "embedding_dim": 5},
                },
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2,
            },
        }
        experiment = Params({
            "trainer": trainer_config,
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": str(fixture_path),
            "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
            "model": tagger_config,
            "iterator": {"type": "basic", "batch_size": 2},
        })

        train_model(experiment, self.TEST_DIR)
        with open(self.TEST_DIR / 'metrics.json') as metrics_file:
            metrics = json.load(metrics_file)

        # Each metric must be present and have the listed type.
        expected_types = [
            ('best_validation_loss', float),
            ('best_validation_accuracy', float),
            ('best_validation_accuracy3', float),
            ('best_epoch', int),
        ]
        for metric_name, metric_type in expected_types:
            assert metric_name in metrics
            assert isinstance(metrics[metric_name], metric_type)
Example #28
0
    def test_force_cpu(self):
        """With ``cuda_device`` set to -1, training must be observed on the CPU only.

        The ``training_device_logger`` batch callback is enabled and the
        module-level ``_seen_training_devices`` collection is inspected after
        training: it must hold a single device whose type is ``"cpu"``.
        """
        import copy

        global _seen_training_devices

        # Work on a copy so the shared defaults are left untouched.
        config = copy.deepcopy(self.DEFAULT_PARAMS)
        config["trainer"]["batch_callbacks"] = ["training_device_logger"]
        config["trainer"]["cuda_device"] = -1

        _seen_training_devices.clear()
        output_dir = os.path.join(self.TEST_DIR, "test_force_cpu")
        train_model(config, serialization_dir=output_dir)

        assert len(_seen_training_devices) == 1
        only_device = next(iter(_seen_training_devices))
        assert only_device.type == "cpu"
Example #29
0
def train_fixture_gpu(config_prefix: str) -> None:
    """Train the fixture at ``config_prefix`` on CUDA device 0 and store its GPU artifacts.

    Loads ``<config_prefix>experiment.json``, overrides ``trainer.cuda_device``
    with 0, trains into a private scratch directory, and then copies the
    resulting ``best.th`` and ``model.tar.gz`` into
    ``<config_prefix>serialization`` as ``best_gpu.th`` and ``model_gpu.tar.gz``.

    Args:
        config_prefix: String prepended verbatim to ``experiment.json`` and
            ``serialization`` (no path separator is inserted).
    """
    config_file = config_prefix + 'experiment.json'
    serialization_dir = config_prefix + 'serialization'
    params = Params.from_file(config_file)
    params["trainer"]["cuda_device"] = 0

    # Train into a fresh directory of our own.  ``tempfile.gettempdir()`` only
    # names the shared system temp directory, so training there would mix the
    # run's output with unrelated files and never clean it up.
    with tempfile.TemporaryDirectory() as tempdir:
        train_model(params, tempdir)

        # now copy back the weights and the archived model
        shutil.copy(os.path.join(tempdir, "best.th"), os.path.join(serialization_dir, "best_gpu.th"))
        shutil.copy(os.path.join(tempdir, "model.tar.gz"), os.path.join(serialization_dir, "model_gpu.tar.gz"))
Example #30
0
 def test_train_nograd_regex(self):
     """Check that ``trainer.no_grad`` regexes switch off ``requires_grad`` for matching parameters only."""
     def build_params():
         # Each training run below gets its own newly built Params object.
         return Params({
             "model": {
                 "type": "simple_tagger",
                 "text_field_embedder": {
                     "token_embedders": {
                         "tokens": {
                             "type": "embedding",
                             "embedding_dim": 5
                         }
                     }
                 },
                 "encoder": {
                     "type": "lstm",
                     "input_size": 5,
                     "hidden_size": 7,
                     "num_layers": 2
                 }
             },
             "dataset_reader": {"type": "sequence_tagging"},
             "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
             "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
             "iterator": {"type": "basic", "batch_size": 2},
             "trainer": {
                 "num_epochs": 2,
                 "optimizer": "adam"
             }
         })

     output_dir = os.path.join(self.TEST_DIR, 'test_train_nograd')
     pattern_sets = (
         [],
         [".*text_field_embedder.*"],
         [".*text_field_embedder.*", ".*encoder.*"],
     )
     for patterns in pattern_sets:
         config = build_params()
         config["trainer"]["no_grad"] = patterns
         # Every run starts from a clean serialization directory.
         shutil.rmtree(output_dir, ignore_errors=True)
         trained = train_model(config, serialization_dir=output_dir)
         # A parameter keeps its gradient exactly when no pattern matches its name.
         for name, parameter in trained.named_parameters():
             trainable = not any(re.search(pattern, name) for pattern in patterns)
             assert bool(parameter.requires_grad) == trainable

     # Training is expected to fail when no parameter is left to optimise.
     # NOTE(review): "*" is not a valid pattern for Python's `re` module
     # ("nothing to repeat"), so this may be failing on the pattern itself
     # rather than on all parameters being frozen -- confirm whether ".*"
     # was intended.
     config = build_params()
     config["trainer"]["no_grad"] = ["*"]
     shutil.rmtree(output_dir, ignore_errors=True)
     with pytest.raises(Exception):
         train_model(config, serialization_dir=output_dir)
Example #31
0
    def test_include_in_archive(self):
        """Files matching the ``include_in_archive`` glob must be present in ``model.tar.gz``."""
        self.params["include_in_archive"] = ["metrics_epoch_*.json"]
        serialization_dir = self.TEST_DIR / "serialization"

        # Training produces the archive that is inspected below.
        train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"
        with tempfile.TemporaryDirectory() as tempdir:
            with tarfile.open(archive_path, "r:gz") as archive:
                archive.extractall(tempdir)

            def was_extracted(filename):
                return os.path.isfile(os.path.join(tempdir, filename))

            # The per-epoch metrics match the glob; the plain metrics file does not.
            assert was_extracted("metrics_epoch_0.json")
            assert was_extracted("metrics_epoch_1.json")
            assert not was_extracted("metrics.json")
Example #32
0
def main():
    """Train a QA-SRL model variant, halving the batch size after GPU memory errors.

    Command-line arguments:
        models_root: root directory of the model variants.
        models_branch: path (relative to ``models_root``) of the config file;
            also used as the sub-directory name for serialization.
        initial_batch_size: batch size for the first attempt.

    If ``<serialization dir>/current`` does not exist a new run is started from
    the config file; otherwise the run is recovered from ``current/config.json``
    and its stored batch size is reused.  When training raises a
    ``RuntimeError`` whose message matches one of the known GPU failure
    markers, the ``current`` directory is renamed after the failed batch size,
    the batch size is halved and training is retried.

    Raises:
        RuntimeError: for any non-retryable error, or when a retryable error
            occurs at a batch size that cannot be halved any further.
    """
    parser = argparse.ArgumentParser(description = "Train QA-SRL model variants.")
    parser.add_argument("models_root", metavar = "path", type = str, help = "Path to root of model variants")
    parser.add_argument("models_branch", metavar = "path", type = str, help = "Path to config file")
    parser.add_argument("initial_batch_size", metavar = "n", type = int, help = "Batch size to start with before cutting as necessary")
    args = parser.parse_args()

    # Error-message fragments that indicate the batch did not fit on the GPU.
    retryable_markers = (
        'out of memory',
        "CUDNN_STATUS_NOT_SUPPORTED",
        "an illegal memory access was encountered",
    )

    serialization_directory = "/gscratch/cse/julianjm/qasrl-models/" + args.models_branch
    current_directory = serialization_directory + "/current"
    current_batch_size = args.initial_batch_size
    done = False
    while not done:
        try:
            if not os.path.exists(current_directory):
                print("Starting new training round", flush = True)
                if not os.path.exists(serialization_directory):
                    os.makedirs(serialization_directory)
                config_path = args.models_root + "/" + args.models_branch
                params = Params.from_file(config_path, "")
                params["trainer"]["num_serialized_models_to_keep"] = 2
                params["trainer"]["should_log_parameter_statistics"] = False
                params["iterator"]["biggest_batch_first"] = True
                params["iterator"]["batch_size"] = current_batch_size
                torch.cuda.empty_cache()
                train_model(params,
                            current_directory,
                            file_friendly_logging = True)
                done = True
            else:
                print("Recovering from a previously preempted run", flush = True)
                config_path = current_directory + "/config.json"
                params = Params.from_file(config_path, "")
                current_batch_size = params["iterator"]["batch_size"]
                torch.cuda.empty_cache()
                train_model(params,
                            current_directory,
                            file_friendly_logging = True,
                            recover = True)
                done = True
        except RuntimeError as e:
            message = str(e)
            if not any(marker in message for marker in retryable_markers):
                raise

            # Integer halving: true division would turn the batch size into a
            # float and let it shrink below 1 without ever terminating.
            failed_batch_size = int(current_batch_size)
            reduced_batch_size = failed_batch_size // 2
            if reduced_batch_size < 1:
                # Nothing smaller left to try; surface the original error.
                raise

            print(message, flush = True)
            print('Reducing batch size to %s and retrying' % reduced_batch_size, flush = True)
            # Keep the failed attempt around under the batch size that failed.
            subprocess.call(["mv", current_directory, serialization_directory + "/" + str(failed_batch_size)])
            current_batch_size = reduced_batch_size
            torch.cuda.empty_cache()
    print("Finished training.", flush = True)
Example #33
0
    def test_detect_gpu(self):
        """Train with default device settings and expect ``cuda`` when a GPU is visible, else ``cpu``."""
        from copy import deepcopy

        global _seen_training_devices

        config = deepcopy(self.DEFAULT_PARAMS)
        config["trainer"]["callbacks"] = ["training_device_logger"]

        # Start from an empty record so only this run's devices are counted.
        _seen_training_devices.clear()
        output_dir = os.path.join(self.TEST_DIR, "test_detect_gpu")
        train_model(config, serialization_dir=output_dir)

        assert len(_seen_training_devices) == 1
        device = next(iter(_seen_training_devices))
        expected_type = "cpu" if torch.cuda.device_count() == 0 else "cuda"
        assert device.type == expected_type
def run_trial(trial_params: Params,
              train_params: Params,
              serialization_dir: PathLike,
              seed: int,
              recover: Optional[bool] = False,
              force: Optional[bool] = False,
              train_only: Optional[bool] = False,
              re_calculate: Optional[str] = None) -> Optional[AttentionCorrelationTrial]:
    """Train (if needed) one seeded model and compute its attention-correlation measures.

    Both parameter objects are deep-copied first, so the caller's objects are
    not modified by this function.

    Args:
        trial_params: Parameters used to build the ``AttentionCorrelationTrial``.
        train_params: Training configuration; must contain ``test_data_path``.
        serialization_dir: Parent directory; the trial lives in ``seed_<seed>`` below it.
        seed: Value written to ``random_seed``, ``numpy_seed`` and ``pytorch_seed``.
        recover: Forwarded to ``train_model``; also forces the training step to run.
        force: Forwarded to ``train_model``; also forces the training step to run.
        train_only: When true, stop after the training step.
        re_calculate: ``"both"`` forces recomputation of feature importance and
            correlation; ``"correlation"`` forces recomputation of correlation only.

    Returns:
        The evaluated trial, or ``None`` when ``train_only`` is set.
    """
    _trial_params = deepcopy(trial_params)
    _train_params = deepcopy(train_params)

    test_data_path = _train_params['test_data_path']

    trial_dir = os.path.join(serialization_dir, f"seed_{seed}")

    # Train when explicitly requested, or when no trained model exists yet.
    should_train = any(
        [force, not utils.model_already_trained(trial_dir), recover])

    if should_train:
        _train_params['random_seed'] = seed
        _train_params['numpy_seed'] = seed
        _train_params['pytorch_seed'] = seed

        train_model(params=_train_params,
                    serialization_dir=trial_dir,
                    recover=recover,
                    force=force)

    if train_only:
        logger.info(
            "'train-only' was specified. Finishing without calculating measures."
        )
        return None

    attention_trial = AttentionCorrelationTrial.from_params(
        params=_trial_params,
        seed=seed,
        serialization_dir=trial_dir,
        test_data_path=test_data_path)

    # Recomputing feature importance implies recomputing the correlation too.
    recalc_fi = re_calculate == "both"
    recalc_corr = recalc_fi or re_calculate == "correlation"

    attention_trial.calculate_feature_importance(force=recalc_fi)
    attention_trial.calculate_correlation(force=recalc_corr)

    return attention_trial
Example #35
0
    def test_train_model(self):
        """Run ``train_model`` against new, empty, non-empty and already-used serialization dirs."""
        def make_params():
            # Every training call below gets its own newly built Params object.
            return Params({
                u"model": {
                    u"type": u"simple_tagger",
                    u"text_field_embedder": {
                        u"tokens": {
                            u"type": u"embedding",
                            u"embedding_dim": 5
                        }
                    },
                    u"encoder": {
                        u"type": u"lstm",
                        u"input_size": 5,
                        u"hidden_size": 7,
                        u"num_layers": 2
                    }
                },
                u"dataset_reader": {u"type": u"sequence_tagging"},
                u"train_data_path": SEQUENCE_TAGGING_DATA_PATH,
                u"validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
                u"iterator": {u"type": u"basic", u"batch_size": 2},
                u"trainer": {u"num_epochs": 2, u"optimizer": u"adam"}
            })

        trained_dir = os.path.join(self.TEST_DIR, u'test_train_model')
        train_model(make_params(), serialization_dir=trained_dir)

        # An existing but empty serialization dir is accepted.
        empty_dir = os.path.join(self.TEST_DIR, u'empty_directory')
        assert not os.path.exists(empty_dir)
        os.makedirs(empty_dir)
        train_model(make_params(), serialization_dir=empty_dir)

        # An existing serialization dir that already contains a file is rejected.
        occupied_dir = os.path.join(self.TEST_DIR, u'non_empty_directory')
        assert not os.path.exists(occupied_dir)
        os.makedirs(occupied_dir)
        with open(os.path.join(occupied_dir, u'README.md'), u'w') as readme:
            readme.write(u"TEST")

        with pytest.raises(ConfigurationError):
            train_model(make_params(), serialization_dir=occupied_dir)

        # A directory that already holds the first training run is rejected too.
        with pytest.raises(ConfigurationError):
            train_model(make_params(), serialization_dir=trained_dir)
Example #36
0
    def test_dry_run_with_extension(self):
        """Dry-run with an ``extend`` vocabulary and check the vocabulary files written out."""
        existing_serialization_dir = self.TEST_DIR / "existing"
        extended_serialization_dir = self.TEST_DIR / "extended"
        existing_vocab_path = existing_serialization_dir / "vocabulary"
        extended_vocab_path = extended_serialization_dir / "vocabulary"

        # Save a small vocabulary on disk for the dry run to extend.
        seed_vocab = Vocabulary()
        for token in ("some_weird_token_1", "some_weird_token_2"):
            seed_vocab.add_token_to_namespace(token, namespace="tokens")
        os.makedirs(existing_serialization_dir, exist_ok=True)
        seed_vocab.save_to_files(existing_vocab_path)

        self.params["vocabulary"] = {}
        self.params["vocabulary"]["type"] = "extend"
        self.params["vocabulary"]["directory"] = str(existing_vocab_path)
        self.params["vocabulary"]["min_count"] = {"tokens": 3}
        train_model(self.params, extended_serialization_dir, dry_run=True)

        def read_stripped_lines(path):
            with open(path) as handle:
                return [line.strip() for line in handle]

        expected_files = {
            ".lock",
            "labels.txt",
            "non_padded_namespaces.txt",
            "tokens.txt",
        }
        assert set(os.listdir(extended_vocab_path)) == expected_files

        tokens = read_stripped_lines(extended_vocab_path / "tokens.txt")

        # The pre-existing tokens come first, right after the unknown token.
        assert tokens[0] == "@@UNKNOWN@@"
        assert tokens[1] == "some_weird_token_1"
        assert tokens[2] == "some_weird_token_2"

        assert sorted(tokens) == [
            ".",
            "@@UNKNOWN@@",
            "animals",
            "are",
            "some_weird_token_1",
            "some_weird_token_2",
        ]

        labels = read_stripped_lines(extended_vocab_path / "labels.txt")
        assert sorted(labels) == ["N", "V"]
Example #37
0
    def test_archiving(self):
        """A model reloaded from the archive must match the trained one in weights, vocab and config."""
        # Snapshot the config up front: the params get consumed during training.
        expected_config = copy.deepcopy(self.params.as_dict())

        # `train_model` is expected to leave a model.tar.gz in the serialization dir.
        serialization_dir = self.TEST_DIR / 'archive_test'
        trained = train_model(self.params, serialization_dir=serialization_dir)

        archive = load_archive(serialization_dir / "model.tar.gz")
        restored = archive.model

        # Same parameter names, and identical tensors under each name.
        trained_state = trained.state_dict()
        restored_state = restored.state_dict()
        assert set(trained_state.keys()) == set(restored_state.keys())
        for key in trained_state:
            assert torch.equal(trained_state[key], restored_state[key])

        # Same vocabulary mappings in both directions.
        trained_vocab = trained.vocab
        restored_vocab = restored.vocab
        assert trained_vocab._token_to_index == restored_vocab._token_to_index  # pylint: disable=protected-access
        assert trained_vocab._index_to_token == restored_vocab._index_to_token  # pylint: disable=protected-access

        # Same configuration as before training.
        assert archive.config.as_dict() == expected_config
Example #38
0
    def test_extra_files(self):
        """Archiving with ``files_to_archive`` must rewrite only the archived path in the config."""
        serialization_dir = self.TEST_DIR / 'serialization'
        data_path = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')

        train_model(self.params, serialization_dir=serialization_dir)

        # Re-archive the model, this time bundling the training data as well.
        archive_model(serialization_dir=serialization_dir,
                      files_to_archive={"train_data_path": data_path})

        config = load_archive(serialization_dir / 'model.tar.gz').config

        # The archived path is replaced by a temporary one; only its suffix is predictable.
        assert config.get('train_data_path').endswith('/fta/train_data_path')

        # The validation path was not archived, so it is expected to be unchanged.
        assert config.get('validation_data_path') == data_path
Example #39
0
    def test_train_model(self):
        """Run ``train_model`` against new, empty, non-empty and already-used serialization dirs."""
        def make_params():
            # Every training call below gets its own newly built Params object.
            return Params({
                "model": {
                    "type": "simple_tagger",
                    "text_field_embedder": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    },
                    "encoder": {
                        "type": "lstm",
                        "input_size": 5,
                        "hidden_size": 7,
                        "num_layers": 2
                    }
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                    "num_epochs": 2,
                    "optimizer": "adam"
                }
            })

        trained_dir = os.path.join(self.TEST_DIR, 'test_train_model')
        train_model(make_params(), serialization_dir=trained_dir)

        # An existing but empty serialization dir is accepted.
        empty_dir = os.path.join(self.TEST_DIR, 'empty_directory')
        assert not os.path.exists(empty_dir)
        os.makedirs(empty_dir)
        train_model(make_params(), serialization_dir=empty_dir)

        # An existing serialization dir that already contains a file is rejected.
        occupied_dir = os.path.join(self.TEST_DIR, 'non_empty_directory')
        assert not os.path.exists(occupied_dir)
        os.makedirs(occupied_dir)
        with open(os.path.join(occupied_dir, 'README.md'), 'w') as readme:
            readme.write("TEST")

        with pytest.raises(ConfigurationError):
            train_model(make_params(), serialization_dir=occupied_dir)

        # A directory that already holds the first training run is rejected too.
        with pytest.raises(ConfigurationError):
            train_model(make_params(), serialization_dir=trained_dir)
Example #40
0
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}

        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accuracy metric's current value under the ``"accuracy"`` key.

        ``reset`` is passed through unchanged to ``self.accuracy.get_metric``.
        """
        accuracy_value = self.accuracy.get_metric(reset)
        return {"accuracy": accuracy_value}

# In practice you'd probably do this from the command line:
#   $ allennlp train tutorials/tagger/experiment.jsonnet -s /tmp/serialization_dir
#
if __name__ == "__main__":
    # Train the tutorial tagger into a scratch directory, then tag one example
    # sentence with the trained model and print the result.
    params = Params.from_file('tutorials/tagger/experiment.jsonnet')
    serialization_dir = tempfile.mkdtemp()
    try:
        model = train_model(params, serialization_dir)

        # Make predictions
        predictor = SentenceTaggerPredictor(model, dataset_reader=PosDatasetReader())
        tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
        print(tag_logits)
        # Pick the highest-scoring tag index along the last axis and map it back to its label.
        tag_ids = np.argmax(tag_logits, axis=-1)
        print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])
    finally:
        # Remove the scratch directory even when training or prediction fails.
        shutil.rmtree(serialization_dir)