Example #1
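This test trains a model, archives it together with its training data via the files_to_archive argument, and checks that the archived config rewrites train_data_path to a temporary fta/ location while leaving validation_data_path untouched.

These examples are test methods taken from a larger AllenNLP test module, so they rely on module-level imports and test-class attributes (TEST_DIR, FIXTURES_ROOT, self.params) that are not shown here. A minimal sketch of the imports the snippets appear to assume, based on the names they use and AllenNLP's 0.x-era module layout:

import copy
import filecmp
import json
import os
import pathlib
import re
import shutil
import tarfile

import pytest
import torch

from allennlp.commands.train import train_model
from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.models.archival import archive_model, load_archive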
    def test_extra_files(self):

        serialization_dir = self.TEST_DIR / 'serialization'

        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Archive model, and also archive the training data
        files_to_archive = {
            "train_data_path":
            str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
        }
        archive_model(serialization_dir=serialization_dir,
                      files_to_archive=files_to_archive)

        archive = load_archive(serialization_dir / 'model.tar.gz')
        params = archive.config

        # The train_data_path param should have been replaced with a temporary
        # path (we don't know exactly what, but we know what it ends with).
        assert params.get('train_data_path').endswith('/fta/train_data_path')

        # The validation data path should be the same though.
        assert params.get('validation_data_path') == str(
            self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
Example #2
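This test asks the trainer for cuda_device equal to torch.cuda.device_count(), which is always one past the highest valid GPU index, and asserts that train_model fails with a ConfigurationError directing the user to the trainer.cuda_device=-1 override.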
    def test_error_is_thrown_when_cuda_device_is_not_available(self):
        params = Params({
                "model": {
                        "type": "simple_tagger",
                        "text_field_embedder": {
                                "tokens": {
                                        "type": "embedding",
                                        "embedding_dim": 5
                                }
                        },
                        "encoder": {
                                "type": "lstm",
                                "input_size": 5,
                                "hidden_size": 7,
                                "num_layers": 2
                        }
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
                "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                        "num_epochs": 2,
                        "cuda_device": torch.cuda.device_count(),
                        "optimizer": "adam"
                }
        })

        with pytest.raises(ConfigurationError,
                           match="Experiment specified a GPU but none is available;"
                                 " if you want to run on CPU use the override"
                                 " 'trainer.cuda_device=-1' in the json config file."):
            train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))
Example #3
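This test trains with a test_data_path and evaluate_on_test set to True, using what appears to be a lazy dataset-reader fixture (type lazy-test), to verify that training plus end-of-training evaluation on a held-out test set runs end to end.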
    def test_train_with_test_set(self):
        params = Params({
                "model": {
                        "type": "simple_tagger",
                        "text_field_embedder": {
                                "tokens": {
                                        "type": "embedding",
                                        "embedding_dim": 5
                                }
                        },
                        "encoder": {
                                "type": "lstm",
                                "input_size": 5,
                                "hidden_size": 7,
                                "num_layers": 2
                        }
                },
                "dataset_reader": {"type": "lazy-test"},
                "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "test_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "evaluate_on_test": True,
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                        "num_epochs": 2,
                        "optimizer": "adam"
                }
        })

        train_model(params, serialization_dir=os.path.join(self.TEST_DIR, 'lazy_test_set'))
Example #4
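This test trains the same config under several no_grad regex lists and checks that exactly the parameters whose names match one of the regexes are frozen; it then confirms that freezing every parameter makes training fail.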
    def test_train_nograd_regex(self):
        params_get = lambda: Params({
                "model": {
                        "type": "simple_tagger",
                        "text_field_embedder": {
                                "tokens": {
                                        "type": "embedding",
                                        "embedding_dim": 5
                                }
                        },
                        "encoder": {
                                "type": "lstm",
                                "input_size": 5,
                                "hidden_size": 7,
                                "num_layers": 2
                        }
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                        "num_epochs": 2,
                        "optimizer": "adam"
                }
        })
        serialization_dir = os.path.join(self.TEST_DIR, 'test_train_nograd')
        regex_lists = [[],
                       [".*text_field_embedder.*"],
                       [".*text_field_embedder.*", ".*encoder.*"]]
        for regex_list in regex_lists:
            params = params_get()
            params["trainer"]["no_grad"] = regex_list
            shutil.rmtree(serialization_dir, ignore_errors=True)
            model = train_model(params, serialization_dir=serialization_dir)
            # Parameters whose names match one of the regexes should be frozen
            # (requires_grad == False); all others should stay trainable.
            for name, parameter in model.named_parameters():
                if any(re.search(regex, name) for regex in regex_list):
                    assert not parameter.requires_grad
                else:
                    assert parameter.requires_grad
        # If every parameter has requires_grad=False, training should error out.
        params = params_get()
        params["trainer"]["no_grad"] = [".*"]
        shutil.rmtree(serialization_dir, ignore_errors=True)
        with pytest.raises(Exception):
            train_model(params, serialization_dir=serialization_dir)
Example #5
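This test round-trips a model through model.tar.gz: it trains, reloads via load_archive, and asserts that the weights, the vocabulary mappings, and the config all come back identical.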
    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / 'archive_test'
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        # check that model weights are the same
        keys = set(model.state_dict().keys())
        keys2 = set(model2.state_dict().keys())

        assert keys == keys2

        for key in keys:
            assert torch.equal(model.state_dict()[key],
                               model2.state_dict()[key])

        # check that vocabularies are the same
        vocab = model.vocab
        vocab2 = model2.vocab

        assert vocab._token_to_index == vocab2._token_to_index  # pylint: disable=protected-access
        assert vocab._index_to_token == vocab2._index_to_token  # pylint: disable=protected-access

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy
Example #6
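This test exercises how train_model treats its serialization directory: a fresh or empty directory is accepted, while a non-empty directory, including one left over from a previous run, raises a ConfigurationError.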
    def test_train_model(self):
        params = lambda: Params({
                "model": {
                        "type": "simple_tagger",
                        "text_field_embedder": {
                                "tokens": {
                                        "type": "embedding",
                                        "embedding_dim": 5
                                }
                        },
                        "encoder": {
                                "type": "lstm",
                                "input_size": 5,
                                "hidden_size": 7,
                                "num_layers": 2
                        }
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                        "num_epochs": 2,
                        "optimizer": "adam"
                }
        })

        train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))

        # It's OK if serialization dir exists but is empty:
        serialization_dir2 = os.path.join(self.TEST_DIR, 'empty_directory')
        assert not os.path.exists(serialization_dir2)
        os.makedirs(serialization_dir2)
        train_model(params(), serialization_dir=serialization_dir2)

        # It's not OK if the serialization dir exists and is non-empty:
        serialization_dir3 = os.path.join(self.TEST_DIR, 'non_empty_directory')
        assert not os.path.exists(serialization_dir3)
        os.makedirs(serialization_dir3)
        with open(os.path.join(serialization_dir3, 'README.md'), 'w') as f:
            f.write("TEST")

        with pytest.raises(ConfigurationError):
            train_model(params(), serialization_dir=serialization_dir3)

        # It's also not OK if serialization dir is a real serialization dir:
        with pytest.raises(ConfigurationError):
            train_model(params(), serialization_dir=os.path.join(self.TEST_DIR, 'test_train_model'))
Example #7
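This test trains an ELMo character token embedder fixture, unpacks the resulting archive, and verifies that files_to_archive.json maps flattened config keys to the original file paths and that the archived copies under fta/ are byte-for-byte identical to the originals.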
    def test_file_archiving(self):
        # This happens to be a good place to test auxiliary file archiving.
        # Train the model
        params = Params.from_file(self.FIXTURES_ROOT / 'elmo' / 'config' /
                                  'characters_token_embedder.json')
        serialization_dir = os.path.join(self.TEST_DIR, 'serialization')
        train_model(params, serialization_dir)

        # Inspect the archive
        archive_file = os.path.join(serialization_dir, 'model.tar.gz')
        unarchive_dir = os.path.join(self.TEST_DIR, 'unarchive')
        with tarfile.open(archive_file, 'r:gz') as archive:
            archive.extractall(unarchive_dir)

        # It should contain `files_to_archive.json`
        fta_file = os.path.join(unarchive_dir, 'files_to_archive.json')
        assert os.path.exists(fta_file)

        # Which should properly contain { flattened_key -> original_filename }
        with open(fta_file) as fta:
            files_to_archive = json.load(fta)

        assert files_to_archive == {
            'model.text_field_embedder.token_embedders.elmo.options_file':
            str(
                pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo' /
                'options.json'),
            'model.text_field_embedder.token_embedders.elmo.weight_file':
            str(
                pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo' /
                'lm_weights.hdf5'),
        }

        # Check that the unarchived contents of those files match the original contents.
        for key, original_filename in files_to_archive.items():
            new_filename = os.path.join(unarchive_dir, "fta", key)
            assert filecmp.cmp(original_filename, new_filename)