Code Example #1
File: __init__.py Project: Jordan-Sauchuk/allennlp
 def from_params(self, params: Params) -> PytorchSeq2SeqWrapper:
     if not params.pop_bool('batch_first', True):
         raise ConfigurationError("Our encoder semantics assumes batch is always first!")
     if self._module_class in self.PYTORCH_MODELS:
         params['batch_first'] = True
     module = self._module_class(**params.as_dict())
     return PytorchSeq2SeqWrapper(module)
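
Most of the snippets on this page share one pattern: configuration keys are consumed from a Params object with pop-style accessors, and whatever is left is forwarded as constructor kwargs via as_dict(). A minimal sketch of that round-trip, assuming allennlp's Params class as used above:

    from allennlp.common import Params

    params = Params({'type': 'lstm', 'input_size': 5, 'batch_first': True})
    encoder_type = params.pop('type')                   # consumes 'type' -> 'lstm'
    batch_first = params.pop_bool('batch_first', True)  # typed pop with a default
    remaining = params.as_dict()                        # -> {'input_size': 5}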
Code Example #2
 def from_params(self, params: Params) -> PytorchSeq2VecWrapper:
     if not params.pop("batch_first", True):
         raise ConfigurationError(
             "Our encoder semantics assumes batch is always first!")
     if self._module_class in self.PYTORCH_MODELS:
         params["batch_first"] = True
     module = self._module_class(**params.as_dict(infer_type_and_cast=True))
     return PytorchSeq2VecWrapper(module)
Code Example #3
 def from_params(self, params: Params, **extras) -> PytorchSeq2SeqWrapper:
     if not params.pop_bool("batch_first", True):
         raise ConfigurationError("Our encoder semantics assumes batch is always first!")
     if self._module_class in self.PYTORCH_MODELS:
         params["batch_first"] = True
     stateful = params.pop_bool("stateful", False)
     module = self._module_class(**params.as_dict(infer_type_and_cast=True))
     return PytorchSeq2SeqWrapper(module, stateful=stateful)
Code Example #4
 def from_params(self, params: Params) -> PytorchSeq2SeqWrapper:
     if not params.pop_bool('batch_first', True):
         raise ConfigurationError(
             "Our encoder semantics assumes batch is always first!")
     if self._module_class in self.PYTORCH_MODELS:
         params['batch_first'] = True
     module = self._module_class(**params.as_dict())
     return PytorchSeq2SeqWrapper(module, self._stateful)
Code Example #5
File: data_iterator.py Project: panyang/allennlp
    def from_params(cls, params: Params):
        from allennlp.experiments.registry import Registry
        # TODO(Mark): The adaptive iterator will need a bit of work here,
        # to retrieve the scaling function etc.

        iterator_type = params.pop_choice("type",
                                          Registry.list_data_iterators())
        return Registry.get_data_iterator(iterator_type)(
            **params.as_dict())  # type: ignore
Code Example #6
def write_config_to_file(filepath: str, config: Params) -> None:
    """Writes the config to a json file, specifed by filepath
    """
    with io.open(filepath, 'w', encoding='utf-8', errors='ignore') as fd:
        json.dump(fp=fd,
                  obj=config.as_dict(quiet=True),
                  ensure_ascii=False,
                  indent=4,
                  sort_keys=True)
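
A hedged usage sketch for write_config_to_file, adding the io, json, and Params imports its body relies on:

    import io
    import json

    from allennlp.common import Params

    config = Params({'trainer': {'num_epochs': 2, 'optimizer': 'adam'}})
    write_config_to_file('/tmp/config.json', config)  # writes sorted, indented JSON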
Code Example #7
 def from_params(cls, model_parameters: List[torch.nn.Parameter],
                 params: Params):
     if isinstance(params, str):
         optimizer = params
         params = Params({})
     else:
         optimizer = params.pop_choice("type", Optimizer.list_available())
     return Optimizer.by_name(optimizer)(model_parameters,
                                         **params.as_dict())  # type: ignore
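
This from_params accepts either a bare optimizer name or a full config whose "type" key selects the class; the two calls below should be equivalent up to the extra kwargs, assuming allennlp's Optimizer registry and the signature shown in this snippet:

    import torch
    from allennlp.common import Params
    from allennlp.training.optimizers import Optimizer

    model = torch.nn.Linear(4, 2)
    parameters = [p for p in model.parameters() if p.requires_grad]

    opt_default = Optimizer.from_params(parameters, 'adam')  # string shorthand, all defaults
    opt_tuned = Optimizer.from_params(parameters, Params({'type': 'adam', 'lr': 1e-3}))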
Code Example #8
def get_predictor(predictor_name: str, params: Params, archive: str):
    cuda_device = params["trainer"]["cuda_device"]

    check_for_gpu(cuda_device)
    archive = load_archive(archive,
                           cuda_device=cuda_device,
                           overrides=json.dumps(params.as_dict()))

    predictor = Predictor.from_archive(archive, predictor_name)
    return predictor
Code Example #9
File: embedding_test.py Project: vin-ivar/allennlp
 def test_embedding_vocab_extension_raises_error_for_incorrect_vocab(self):
     # When the vocab namespace of the extension vocab is smaller than the
     # embedding matrix, it should raise a ConfigurationError.
     vocab = Vocabulary({"tokens": {"word1": 1, "word2": 1}})
     embedding_params = Params({
         "vocab_namespace": "tokens",
         "embedding_dim": 10
     })
     embedder = Embedding.from_vocab_or_file(
         vocab, **embedding_params.as_dict(quiet=True))
     with pytest.raises(ConfigurationError):
         embedder.extend_vocab(Vocabulary(), "tokens")
Code Example #10
File: embedding_test.py Project: vin-ivar/allennlp
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
            "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
            "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
            "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
            "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0]),
        }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
            "pretrained_file": str(self.FIXTURES_ROOT / "embeddings/multi-file-archive.zip"),
            "embedding_dim": 5,
        })
        with pytest.raises(
                ValueError,
                match="The archive .*/embeddings/multi-file-archive.zip contains multiple files, "
                      "so you must select one of the files inside "
                      "providing a uri of the type: "
                      "\\(path_or_url_to_archive\\)#path_inside_archive\\.",
        ):
            Embedding.from_vocab_or_file(vocab, **params.as_dict(quiet=True))

        for ext in [".zip", ".tar.gz"]:
            archive_path = str(
                self.FIXTURES_ROOT / "embeddings/multi-file-archive") + ext
            file_uri = format_embeddings_file_uri(
                archive_path, "folder/fake_embeddings.5d.txt")
            params = Params({"pretrained_file": file_uri, "embedding_dim": 5})
            embeddings = Embedding.from_vocab_or_file(
                vocab, **params.as_dict(quiet=True)).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i],
                                   vec), "Problem with format " + archive_path
Code Example #11
    def from_params(cls, params: Params) -> 'RNNEncoder':
        module_type = params.pop('module').lower()
        if module_type == 'lstm':
            module_class = nn.LSTM
        elif module_type == 'gru':
            module_class = nn.GRU
        elif module_type == 'rnn':
            module_class = nn.RNN
        else:
            raise ConfigurationError("Unsupported module type: " + module_type)

        module = module_class(**params.as_dict())
        return RNNEncoder(module)
Code Example #12
File: __init__.py Project: mmazab/LifeQA
 def from_params(self, params: Params) -> PytorchSeq2VecWrapper:
     if not params.pop('batch_first', True):
         raise ConfigurationError(
             "Our encoder semantics assumes batch is always first!")
     if self._module_class in self.PYTORCH_MODELS:
         params['batch_first'] = True
     return_all_layers = params.pop('return_all_layers', False)
     return_all_hidden_states = params.pop('return_all_hidden_states',
                                           False)
     module = self._module_class(**params.as_dict())
     return PytorchSeq2VecWrapper(
         module,
         return_all_layers=return_all_layers,
         return_all_hidden_states=return_all_hidden_states)
Code Example #13
File: embedding_test.py Project: vin-ivar/allennlp
    def test_embedding_vocab_extension_works_with_pretrained_embedding_file(
            self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word1")
        vocab.add_token_to_namespace("word2")

        embeddings_filename = str(self.TEST_DIR / "embeddings2.gz")
        with gzip.open(embeddings_filename, "wb") as embeddings_file:
            embeddings_file.write("word3 0.5 0.3 -6.0\n".encode("utf-8"))
            embeddings_file.write("word4 1.0 2.3 -1.0\n".encode("utf-8"))
            embeddings_file.write("word2 0.1 0.4 -4.0\n".encode("utf-8"))
            embeddings_file.write("word1 1.0 2.3 -1.0\n".encode("utf-8"))

        embedding_params = Params({
            "vocab_namespace": "tokens",
            "embedding_dim": 3,
            "pretrained_file": embeddings_filename,
        })
        embedder = Embedding.from_vocab_or_file(
            vocab, **embedding_params.as_dict(quiet=True))

        # Change weight to simulate embedding training
        embedder.weight.data += 1
        assert torch.all(embedder.weight[2:, :] == torch.Tensor(
            [[2.0, 3.3, 0.0], [1.1, 1.4, -3.0]]))
        original_weight = embedder.weight

        assert tuple(original_weight.size()) == (
            4, 3)  # 4 because of padding and OOV

        vocab.add_token_to_namespace("word3")
        embedder.extend_vocab(
            vocab,
            extension_pretrained_file=embeddings_filename)  # default namespace
        extended_weight = embedder.weight

        # Make sure extension happened for the extra token in the extended vocab
        assert tuple(extended_weight.size()) == (5, 3)

        # Make sure extension doesn't change original trained weights.
        assert torch.all(original_weight[:4, :] == extended_weight[:4, :])

        # Make sure extended weight is taken from the embedding file.
        assert torch.all(
            extended_weight[4, :] == torch.Tensor([0.5, 0.3, -6.0]))
Code Example #14
File: embedding_test.py Project: vin-ivar/allennlp
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, "w") as fout:
            _ = fout.create_dataset("embedding",
                                    embeddings.shape,
                                    dtype="float32",
                                    data=embeddings)

        params = Params({
            "pretrained_file": embeddings_filename,
            "embedding_dim": 5
        })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_vocab_or_file(vocab,
                                             **params.as_dict(quiet=True))
Code Example #15
File: embedding_test.py Project: vin-ivar/allennlp
 def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(
         self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, "wb") as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
     params = Params({
         "pretrained_file": embeddings_filename,
         "embedding_dim": 3
     })
     embedding_layer = Embedding.from_vocab_or_file(
         vocab, **params.as_dict(quiet=True))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word2")]
     assert not numpy.allclose(word_vector.numpy(),
                               numpy.array([0.0, 0.0, 0.0]))
Code Example #16
File: embedding_test.py Project: vin-ivar/allennlp
    def test_read_hdf5_format_file(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
        with h5py.File(embeddings_filename, "w") as fout:
            _ = fout.create_dataset("embedding",
                                    embeddings.shape,
                                    dtype="float32",
                                    data=embeddings)

        params = Params({
            "pretrained_file": embeddings_filename,
            "embedding_dim": 5
        })
        embedding_layer = Embedding.from_vocab_or_file(
            vocab, **params.as_dict(quiet=True))
        assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
Code Example #17
File: embedding_test.py Project: vin-ivar/allennlp
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("the")
        vocab.add_token_to_namespace("a")
        params = Params({
            "pretrained_file": str(self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz"),
            "embedding_dim": 300,
            "projection_dim": 20,
        })
        embedding_layer = Embedding.from_vocab_or_file(
            vocab, **params.as_dict(quiet=True))
        input_tensor = torch.LongTensor([[3, 2, 1, 0]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = torch.LongTensor([[[3, 2, 1, 0]]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
Code Example #18
File: embedding_test.py Project: vin-ivar/allennlp
    def test_embedding_vocab_extension_with_default_namespace(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word1")
        vocab.add_token_to_namespace("word2")
        embedding_params = Params({
            "vocab_namespace": "tokens",
            "embedding_dim": 10
        })
        embedder = Embedding.from_vocab_or_file(
            vocab, **embedding_params.as_dict(quiet=True))
        original_weight = embedder.weight

        assert original_weight.shape[0] == 4

        extension_counter = {"tokens": {"word3": 1}}
        vocab._extend(extension_counter)

        embedder.extend_vocab(vocab)  # default namespace

        extended_weight = embedder.weight
        assert extended_weight.shape[0] == 5
        assert torch.all(extended_weight[:4, :] == original_weight[:4, :])
Code Example #19
File: __init__.py Project: sethah/allencv
 def from_params(self, params: Params):
     transform = self._transform_class(**params.as_dict())
     return ImageTransform(transform)
Code Example #20
class ArchivalTest(AllenNlpTestCase):
    def setUp(self):
        super().setUp()

        self.params = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                }
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path":
            str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
            "validation_data_path":
            str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
            "iterator": {
                "type": "basic",
                "batch_size": 2
            },
            "trainer": {
                "num_epochs": 2,
                "optimizer": "adam",
            }
        })

    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / 'archive_test'
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        assert_models_equal(model, model2)

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy

    def test_archive_model_uses_archive_path(self):

        serialization_dir = self.TEST_DIR / 'serialization'
        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)
        # Use a new path.
        archive_model(serialization_dir=serialization_dir,
                      archive_path=serialization_dir / "new_path.tar.gz")
        archive = load_archive(serialization_dir / 'new_path.tar.gz')
        assert archive

    def test_extra_files(self):

        serialization_dir = self.TEST_DIR / 'serialization'

        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Archive model, and also archive the training data
        files_to_archive = {
            "train_data_path":
            str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
        }
        archive_model(serialization_dir=serialization_dir,
                      files_to_archive=files_to_archive)

        archive = load_archive(serialization_dir / 'model.tar.gz')
        params = archive.config

        # The param in the data should have been replaced with a temporary path
        # (which we don't know, but we know what it ends with).
        assert params.get('train_data_path').endswith('/fta/train_data_path')

        # The temporary path should be accessible even after the load_archive
        # function returns.
        assert os.path.exists(params.get('train_data_path'))

        # The validation data path should be the same though.
        assert params.get('validation_data_path') == str(
            self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')

    def test_loading_serialization_directory(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / 'serialization'
        model = train_model(self.params, serialization_dir=serialization_dir)

        # load from the serialization directory itself
        archive = load_archive(serialization_dir)
        model2 = archive.model

        assert_models_equal(model, model2)

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy

    def test_loading_serialization_directory_with_extra_files(self):

        serialization_dir = self.TEST_DIR / 'serialization'

        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Archive model, and also archive the training data
        original_train_data_path = str(self.FIXTURES_ROOT / 'data' /
                                       'sequence_tagging.tsv')
        files_to_archive = {"train_data_path": original_train_data_path}
        archive_model(serialization_dir=serialization_dir,
                      files_to_archive=files_to_archive)

        archive = load_archive(serialization_dir)
        params = archive.config

        # We're loading from a directory, so retain the original path.
        assert params.get('train_data_path') == original_train_data_path
Code Example #21
File: archival_test.py Project: pyknife/allennlp
class ArchivalTest(AllenNlpTestCase):
    def setUp(self):
        super().setUp()

        self.params = Params({
                "model": {
                        "type": "simple_tagger",
                        "text_field_embedder": {
                                "tokens": {
                                        "type": "embedding",
                                        "embedding_dim": 5
                                }
                        },
                        "encoder": {
                                "type": "lstm",
                                "input_size": 5,
                                "hidden_size": 7,
                                "num_layers": 2
                        }
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
                "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": {
                        "num_epochs": 2,
                        "optimizer": "adam",
                }
        })

    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / 'archive_test'
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        # check that model weights are the same
        keys = set(model.state_dict().keys())
        keys2 = set(model2.state_dict().keys())

        assert keys == keys2

        for key in keys:
            assert torch.equal(model.state_dict()[key], model2.state_dict()[key])

        # check that vocabularies are the same
        vocab = model.vocab
        vocab2 = model2.vocab

        assert vocab._token_to_index == vocab2._token_to_index  # pylint: disable=protected-access
        assert vocab._index_to_token == vocab2._index_to_token  # pylint: disable=protected-access

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy

    def test_extra_files(self):

        serialization_dir = self.TEST_DIR / 'serialization'

        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Archive model, and also archive the training data
        files_to_archive = {"train_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')}
        archive_model(serialization_dir=serialization_dir, files_to_archive=files_to_archive)

        archive = load_archive(serialization_dir / 'model.tar.gz')
        params = archive.config

        # The param in the data should have been replaced with a temporary path
        # (which we don't know, but we know what it ends with).
        assert params.get('train_data_path').endswith('/fta/train_data_path')

        # The validation data path should be the same though.
        assert params.get('validation_data_path') == str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
Code Example #22
class ArchivalTest(AllenNlpTestCase):
    def setUp(self):
        super().setUp()

        self.params = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                }
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path":
            str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
            "validation_data_path":
            str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
            "iterator": {
                "type": "basic",
                "batch_size": 2
            },
            "trainer": {
                "num_epochs": 2,
                "optimizer": "adam",
            }
        })

    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / 'archive_test'
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        # check that model weights are the same
        keys = set(model.state_dict().keys())
        keys2 = set(model2.state_dict().keys())

        assert keys == keys2

        for key in keys:
            assert torch.equal(model.state_dict()[key],
                               model2.state_dict()[key])

        # check that vocabularies are the same
        vocab = model.vocab
        vocab2 = model2.vocab

        assert vocab._token_to_index == vocab2._token_to_index  # pylint: disable=protected-access
        assert vocab._index_to_token == vocab2._index_to_token  # pylint: disable=protected-access

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy

    def test_extra_files(self):

        serialization_dir = self.TEST_DIR / 'serialization'

        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Archive model, and also archive the training data
        files_to_archive = {
            "train_data_path":
            str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
        }
        archive_model(serialization_dir=serialization_dir,
                      files_to_archive=files_to_archive)

        archive = load_archive(serialization_dir / 'model.tar.gz')
        params = archive.config

        # The param in the data should have been replaced with a temporary path
        # (which we don't know, but we know what it ends with).
        assert params.get('train_data_path').endswith('/fta/train_data_path')

        # The temporary path should be accessible even after the load_archive
        # function returns.
        assert os.path.exists(params.get('train_data_path'))

        # The validation data path should be the same though.
        assert params.get('validation_data_path') == str(
            self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
Code Example #23
File: optimizers.py Project: wjn922/allennlp
    def from_params(cls, model_parameters: List, params: Params):  # type: ignore
        # pylint: disable=arguments-differ
        if isinstance(params, str):
            optimizer = params
            params = Params({})
        else:
            optimizer = params.pop_choice("type", Optimizer.list_available())

        # make the parameter groups if needed
        groups = params.pop("parameter_groups", None)
        if groups:
            # The input to the optimizer is a list of dicts.
            # Each dict contains a "parameter group" and group-specific options,
            # e.g., {'params': [list of parameters], 'lr': 1e-3, ...}
            # Any config option not specified in the additional options (e.g.
            # for the default group) is inherited from the top level config.
            # see: https://pytorch.org/docs/0.3.0/optim.html?#per-parameter-options
            #
            # groups contains something like:
            #"parameter_groups": [
            #       [["regex1", "regex2"], {"lr": 1e-3}],
            #       [["regex3"], {"lr": 1e-4}]
            #]
            #(note that the allennlp config files require double quotes ", and will
            # fail (sometimes silently) with single quotes ').

            # This is typed as Any since the dict values other than
            # the params key are passed to the Optimizer constructor and
            # can be any type it accepts.
            # In addition to any parameters that match group specific regex,
            # we also need a group for the remaining "default" group.
            # Those will be included in the last entry of parameter_groups.
            parameter_groups: Any = [{'params': []} for _ in range(len(groups) + 1)]
            # add the group specific kwargs
            for k in range(len(groups)): # pylint: disable=consider-using-enumerate
                parameter_groups[k].update(groups[k][1].as_dict())

            regex_use_counts: Dict[str, int] = {}
            parameter_group_names: List[set] = [set() for _ in range(len(groups) + 1)]
            for name, param in model_parameters:
                # Determine the group for this parameter.
                group_index = None
                for k, group_regexes in enumerate(groups):
                    for regex in group_regexes[0]:
                        if regex not in regex_use_counts:
                            regex_use_counts[regex] = 0
                        if re.search(regex, name):
                            if group_index is not None and group_index != k:
                                raise ValueError("{} was specified in two separate parameter groups".format(name))
                            group_index = k
                            regex_use_counts[regex] += 1

                if group_index is not None:
                    parameter_groups[group_index]['params'].append(param)
                    parameter_group_names[group_index].add(name)
                else:
                    # the default group
                    parameter_groups[-1]['params'].append(param)
                    parameter_group_names[-1].add(name)

            # log the parameter groups
            logger.info("Done constructing parameter groups.")
            for k in range(len(groups) + 1):
                group_options = {key: val for key, val in parameter_groups[k].items()
                                 if key != 'params'}
                logger.info("Group %s: %s, %s", k,
                            list(parameter_group_names[k]),
                            group_options)
            # check for unused regex
            for regex, count in regex_use_counts.items():
                if count == 0:
                    logger.warning("When constructing parameter groups, "
                                   " %s not match any parameter name", regex)

        else:
            parameter_groups = [param for name, param in model_parameters]

        # Log the number of parameters to optimize
        num_parameters = 0
        for parameter_group in parameter_groups:
            if isinstance(parameter_group, dict):
                num_parameters += sum(parameter.numel() for parameter in parameter_group["params"])
            else:
                num_parameters += parameter_group.numel()
        logger.info("Number of trainable parameters: %s", num_parameters)

        # By default we cast things that e.g. look like floats to floats before handing them
        # to the Optimizer constructor, but if you want to disable that behavior you could add a
        #       "infer_type_and_cast": false
        # key to your "trainer.optimizer" config.
        infer_type_and_cast = params.pop_bool("infer_type_and_cast", True)
        params_as_dict = params.as_dict(infer_type_and_cast=infer_type_and_cast)
        subclass = Optimizer.by_name(optimizer)

        # If the optimizer subclass has a from_params, use it.
        if hasattr(subclass, 'from_params'):
            return subclass.from_params(parameter_groups, params=params)
        else:
            return subclass(parameter_groups, **params_as_dict) # type: ignore
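
The parameter_groups machinery above ultimately produces the list-of-dicts structure that PyTorch optimizers accept natively, where options omitted from a group fall back to the top-level defaults. A minimal sketch with plain torch.optim:

    import torch

    model = torch.nn.Linear(4, 2)
    groups = [
        {'params': [model.weight], 'lr': 1e-3},  # group-specific learning rate
        {'params': [model.bias]},                # inherits the default lr below
    ]
    optimizer = torch.optim.Adam(groups, lr=1e-4)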
Code Example #24
    def from_params(  # type: ignore
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        local_rank: int = 0,
    ) -> "Trainer":

        from allennlp.training.trainer import Trainer
        from allennlp.training.trainer_pieces import TrainerPieces

        config = dict(as_flat_dict(params.as_dict()))
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)
        model = pieces.model
        iterator = pieces.iterator
        train_data = pieces.train_dataset
        validation_data = pieces.validation_dataset
        params = pieces.params
        validation_iterator = pieces.validation_iterator

        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)

        check_for_gpu(cuda_device)
        if cuda_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(cuda_device)

        parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters
            )
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if "checkpointer" in params:
            if (
                "keep_serialized_model_every_num_seconds" in params
                or "num_serialized_models_to_keep" in params
            ):
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods."
                )
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None
            )
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
            )
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        distributed = params.pop_bool("distributed", False)
        world_size = params.pop_int("world_size", 1)

        num_gradient_accumulation_steps = params.pop("num_gradient_accumulation_steps", 1)
        lang_mean_dir = params.pop("ft_lang_mean_dir", None)
        if lang_mean_dir:
            try:
                assert model._lang_means is not None
                lang_mean = get_lang_mean(lang_mean_dir)
                model.add_ft_lang_mean_to_lang_means(lang_mean)
            except (AttributeError, AssertionError):
                pass

        writer = None
        wandb_config = params.pop("wandb", None)
        if wandb_config is not None:
            writer = WandBWriter(config, model, wandb_config)

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
            distributed=distributed,
            local_rank=local_rank,
            world_size=world_size,
            num_gradient_accumulation_steps=num_gradient_accumulation_steps,
            writer=writer,
        )
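
Note the closing params.assert_empty(cls.__name__) call: because every recognized key is popped along the way, anything left over, such as a misspelled key, raises instead of being silently ignored. A small sketch of the pattern, assuming allennlp's Params:

    from allennlp.common import Params

    params = Params({'num_epochs': 20, 'patienc': 5})  # note the typo
    num_epochs = params.pop_int('num_epochs', 20)
    params.assert_empty('Trainer')  # raises ConfigurationError over the unconsumed 'patienc'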
Code Example #25
class ArchivalTest(AllenNlpTestCase):
    def setUp(self):
        super().setUp()

        self.params = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "validation_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "iterator": {
                "type": "basic",
                "batch_size": 2
            },
            "trainer": {
                "num_epochs": 2,
                "optimizer": "adam"
            },
        })

    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / "archive_test"
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        assert_models_equal(model, model2)

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy

    def test_archive_model_uses_archive_path(self):

        serialization_dir = self.TEST_DIR / "serialization"
        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)
        # Use a new path.
        archive_model(serialization_dir=serialization_dir,
                      archive_path=serialization_dir / "new_path.tar.gz")
        archive = load_archive(serialization_dir / "new_path.tar.gz")
        assert archive

    def test_loading_serialization_directory(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / "serialization"
        model = train_model(self.params, serialization_dir=serialization_dir)

        # load from the serialization directory itself
        archive = load_archive(serialization_dir)
        model2 = archive.model

        assert_models_equal(model, model2)

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy
Code Example #26
    def test_archiving(self):
        super(ArchivalTest, self).setUp()

        params = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "tokens": {
                        "type": "embedding",
                        "embedding_dim": 5
                    }
                },
                "stacked_encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                }
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
            "validation_data_path": 'tests/fixtures/data/sequence_tagging.tsv',
            "iterator": {
                "type": "basic",
                "batch_size": 2
            },
            "trainer": {
                "num_epochs": 2,
                "optimizer": "adam",
            }
        })

        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(params.as_dict())

        # `train_model` should create an archive
        model = train_model(params, serialization_dir=self.TEST_DIR)

        archive_path = os.path.join(self.TEST_DIR, "model.tar.gz")

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        # check that model weights are the same
        keys = set(model.state_dict().keys())
        keys2 = set(model2.state_dict().keys())

        assert keys == keys2

        for key in keys:
            assert torch.equal(model.state_dict()[key],
                               model2.state_dict()[key])

        # check that vocabularies are the same
        vocab = model.vocab
        vocab2 = model2.vocab

        assert vocab._token_to_index == vocab2._token_to_index  # pylint: disable=protected-access
        assert vocab._index_to_token == vocab2._index_to_token  # pylint: disable=protected-access

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy
Code Example #27
File: optimizers.py Project: apmoore1/allennlp
    def from_params(cls, model_parameters: List, params: Params):  # type: ignore
        # pylint: disable=arguments-differ
        if isinstance(params, str):
            optimizer = params
            params = Params({})
        else:
            optimizer = params.pop_choice("type", Optimizer.list_available())

        # make the parameter groups if needed
        groups = params.pop("parameter_groups", None)
        if groups:
            # The input to the optimizer is a list of dicts.
            # Each dict contains a "parameter group" and group-specific options,
            # e.g., {'params': [list of parameters], 'lr': 1e-3, ...}
            # Any config option not specified in the additional options (e.g.
            # for the default group) is inherited from the top level config.
            # see: http://pytorch.org/docs/0.3.0/optim.html?#per-parameter-options
            #
            # groups contains something like:
            #"parameter_groups": [
            #       [["regex1", "regex2"], {"lr": 1e-3}],
            #       [["regex3"], {"lr": 1e-4}]
            #]
            #(note that the allennlp config files require double quotes ", and will
            # fail (sometimes silently) with single quotes ').

            # This is typed as Any since the dict values other than
            # the params key are passed to the Optimizer constructor and
            # can be any type it accepts.
            # In addition to any parameters that match group specific regex,
            # we also need a group for the remaining "default" group.
            # Those will be included in the last entry of parameter_groups.
            parameter_groups: Any = [{'params': []} for _ in range(len(groups) + 1)]
            # add the group specific kwargs
            for k in range(len(groups)): # pylint: disable=consider-using-enumerate
                parameter_groups[k].update(groups[k][1].as_dict())

            regex_use_counts: Dict[str, int] = {}
            parameter_group_names: List[set] = [set() for _ in range(len(groups) + 1)]
            for name, param in model_parameters:
                # Determine the group for this parameter.
                group_index = None
                for k, group_regexes in enumerate(groups):
                    for regex in group_regexes[0]:
                        if regex not in regex_use_counts:
                            regex_use_counts[regex] = 0
                        if re.search(regex, name):
                            if group_index is not None and group_index != k:
                                raise ValueError("{} was specified in two separate parameter groups".format(name))
                            group_index = k
                            regex_use_counts[regex] += 1

                if group_index is not None:
                    parameter_groups[group_index]['params'].append(param)
                    parameter_group_names[group_index].add(name)
                else:
                    # the default group
                    parameter_groups[-1]['params'].append(param)
                    parameter_group_names[-1].add(name)

            # log the parameter groups
            logger.info("Done constructing parameter groups.")
            for k in range(len(groups) + 1):
                group_options = {key: val for key, val in parameter_groups[k].items()
                                 if key != 'params'}
                logger.info("Group %s: %s, %s", k,
                            list(parameter_group_names[k]),
                            group_options)
            # check for unused regex
            for regex, count in regex_use_counts.items():
                if count == 0:
                    logger.warning("When constructing parameter groups, "
                                   " %s not match any parameter name", regex)

        else:
            parameter_groups = [param for name, param in model_parameters]

        # Log the number of parameters to optimize
        num_parameters = 0
        for parameter_group in parameter_groups:
            if isinstance(parameter_group, dict):
                num_parameters += sum(parameter.numel() for parameter in parameter_group["params"])
            else:
                num_parameters += parameter_group.numel()
        logger.info("Number of trainable parameters: %s", num_parameters)

        # By default we cast things that e.g. look like floats to floats before handing them
        # to the Optimizer constructor, but if you want to disable that behavior you could add a
        #       "infer_type_and_cast": false
        # key to your "trainer.optimizer" config.
        infer_type_and_cast = params.pop_bool("infer_type_and_cast", True)
        params_as_dict = params.as_dict(infer_type_and_cast=infer_type_and_cast)
        return Optimizer.by_name(optimizer)(parameter_groups, **params_as_dict) # type: ignore
Code Example #28
 def from_params(self, params: Params) -> CrossValidationSplitter:
     generate_validation_sets = params.pop_bool('generate_validation_sets',
                                                False)
     cross_validator = self.cross_validator_class(**params.as_dict())
     return CrossValidationSplitter(
         cross_validator, generate_validation_sets=generate_validation_sets)
Code Example #29
def make_files_for_official_eval(model_archive_file, evaluation_files, output_file,
                                 cuda_device):

    archive = load_archive(model_archive_file)
    model = archive.model

    model.eval()
    if cuda_device != -1:
        model.cuda(cuda_device)

    def find_key(d, func):
        ret = None
        stack = [d]
        while len(stack) > 0 and ret is None:
            s = stack.pop()
            for k, v in s.items():
                if func(k, v):
                    ret = s
                    break
                elif isinstance(v, dict):
                    stack.append(v)
        return ret

    # load reader
    full_reader_params = copy.deepcopy(archive.config['dataset_reader'].as_dict())
    reader_params = find_key(full_reader_params,
                             lambda k, v: k == 'type' and v == 'wordnet_fine_grained')
    reader_params['is_training'] = False
    reader_params['should_remap_span_indices'] = True
    if 'extra_candidate_generators' in reader_params:
        candidate_generator_params = find_key(
                full_reader_params,
                lambda k, v: k == 'tokenizer_and_candidate_generator'
        )['tokenizer_and_candidate_generator']
        candidate_generator = TokenizerAndCandidateGenerator.from_params(
                Params(candidate_generator_params)
        )

    reader_params = Params(reader_params)

    print("====================")
    print(reader_params.as_dict())
    print("====================")

    reader = DatasetReader.from_params(reader_params)

    synset_to_lemmas = {}
    for lemma_id, synset_id in reader.mention_generator._lemma_to_synset.items():
        if synset_id not in synset_to_lemmas:
            synset_to_lemmas[synset_id] = []
        synset_to_lemmas[synset_id].append(lemma_id)

    vocab_params = archive.config['vocabulary']
    vocab = Vocabulary.from_params(vocab_params)

    iterator = BasicIterator(batch_size=24)
    iterator.index_with(vocab)

    fout = open(output_file, 'w')

    for ds_file in [evaluation_files]:
        instances = reader.read(ds_file)

        # get the metadata ids from the raw file
        raw_lines = []
        with JsonFile(ds_file, 'r') as fin:
            for sentence in fin:
                raw_ids = [[token['id'], token['lemma']] for token in sentence if 'senses' in token]
                if len(raw_ids) > 0:
                    raw_lines.append(raw_ids)

        raw_i = 0
        for batch in iterator(instances, num_epochs=1, shuffle=False):
            print(raw_i)

            if cuda_device > -1:
                b = move_to_device(batch, cuda_device)
            else:
                b = batch

            b['candidates'] = {'wordnet': {
                    'candidate_entities': b.pop('candidate_entities'),
                    'candidate_entity_priors': b.pop('candidate_entity_prior'),
                    'candidate_segment_ids': b.pop('candidate_segment_ids'),
                    'candidate_spans': b.pop('candidate_spans')}}
            gold_entities = b.pop('gold_entities')
            b['gold_entities'] = {'wordnet': gold_entities}

            if 'extra_candidates' in b:
                extra_candidates = b.pop('extra_candidates')
                seq_len = b['tokens']['tokens'].shape[1]
                bbb = []
                for e in extra_candidates:
                    for k in e.keys():
                        e[k]['candidate_segment_ids'] = [0] * len(e[k]['candidate_spans'])
                    ee = {'tokens': ['[CLS]'] * seq_len, 'segment_ids': [0] * seq_len,
                          'candidates': e}
                    ee_fields = candidate_generator.convert_tokens_candidates_to_fields(ee)
                    bbb.append(Instance(ee_fields))
                eb = Batch(bbb)
                eb.index_instances(vocab)
                padding_lengths = eb.get_padding_lengths()
                tensor_dict = eb.as_tensor_dict(padding_lengths)
                b['candidates'].update(tensor_dict['candidates'])

            if cuda_device > -1:
                b = move_to_device(b, cuda_device)

            output = model(**b)
    
            # predicted entities is list of (batch_index, (start, end), entity_id)
            predicted_entities = model.soldered_kgs['wordnet'].entity_linker._decode(
                          output['wordnet']['linking_scores'], b['candidates']['wordnet']['candidate_spans'], 
                          b['candidates']['wordnet']['candidate_entities']['ids']
            )

            # make output file
            predicted_entities_batch_indices = []
            batch_size = batch['tokens']['tokens'].shape[0]
            for k in range(batch_size):
                predicted_entities_batch_indices.append([])
            for b_index, start_end, eid in predicted_entities:
                try:
                    synset_id = vocab.get_token_from_index(eid, 'entity')
                except KeyError:
                    synset_id = vocab.get_token_from_index(eid, 'entity_wordnet')
                all_lemma_ids = synset_to_lemmas[synset_id]
                predicted_entities_batch_indices[b_index].append(all_lemma_ids)

            # output lines look like semeval2013.d000.s001.t003 reader%1:19:00::
            for k in range(batch_size):
                raw_ids = raw_lines[raw_i]
                predicted_lemmas = predicted_entities_batch_indices[k]
                assert len(predicted_lemmas) == len(raw_ids)
                for (ii, gold_lemma), pl in zip(raw_ids, predicted_lemmas):
                    # get the predicted lemma_id
                    predicted_lemma_id = None
                    for pp in pl:
                        if pp.partition('%')[0] == gold_lemma:
                            predicted_lemma_id = pp
                    assert predicted_lemma_id is not None
                    line = "{} {}\n".format(ii, predicted_lemma_id)
                    fout.write(line)
                raw_i += 1

    fout.close()
Code Example #30
File: optimizers.py Project: uganyasavur/allennlp
    def from_params(cls, model_parameters: List, params: Params):
        if isinstance(params, str):
            optimizer = params
            params = Params({})
        else:
            optimizer = params.pop_choice("type", Optimizer.list_available())

        # make the parameter groups if needed
        groups = params.pop("parameter_groups", None)
        if groups:
            # The input to the optimizer is a list of dicts.
            # Each dict contains {'params': [list of parameters], 'lr': 1e-3, ...}
            # Any config option not specified in the additional options (e.g.
            # for the default group) is inherited from the top level config.
            # see: http://pytorch.org/docs/0.3.0/optim.html?#per-parameter-options
            #
            # groups contains something like:
            #"parameter_groups": [
            #       [['regex1', 'regex2'], {'lr': 1e-3},
            #        ['regex3'], {'lr': 1e-4}]
            #]
            #
            # The last entry of this list is for the parameters not in any regex.
            #
            # This is typed as Any since the dict values other than
            # the params key are passed to the Optimizer constructor and
            # can be any type it accepts.
            parameter_groups: Any = [{
                'params': []
            } for _ in range(len(groups) + 1)]
            # add the group specific kwargs
            for k in range(len(groups)):  # pylint: disable=consider-using-enumerate
                parameter_groups[k].update(groups[k][1].as_dict())

            regex_use_counts: Dict[str, int] = {}
            parameter_group_names: List[set] = [
                set() for _ in range(len(groups) + 1)
            ]
            for name, param in model_parameters:
                # Determine the group for this parameter.
                group_index = None
                for k, group_regexes in enumerate(groups):
                    for regex in group_regexes[0]:
                        if regex not in regex_use_counts:
                            regex_use_counts[regex] = 0
                        if re.search(regex, name):
                            if group_index is not None and group_index != k:
                                raise ValueError(
                                    "{} was specified in two separate parameter groups"
                                    .format(name))
                            group_index = k
                            regex_use_counts[regex] += 1

                if group_index is not None:
                    parameter_groups[group_index]['params'].append(param)
                    parameter_group_names[group_index].add(name)
                else:
                    # the default group
                    parameter_groups[-1]['params'].append(param)
                    parameter_group_names[-1].add(name)

            # log the parameter groups
            logger.info("Done constructing parameter groups.")
            for k in range(len(groups) + 1):
                group_options = {
                    key: val
                    for key, val in parameter_groups[k].items()
                    if key != 'params'
                }
                print("Group {0}: {1}, {2}".format(
                    k, list(parameter_group_names[k]), group_options))
            # check for unused regex
            for regex, count in regex_use_counts.items():
                if count == 0:
                    logger.warning(
                        "When constructing parameter groups, "
                        "%s does not match any parameter name", regex)

        else:
            parameter_groups = [param for name, param in model_parameters]

        return Optimizer.by_name(optimizer)(parameter_groups,
                                            **params.as_dict())  # type: ignore
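To make the expected "parameter_groups" layout concrete, here is a minimal usage sketch for the method above. It assumes Params and Optimizer are importable from allennlp.common and allennlp.training.optimizers (module paths and the positional call match this snippet's signature but may vary across AllenNLP versions); the model, regexes, and learning rates are illustrative only.

# Hedged usage sketch for the from_params above.
import torch
from allennlp.common import Params
from allennlp.training.optimizers import Optimizer

model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.Linear(8, 2))
params = Params({
    "type": "adam",
    "lr": 1e-3,  # top-level option, inherited by the default group
    "parameter_groups": [
        # One [regexes, options] pair; parameters matching no regex fall
        # into the implicit default group appended by the code above.
        [["0\\.weight", "0\\.bias"], {"lr": 1e-4}],
    ],
})
optimizer = Optimizer.from_params(list(model.named_parameters()), params)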
Code Example #31
0
File: optimizers.py Project: ryan-leung/ml_monorepo
    def from_params(cls, model_parameters, params):  # type: ignore
        # pylint: disable=arguments-differ
        if isinstance(params, unicode):
            optimizer = params
            params = Params({})
        else:
            optimizer = params.pop_choice(u"type", Optimizer.list_available())

        # Make the parameter groups if needed.
        groups = params.pop(u"parameter_groups", None)
        if groups:
            # The input to the optimizer is list of dict.
            # Each dict contains a "parameter group" and groups specific options,
            # e.g., {'params': [list of parameters], 'lr': 1e-3, ...}
            # Any config option not specified in the additional options (e.g.
            # for the default group) is inherited from the top level config.
            # see: http://pytorch.org/docs/0.3.0/optim.html?#per-parameter-options
            #
            # groups contains something like:
            #"parameter_groups": [
            #       [["regex1", "regex2"], {"lr": 1e-3},
            #        ["regex3"], {"lr": 1e-4}]
            #]
            #(note that the allennlp config files require double quotes ", and will
            # fail (sometimes silently) with single quotes ').

            # This is typed as Any since the dict values other than
            # the params key are passed to the Optimizer constructor and
            # can be any type it accepts.
            # In addition to any parameters that match group specific regex,
            # we also need a group for the remaining "default" group.
            # Those will be included in the last entry of parameter_groups.
            parameter_groups = [{u'params': []} for _ in range(len(groups) + 1)]
            # add the group specific kwargs
            for k in range(len(groups)): # pylint: disable=consider-using-enumerate
                parameter_groups[k].update(groups[k][1].as_dict())

            regex_use_counts = {}
            parameter_group_names = [set() for _ in range(len(groups) + 1)]
            for name, param in model_parameters:
                # Determine the group for this parameter.
                group_index = None
                for k, group_regexes in enumerate(groups):
                    for regex in group_regexes[0]:
                        if regex not in regex_use_counts:
                            regex_use_counts[regex] = 0
                        if re.search(regex, name):
                            if group_index is not None and group_index != k:
                                raise ValueError(u"{} was specified in two separate parameter groups".format(name))
                            group_index = k
                            regex_use_counts[regex] += 1

                if group_index is not None:
                    parameter_groups[group_index][u'params'].append(param)
                    parameter_group_names[group_index].add(name)
                else:
                    # the default group
                    parameter_groups[-1][u'params'].append(param)
                    parameter_group_names[-1].add(name)

            # log the parameter groups
            logger.info(u"Done constructing parameter groups.")
            for k in range(len(groups) + 1):
                group_options = dict((key, val) for key, val in list(parameter_groups[k].items())
                                 if key != u'params')
                logger.info(u"Group %s: %s, %s", k,
                            list(parameter_group_names[k]),
                            group_options)
            # check for unused regex
            for regex, count in list(regex_use_counts.items()):
                if count == 0:
                    logger.warning(u"When constructing parameter groups, "
                                   u" %s not match any parameter name", regex)

        else:
            parameter_groups = [param for name, param in model_parameters]

        # Log the number of parameters to optimize
        num_parameters = 0
        for parameter_group in parameter_groups:
            if isinstance(parameter_group, dict):
                num_parameters += sum(parameter.numel() for parameter in parameter_group[u"params"])
            else:
                num_parameters += parameter_group.numel()
        logger.info(u"Number of trainable parameters: %s", num_parameters)
        return Optimizer.by_name(optimizer)(parameter_groups, **params.as_dict()) # type: ignore
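The parameter count logged at the end reduces to a one-liner in plain PyTorch; a standalone sketch with no AllenNLP dependencies:

# Standalone sketch of the trainable-parameter count logged above,
# using only plain PyTorch.
import torch

model = torch.nn.Linear(10, 3)
num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_parameters)  # 10 * 3 weights + 3 biases = 33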
Code Example #32
0
File: archival_test.py Project: solversa/allennlp
class ArchivalTest(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()

        self.params = Params({
            "model": {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "embedding_dim": 5
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                },
            },
            "dataset_reader": {
                "type": "sequence_tagging"
            },
            "train_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "validation_data_path":
            str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
            "data_loader": {
                "batch_size": 2
            },
            "trainer": {
                "num_epochs": 2,
                "optimizer": "adam"
            },
        })

    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / "archive_test"
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        assert_models_equal(model, model2)

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy

    def test_archive_model_uses_archive_path(self):
        serialization_dir = self.TEST_DIR / "serialization"
        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)
        # Use a new path.
        archive_model(serialization_dir=serialization_dir,
                      archive_path=serialization_dir / "new_path.tar.gz")
        archive = load_archive(serialization_dir / "new_path.tar.gz")
        assert archive

    def test_loading_serialization_directory(self):
        # copy params, since they'll get consumed during training
        params_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / "serialization"
        model = train_model(self.params, serialization_dir=serialization_dir)

        # load from the serialization directory itself
        archive = load_archive(serialization_dir)
        model2 = archive.model

        assert_models_equal(model, model2)

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_copy

    def test_can_load_from_archive_model(self):
        serialization_dir = self.FIXTURES_ROOT / "basic_classifier" / "from_archive_serialization"
        archive_path = serialization_dir / "model.tar.gz"
        model = load_archive(archive_path).model

        # We want to be sure not only that loading doesn't crash, but also that we
        # loaded the right weights for the model.  We'll do that by making sure that
        # we didn't just load the model that's in the `archive_path` of the config
        # file, which is this one.
        base_model_path = self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz"
        base_model = load_archive(base_model_path).model
        base_model_params = dict(base_model.named_parameters())
        for name, parameters in model.named_parameters():
            if parameters.size() == base_model_params[name].size():
                assert not (parameters == base_model_params[name]).all()
            else:
                # In this case, the parameters are definitely different, no need for the above
                # check.
                pass
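The comparison loop in test_can_load_from_archive_model can stand alone; a hedged sketch with a hypothetical helper name, using only plain PyTorch:

# Hypothetical helper (not part of AllenNLP): True when every
# same-shaped parameter is identical between the two models.
import torch

def models_share_weights(model_a: torch.nn.Module, model_b: torch.nn.Module) -> bool:
    params_b = dict(model_b.named_parameters())
    return all(
        torch.equal(param, params_b[name])
        for name, param in model_a.named_parameters()
        if name in params_b and param.size() == params_b[name].size()
    )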
Code Example #33
0
File: archival_test.py Project: himkt/allennlp
class ArchivalTest(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()

        self.params = Params(
            {
                "model": {
                    "type": "simple_tagger",
                    "text_field_embedder": {
                        "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                    },
                    "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
                },
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
                "validation_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
                "data_loader": {"batch_size": 2},
                "trainer": {"num_epochs": 2, "optimizer": "adam", "cuda_device": -1},
            }
        )

    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = self.params.duplicate()
        params_dict_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / "archive_test"
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        assert_models_equal(model, model2)

        assert isinstance(
            archive.dataset_reader,
            type(DatasetReader.from_params(params_copy["dataset_reader"].duplicate())),
        )
        assert isinstance(
            archive.validation_dataset_reader,
            type(DatasetReader.from_params(params_copy["dataset_reader"].duplicate())),
        )  # validation_dataset_reader is not in the config, so fall back to dataset_reader

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_dict_copy

    def test_archive_model_uses_archive_path(self):
        serialization_dir = self.TEST_DIR / "serialization"
        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)
        # Use a new path.
        archive_model(
            serialization_dir=serialization_dir, archive_path=serialization_dir / "new_path.tar.gz"
        )
        archive = load_archive(serialization_dir / "new_path.tar.gz")
        assert archive

    def test_loading_serialization_directory(self):
        # copy params, since they'll get consumed during training
        params_dict_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / "serialization"
        model = train_model(self.params, serialization_dir=serialization_dir)

        # load from the serialization directory itself
        archive = load_archive(serialization_dir)
        model2 = archive.model

        assert_models_equal(model, model2)

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_dict_copy

    def test_can_load_from_archive_model(self):
        serialization_dir = self.FIXTURES_ROOT / "basic_classifier" / "from_archive_serialization"
        archive_path = serialization_dir / "model.tar.gz"
        model = load_archive(archive_path).model

        # We want to be sure not only that loading doesn't crash, but also that we
        # loaded the right weights for the model.  We'll do that by making sure that
        # we didn't just load the model that's in the `archive_path` of the config
        # file, which is this one.
        base_model_path = self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz"
        base_model = load_archive(base_model_path).model
        base_model_params = dict(base_model.named_parameters())
        for name, parameters in model.named_parameters():
            if parameters.size() == base_model_params[name].size():
                assert not (parameters == base_model_params[name]).all()
            else:
                # In this case, the parameters are definitely different, no need for the above
                # check.
                pass

    def test_include_in_archive(self):
        self.params["include_in_archive"] = ["metrics_epoch_*.json"]

        serialization_dir = self.TEST_DIR / "serialization"
        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Assert that the additional targets were archived
        with tempfile.TemporaryDirectory() as tempdir:
            with tarfile.open(serialization_dir / "model.tar.gz", "r:gz") as archive:
                archive.extractall(tempdir)
            assert os.path.isfile(os.path.join(tempdir, "metrics_epoch_0.json"))
            assert os.path.isfile(os.path.join(tempdir, "metrics_epoch_1.json"))
            assert not os.path.isfile(os.path.join(tempdir, "metrics.json"))

    def test_invalid_include_in_archive(self):
        self.params["include_in_archive"] = [CONFIG_NAME]

        serialization_dir = self.TEST_DIR / "serialization"

        with pytest.raises(ConfigurationError) as exc:
            train_model(self.params, serialization_dir=serialization_dir)
        assert "are saved names and cannot be used" in str(exc.value)