コード例 #1
0
ファイル: train.py プロジェクト: Jordan-Sauchuk/allennlp
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Read every dataset named in the experiment configuration.

    Pops ``dataset_reader``, the optional ``validation_dataset_reader``, ``train_data_path``
    and the optional ``validation_data_path`` / ``test_data_path`` keys from ``params``.

    The returned dictionary always has a ``"train"`` entry; ``"validation"`` and ``"test"``
    entries are present only when the corresponding path was configured.
    """
    train_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    # Validation and test data reuse the training reader unless a dedicated one is configured.
    eval_reader_params = params.pop("validation_dataset_reader", None)
    if eval_reader_params is None:
        eval_reader: DatasetReader = train_reader
    else:
        logger.info("Using a separate dataset reader to load validation and test data.")
        eval_reader = DatasetReader.from_params(eval_reader_params)

    train_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_path)
    loaded: Dict[str, Iterable[Instance]] = {"train": train_reader.read(train_path)}

    # (result key, config key, log message) for each optional split, in the order they are read.
    optional_splits = (
        ("validation", 'validation_data_path', "Reading validation data from %s"),
        ("test", "test_data_path", "Reading test data from %s"),
    )
    for split_name, path_key, log_message in optional_splits:
        split_path = params.pop(path_key, None)
        if split_path is None:
            continue
        logger.info(log_message, split_path)
        loaded[split_name] = eval_reader.read(split_path)

    return loaded
コード例 #2
0
ファイル: train.py プロジェクト: Jordan-Sauchuk/allennlp
def create_serialization_dir(params: Params, serialization_dir: str, recover: bool) -> None:
    """
    This function creates the serialization directory if it doesn't exist.  If it already exists,
    then it verifies that we're recovering from a training with an identical configuration.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: ``str``
        The directory in which to save results and logs.
    recover: ``bool``
        If ``True``, we will try to recover from an existing serialization directory, and crash if
        the directory doesn't exist, or doesn't match the configuration we're given.

    Raises
    ------
    ConfigurationError
        If the directory exists and ``recover`` is false, if ``recover`` is true but the
        directory is missing, if the directory holds no recovered config file, or if the
        recovered configuration differs from ``params``.
    """
    if not os.path.exists(serialization_dir):
        if recover:
            raise ConfigurationError(f"--recover specified but serialization_dir ({serialization_dir}) "
                                     "does not exist.  There is nothing to recover from.")
        os.makedirs(serialization_dir)
        return

    if serialization_dir == '/output':
        # Special-casing the beaker output directory, which will already exist when training
        # starts.
        return
    if not recover:
        raise ConfigurationError(f"Serialization directory ({serialization_dir}) already exists.  "
                                 f"Specify --recover to recover training from existing output.")

    logger.info(f"Recovering from prior training at {serialization_dir}.")

    recovered_config_file = os.path.join(serialization_dir, CONFIG_NAME)
    if not os.path.exists(recovered_config_file):
        raise ConfigurationError("The serialization directory already exists but doesn't "
                                 "contain a config.json. You probably gave the wrong directory.")
    loaded_params = Params.from_file(recovered_config_file)

    # Check whether any of the training configuration differs from the configuration we are
    # resuming.  Every difference is logged before failing so the user sees all of them at once.
    fail = False
    flat_params = params.as_flat_dict()
    flat_loaded = loaded_params.as_flat_dict()
    for key in flat_params.keys() - flat_loaded.keys():
        logger.error(f"Key '{key}' found in training configuration but not in the serialization "
                     f"directory we're recovering from.")
        fail = True
    for key in flat_loaded.keys() - flat_params.keys():
        logger.error(f"Key '{key}' found in the serialization directory we're recovering from "
                     f"but not in the training config.")
        fail = True
    for key in flat_params.keys():
        # Keys missing from the recovered config were already reported above; indexing
        # ``flat_loaded`` with them would raise ``KeyError`` instead of ``ConfigurationError``.
        if key not in flat_loaded:
            continue
        if flat_params[key] != flat_loaded[key]:
            logger.error(f"Value for '{key}' in training configuration does not match that the value in "
                         f"the serialization directory we're recovering from: "
                         f"{flat_params[key]} != {flat_loaded[key]}")
            fail = True
    if fail:
        raise ConfigurationError("Training configuration does not match the configuration we're "
                                 "recovering from.")
コード例 #3
0
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttention':
        """
        Build a ``DecomposableAttention`` model from ``params``.

        ``text_field_embedder``, ``attend_feedforward``, ``similarity_function``,
        ``compare_feedforward`` and ``aggregate_feedforward`` are required keys;
        ``premise_encoder``, ``hypothesis_encoder``, ``initializer`` and ``regularizer`` are
        optional.  ``params.assert_empty`` is called once every known key has been popped.
        """
        text_field_embedder = TextFieldEmbedder.from_params(vocab, params.pop("text_field_embedder"))

        # Both encoders are optional; an absent section leaves the encoder as ``None``.
        premise_config = params.pop("premise_encoder", None)
        premise_encoder = (None if premise_config is None
                           else Seq2SeqEncoder.from_params(premise_config))

        hypothesis_config = params.pop("hypothesis_encoder", None)
        hypothesis_encoder = (None if hypothesis_config is None
                              else Seq2SeqEncoder.from_params(hypothesis_config))

        # Dict literals evaluate in order, so the pops happen in the same sequence as listed.
        components = {
            "attend_feedforward": FeedForward.from_params(params.pop('attend_feedforward')),
            "similarity_function": SimilarityFunction.from_params(params.pop("similarity_function")),
            "compare_feedforward": FeedForward.from_params(params.pop('compare_feedforward')),
            "aggregate_feedforward": FeedForward.from_params(params.pop('aggregate_feedforward')),
            "initializer": InitializerApplicator.from_params(params.pop('initializer', [])),
            "regularizer": RegularizerApplicator.from_params(params.pop('regularizer', [])),
        }

        params.assert_empty(cls.__name__)
        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   premise_encoder=premise_encoder,
                   hypothesis_encoder=hypothesis_encoder,
                   **components)
コード例 #4
0
ファイル: __init__.py プロジェクト: Jordan-Sauchuk/allennlp
 def from_params(self, params: Params) -> PytorchSeq2SeqWrapper:
     """
     Instantiate ``self._module_class`` from ``params`` and wrap it in a
     ``PytorchSeq2SeqWrapper``.

     Raises ``ConfigurationError`` when ``batch_first`` is configured as false.
     """
     batch_first = params.pop_bool('batch_first', True)
     if not batch_first:
         raise ConfigurationError("Our encoder semantics assumes batch is always first!")

     # ``batch_first`` was popped above; put it back for the modules listed in PYTORCH_MODELS.
     is_pytorch_model = self._module_class in self.PYTORCH_MODELS
     if is_pytorch_model:
         params['batch_first'] = True

     constructor_kwargs = params.as_dict()
     return PytorchSeq2SeqWrapper(self._module_class(**constructor_kwargs))
コード例 #5
0
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'BasicTextFieldEmbedder':
     """
     Build one ``TokenEmbedder`` per top-level key in ``params`` and pass the resulting
     mapping to the constructor.
     """
     # Snapshot the key names first: popping while iterating ``params`` directly would
     # change the collection being iterated.
     embedder_names = list(params.keys())
     embedders_by_name = {
             name: TokenEmbedder.from_params(vocab, params.pop(name))
             for name in embedder_names
     }
     params.assert_empty(cls.__name__)
     return cls(embedders_by_name)
コード例 #6
0
ファイル: snli.py プロジェクト: Jordan-Sauchuk/allennlp
 def from_params(cls, params: Params) -> 'SnliReader':
     """
     Build an ``SnliReader`` from ``params``.

     All keys are optional: ``tokenizer`` and ``token_indexers`` default to empty
     configurations, ``lazy`` defaults to ``False``.
     """
     # Dict literals evaluate in order, so the keys are popped in the listed sequence.
     reader_kwargs = {
             'tokenizer': Tokenizer.from_params(params.pop('tokenizer', {})),
             'token_indexers': TokenIndexer.dict_from_params(params.pop('token_indexers', {})),
             'lazy': params.pop('lazy', False),
     }
     params.assert_empty(cls.__name__)
     return SnliReader(**reader_kwargs)
コード例 #7
0
 def from_params(cls, params: Params) -> 'BasicIterator':
     """
     Build a ``BasicIterator`` from ``params``.

     ``batch_size`` defaults to 32; ``instances_per_epoch`` and ``max_instances_in_memory``
     default to ``None``.
     """
     # Dict literals evaluate in order, so the keys are popped in the listed sequence.
     iterator_kwargs = {
             'batch_size': params.pop_int('batch_size', 32),
             'instances_per_epoch': params.pop_int('instances_per_epoch', None),
             'max_instances_in_memory': params.pop_int('max_instances_in_memory', None),
     }
     params.assert_empty(cls.__name__)
     return cls(**iterator_kwargs)
コード例 #8
0
 def from_params(cls, params: Params) -> 'PennTreeBankConstituencySpanDatasetReader':
     """
     Build a ``PennTreeBankConstituencySpanDatasetReader`` from ``params``.

     All keys are optional: ``token_indexers`` defaults to an empty configuration,
     ``use_pos_tags`` to ``True`` and ``lazy`` to ``False``.
     """
     # Dict literals evaluate in order, so the keys are popped in the listed sequence.
     reader_kwargs = {
             'token_indexers': TokenIndexer.dict_from_params(params.pop('token_indexers', {})),
             'use_pos_tags': params.pop('use_pos_tags', True),
             'lazy': params.pop('lazy', False),
     }
     params.assert_empty(cls.__name__)
     return PennTreeBankConstituencySpanDatasetReader(**reader_kwargs)
コード例 #9
0
            def from_params(cls, params: Params) -> 'B':
                """
                Build a ``B`` from ``params``, registering its ``filename`` entry for archiving.
                """
                # Register the file before its key is popped from ``params``.
                params.add_file_to_archive("filename")

                archived_name = params.pop("filename")
                nested_c = C.from_params(params.pop("c"))
                return cls(archived_name, nested_c)
コード例 #10
0
 def from_params(cls, params: Params) -> 'SrlReader':
     """
     Build an ``SrlReader`` from ``params``.

     All keys are optional: ``token_indexers`` defaults to an empty configuration,
     ``domain_identifier`` to ``None`` and ``lazy`` to ``False``.
     """
     # Dict literals evaluate in order, so the keys are popped in the listed sequence.
     reader_kwargs = {
             'token_indexers': TokenIndexer.dict_from_params(params.pop('token_indexers', {})),
             'domain_identifier': params.pop("domain_identifier", None),
             'lazy': params.pop('lazy', False),
     }
     params.assert_empty(cls.__name__)
     return SrlReader(**reader_kwargs)
コード例 #11
0
ファイル: linear.py プロジェクト: Jordan-Sauchuk/allennlp
 def from_params(cls, params: Params) -> 'LinearSimilarity':
     """
     Build a ``LinearSimilarity`` from ``params``.

     ``tensor_1_dim`` and ``tensor_2_dim`` are required; ``combination`` defaults to
     ``"x,y"`` and ``activation`` defaults to ``"linear"``.
     """
     first_dim = params.pop_int("tensor_1_dim")
     second_dim = params.pop_int("tensor_2_dim")
     combination = params.pop("combination", "x,y")

     # Look the activation up by its registered name, then instantiate it.
     activation_name = params.pop("activation", "linear")
     activation_factory = Activation.by_name(activation_name)
     activation = activation_factory()

     params.assert_empty(cls.__name__)
     return cls(tensor_1_dim=first_dim,
                tensor_2_dim=second_dim,
                combination=combination,
                activation=activation)
コード例 #12
0
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'SpanConstituencyParser':
        """
        Build a ``SpanConstituencyParser`` from ``params``.

        ``text_field_embedder``, ``span_extractor`` and ``encoder`` are required keys;
        ``feedforward``, ``pos_tag_embedding``, ``initializer``, ``regularizer`` and
        ``evalb_directory_path`` are optional.
        """
        text_field_embedder = TextFieldEmbedder.from_params(vocab, params.pop("text_field_embedder"))
        span_extractor = SpanExtractor.from_params(params.pop("span_extractor"))
        encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))

        # Optional sub-modules stay ``None`` when their section is absent from the config.
        feedforward_config = params.pop("feedforward", None)
        feedforward_layer = (None if feedforward_config is None
                             else FeedForward.from_params(feedforward_config))

        pos_tag_config = params.pop("pos_tag_embedding", None)
        pos_tag_embedding = (None if pos_tag_config is None
                             else Embedding.from_params(vocab, pos_tag_config))

        # Dict literals evaluate in order, so the keys are popped in the listed sequence.
        trailing_options = {
            "initializer": InitializerApplicator.from_params(params.pop('initializer', [])),
            "regularizer": RegularizerApplicator.from_params(params.pop('regularizer', [])),
            "evalb_directory_path": params.pop("evalb_directory_path", None),
        }
        params.assert_empty(cls.__name__)

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   span_extractor=span_extractor,
                   encoder=encoder,
                   feedforward_layer=feedforward_layer,
                   pos_tag_embedding=pos_tag_embedding,
                   **trailing_options)
コード例 #13
0
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':  # type: ignore
     # pylint: disable=arguments-differ
     """
     Build a ``TokenCharactersEncoder`` from ``params``.

     ``embedding`` and ``encoder`` are required keys; ``dropout`` defaults to ``0.0``.
     """
     char_embedding_params: Params = params.pop("embedding")
     # ``Embedding.from_params()`` falls back to the "tokens" namespace; character embeddings
     # should default to "token_characters" instead, so set it unless the config already did.
     char_embedding_params.setdefault("vocab_namespace", "token_characters")
     char_embedding = Embedding.from_params(vocab, char_embedding_params)

     char_encoder_params: Params = params.pop("encoder")
     char_encoder = Seq2VecEncoder.from_params(char_encoder_params)

     dropout_rate = params.pop_float("dropout", 0.0)
     params.assert_empty(cls.__name__)
     return cls(char_embedding, char_encoder, dropout_rate)
コード例 #14
0
    def test_mismatched_dimensions_raise_configuration_errors(self):
        """Building the model with an inconsistent feedforward dimension must fail."""
        # Each entry is (feedforward section, dimension key) to corrupt with the value 10:
        #  - the input_dim of the first feedforward layer should be 2;
        #  - the projection output_dim of the last layer should be 3, the number of classes.
        corrupted_dimensions = (
            ("attend_feedforward", "input_dim"),
            ("aggregate_feedforward", "output_dim"),
        )
        for section, dimension_key in corrupted_dimensions:
            # Reload the config each time so only one dimension is wrong per attempt.
            params = Params.from_file(self.param_file)
            params["model"][section][dimension_key] = 10
            with pytest.raises(ConfigurationError):
                Model.from_params(vocab=self.vocab, params=params.pop("model"))
コード例 #15
0
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'SimpleTagger':
        """
        Build a ``SimpleTagger`` from ``params``.

        ``text_field_embedder`` and ``encoder`` are required keys; ``initializer`` and
        ``regularizer`` are optional.
        """
        # Dict literals evaluate in order, so the keys are popped in the listed sequence.
        model_kwargs = {
            "text_field_embedder": TextFieldEmbedder.from_params(vocab, params.pop("text_field_embedder")),
            "encoder": Seq2SeqEncoder.from_params(params.pop("encoder")),
            "initializer": InitializerApplicator.from_params(params.pop('initializer', [])),
            "regularizer": RegularizerApplicator.from_params(params.pop('regularizer', [])),
        }
        params.assert_empty(cls.__name__)
        return cls(vocab=vocab, **model_kwargs)
コード例 #16
0
ファイル: vocabulary.py プロジェクト: ziaridoy20/allennlp
    def extend_from_instances(self,
                              params: Params,
                              instances: Iterable['adi.Instance'] = ()) -> None:
        """
        Extend this vocabulary with the items counted from ``instances``.

        Every option is popped from ``params`` (all are optional), the instances are asked to
        count their vocabulary items, and the counts plus options are handed to ``self._extend``.
        """
        # Dict literals evaluate in order, so the keys are popped in the listed sequence.
        extend_options = {
            "min_count": params.pop("min_count", None),
            "max_vocab_size": pop_max_vocab_size(params),
            "non_padded_namespaces": params.pop("non_padded_namespaces", DEFAULT_NON_PADDED_NAMESPACES),
            "pretrained_files": params.pop("pretrained_files", {}),
            "min_pretrained_embeddings": params.pop("min_pretrained_embeddings", None),
            "only_include_pretrained_words": params.pop_bool("only_include_pretrained_words", False),
            "tokens_to_add": params.pop("tokens_to_add", None),
        }
        params.assert_empty("Vocabulary - from dataset")

        logger.info("Fitting token dictionary from dataset.")
        # Two-level counter: namespace -> token -> count, with missing entries starting at 0.
        counts_by_namespace: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for current_instance in Tqdm.tqdm(instances):
            current_instance.count_vocab_items(counts_by_namespace)

        self._extend(counter=counts_by_namespace, **extend_options)
コード例 #17
0
    def setUp(self):
        """Build a small ``simple_tagger`` trainer on the sequence-tagging fixture."""
        super().setUp()

        # The same fixture file serves as both training and validation data.
        tagging_fixture = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
        model_config = {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {"type": "embedding", "embedding_dim": 5}
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2
                }
        }
        trainer_config = {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"}
        params = Params({
                "model": model_config,
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": tagging_fixture,
                "validation_data_path": tagging_fixture,
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": trainer_config
            })

        all_datasets = datasets_from_params(params)
        vocabulary_params = params.pop("vocabulary", {})
        # The vocabulary is fitted on every instance of every loaded dataset.
        every_instance = (instance
                          for dataset in all_datasets.values()
                          for instance in dataset)
        vocab = Vocabulary.from_params(vocabulary_params, every_instance)

        model = Model.from_params(vocab=vocab, params=params.pop('model'))
        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(vocab)

        train_data = all_datasets['train']
        trainer_params = params.pop("trainer")
        serialization_dir = os.path.join(self.TEST_DIR, 'test_search_learning_rate')

        # Validation data and iterator are deliberately passed as ``None`` here.
        self.trainer = Trainer.from_params(model,
                                           serialization_dir,
                                           iterator,
                                           train_data,
                                           params=trainer_params,
                                           validation_data=None,
                                           validation_iterator=None)
コード例 #18
0
ファイル: bidaf_test.py プロジェクト: Jordan-Sauchuk/allennlp
    def test_batch_predictions_are_consistent(self):
        """
        Run the batch-consistency check on a copy of the model without the CNN character encoder.
        """
        # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
        # changing the amount of padding in the batch will result in small differences in the
        # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
        # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
        # from the model for this test.  If/when we fix the CNN encoder to work correctly with
        # masking, we can change this back to how the other models run this test, with just a
        # single line.
        # pylint: disable=protected-access,attribute-defined-outside-init

        # Save some state.
        saved_model = self.model
        saved_instances = self.instances

        try:
            # Modify the state, run the test with modified state.
            params = Params.from_file(self.param_file)
            reader = DatasetReader.from_params(params['dataset_reader'])
            # Keep only the word-level indexer so no character features are produced.
            reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
            self.instances = reader.read('tests/fixtures/data/squad.json')
            vocab = Vocabulary.from_instances(self.instances)
            for instance in self.instances:
                instance.index_fields(vocab)
            del params['model']['text_field_embedder']['token_characters']
            # With the character embedder removed, the phrase layer input size is set to 2.
            params['model']['phrase_layer']['input_size'] = 2
            self.model = Model.from_params(vocab, params['model'])

            self.ensure_batch_predictions_are_consistent()
        finally:
            # Restore the state even when the check above raises, so a failure here does not
            # leave the modified model and instances on the test case.
            self.model = saved_model
            self.instances = saved_instances
コード例 #19
0
    def test_file_archiving(self):
        """Train a model and check that its auxiliary files end up in the archive."""
        # This happens to be a good place to test auxiliary file archiving.
        # Train the model
        config_path = self.FIXTURES_ROOT / 'elmo' / 'config' / 'characters_token_embedder.json'
        params = Params.from_file(config_path)
        serialization_dir = os.path.join(self.TEST_DIR, 'serialization')
        train_model(params, serialization_dir)

        # Unpack the archive produced by training.
        unarchive_dir = os.path.join(self.TEST_DIR, 'unarchive')
        with tarfile.open(os.path.join(serialization_dir, 'model.tar.gz'), 'r:gz') as archive:
            archive.extractall(unarchive_dir)

        # It should contain `files_to_archive.json`
        manifest_path = os.path.join(unarchive_dir, 'files_to_archive.json')
        assert os.path.exists(manifest_path)

        # Which should properly contain { flattened_key -> original_filename }
        with open(manifest_path) as manifest:
            files_to_archive = json.load(manifest)

        elmo_fixtures = pathlib.Path('allennlp') / 'tests' / 'fixtures' / 'elmo'
        elmo_key_prefix = 'model.text_field_embedder.token_embedders.elmo.'
        expected_manifest = {
                elmo_key_prefix + 'options_file': str(elmo_fixtures / 'options.json'),
                elmo_key_prefix + 'weight_file': str(elmo_fixtures / 'lm_weights.hdf5'),
        }
        assert files_to_archive == expected_manifest

        # Check that the unarchived contents of those files match the original contents.
        for flattened_key, source_path in files_to_archive.items():
            archived_copy = os.path.join(unarchive_dir, "fta", flattened_key)
            assert filecmp.cmp(source_path, archived_copy)
コード例 #20
0
ファイル: archival_test.py プロジェクト: pyknife/allennlp
    def setUp(self):
        """Store a small ``simple_tagger`` experiment configuration on ``self.params``."""
        super().setUp()

        # The same fixture file serves as both training and validation data.
        tagging_fixture = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
        model_config = {
                "type": "simple_tagger",
                "text_field_embedder": {
                        "tokens": {"type": "embedding", "embedding_dim": 5}
                },
                "encoder": {
                        "type": "lstm",
                        "input_size": 5,
                        "hidden_size": 7,
                        "num_layers": 2
                }
        }
        trainer_config = {"num_epochs": 2, "optimizer": "adam"}

        self.params = Params({
                "model": model_config,
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": tagging_fixture,
                "validation_data_path": tagging_fixture,
                "iterator": {"type": "basic", "batch_size": 2},
                "trainer": trainer_config,
        })
コード例 #21
0
ファイル: train.py プロジェクト: Jordan-Sauchuk/allennlp
def train_model_from_file(parameter_filename: str,
                          serialization_dir: str,
                          overrides: str = "",
                          file_friendly_logging: bool = False,
                          recover: bool = False) -> Model:
    """
    Load an experiment configuration from disk and hand it to :func:`train_model`.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.  Passed through unchanged to
        :func:`train_model`.
    overrides : ``str``
        A HOCON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  Passed through
        unchanged to :func:`train_model`.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    """
    # The overrides are applied while the file is loaded; everything else is forwarded as-is.
    experiment_params = Params.from_file(parameter_filename, overrides)
    trained_model = train_model(experiment_params, serialization_dir, file_friendly_logging, recover)
    return trained_model
コード例 #22
0
ファイル: copynet_test.py プロジェクト: apmoore1/allennlp
 def setUp(self):
     """Load the CopyNet fixture config, then build the reader, instances and vocabulary."""
     super(TestCopyNetReader, self).setUp()
     experiment_config = self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json"
     params = Params.from_file(experiment_config)
     self.reader = DatasetReader.from_params(params["dataset_reader"])

     copyover_data = self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv"
     instances = self.reader.read(copyover_data)
     self.instances = ensure_list(instances)
     # The vocabulary is built from the reader's raw output, not the materialised list.
     self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
コード例 #23
0
ファイル: fine_tune.py プロジェクト: pyknife/allennlp
def fine_tune_model_from_file_paths(model_archive_path: str,
                                    config_file: str,
                                    serialization_dir: str,
                                    overrides: str = "",
                                    file_friendly_logging: bool = False) -> Model:
    """
    Load a model archive and a configuration file, then hand both to :func:`fine_tune_model`.

    Parameters
    ----------
    model_archive_path : ``str``
        Path to a saved model archive that is the result of running the ``train`` command.
    config_file : ``str``
        A configuration file specifying how to continue training.  The format is identical to the
        configuration file for the ``train`` command, but any contents in the ``model`` section is
        ignored (as we are using the provided model archive instead).
    serialization_dir : ``str``
        The directory in which to save results and logs.  Passed through unchanged to
        :func:`fine_tune_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  Passed through
        unchanged to :func:`fine_tune_model`.
    """
    # We don't need to pass in `cuda_device` here, because the trainer will call `model.cuda()` if
    # necessary.
    archived_model = load_archive(model_archive_path).model
    fine_tune_params = Params.from_file(config_file, overrides)
    return fine_tune_model(model=archived_model,
                           params=fine_tune_params,
                           serialization_dir=serialization_dir,
                           file_friendly_logging=file_friendly_logging)
コード例 #24
0
    def test_load_from_file(self):
        """The BiDAF fixture config loads and exposes its top-level sections."""
        params = Params.from_file('tests/fixtures/bidaf/experiment.json')

        for expected_section in ("dataset_reader", "trainer"):
            assert expected_section in params

        model_type = params.pop("model").pop("type")
        assert model_type == "bidaf"
コード例 #25
0
    def from_params(cls, params: Params):
        """
        Build an instance of ``cls`` from ``params``.

        ``input_dim``, ``hidden_dim`` and ``feedforward_hidden_dim`` are required.
        ``projection_dim`` defaults to ``None``, ``num_layers`` to 2, ``num_attention_heads``
        to 3, ``use_positional_encoding`` to ``True`` and ``dropout_prob`` to 0.2.
        """
        # Dict literals evaluate in order, so the keys are popped in the listed sequence.
        encoder_kwargs = {
            'input_dim': params.pop_int('input_dim'),
            'hidden_dim': params.pop_int('hidden_dim'),
            'projection_dim': params.pop_int('projection_dim', None),
            'feedforward_hidden_dim': params.pop_int("feedforward_hidden_dim"),
            'num_layers': params.pop_int("num_layers", 2),
            'num_attention_heads': params.pop_int('num_attention_heads', 3),
            'use_positional_encoding': params.pop_bool('use_positional_encoding', True),
            'dropout_prob': params.pop_float("dropout_prob", 0.2),
        }
        params.assert_empty(cls.__name__)

        return cls(**encoder_kwargs)
コード例 #26
0
ファイル: bidaf_ensemble.py プロジェクト: apmoore1/allennlp
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'BidafEnsemble':  # type: ignore
        # pylint: disable=arguments-differ
        """
        Build a ``BidafEnsemble`` from the model archives listed under ``submodels``.

        Raises ``ConfigurationError`` when a truthy ``vocab`` is supplied.
        """
        if vocab:
            raise ConfigurationError("vocab should be None")

        archive_paths = params.pop("submodels")
        # Each path is loaded as an archive; only the model it contains is kept.
        loaded_models = [load_archive(archive_path).model for archive_path in archive_paths]
        return cls(submodels=loaded_models)
コード例 #27
0
def find_learning_rate_from_args(args: argparse.Namespace) -> None:
    """
    Run the learning rate finder using the options carried by ``args``.

    The configuration is loaded from ``args.param_path`` with ``args.overrides`` applied, and
    the remaining attributes are forwarded to ``find_learning_rate_model``.
    """
    params = Params.from_file(args.param_path, args.overrides)
    # Map each keyword of ``find_learning_rate_model`` to the namespace attribute feeding it.
    search_options = {
        "start_lr": args.start_lr,
        "end_lr": args.end_lr,
        "num_batches": args.num_batches,
        "linear_steps": args.linear,
        "stopping_factor": args.stopping_factor,
        "force": args.force,
    }
    find_learning_rate_model(params, args.serialization_dir, **search_options)
コード例 #28
0
ファイル: bidaf_test.py プロジェクト: Jordan-Sauchuk/allennlp
    def test_mismatching_dimensions_throws_configuration_error(self):
        """Each layer with an inconsistent ``input_size`` must make model construction fail."""
        params = Params.from_file(self.param_file)
        # Make the phrase layer wrong - it should be 10 to match
        # the embedding + char cnn dimensions.
        params["model"]["phrase_layer"]["input_size"] = 12
        with pytest.raises(ConfigurationError):
            Model.from_params(self.vocab, params.pop("model"))

        params = Params.from_file(self.param_file)
        # Make the modeling layer input_dimension wrong - it should be 40 to match
        # 4 * output_dim of the phrase_layer.  This case previously edited ``phrase_layer``
        # again, so the modeling layer check was never exercised.
        params["model"]["modeling_layer"]["input_size"] = 30
        with pytest.raises(ConfigurationError):
            Model.from_params(self.vocab, params.pop("model"))

        params = Params.from_file(self.param_file)
        # Make the span end encoder input_dimension wrong - it should be 70 to match
        # 4 * phrase_layer.output_dim + 3 * modeling_layer.output_dim.
        params["model"]["span_end_encoder"]["input_size"] = 50
        with pytest.raises(ConfigurationError):
            Model.from_params(self.vocab, params.pop("model"))
コード例 #29
0
 def from_params(cls, params: Params) -> 'WordSplitter':
     """
     Build a word splitter from ``params``.

     ``language`` defaults to ``'en_core_web_sm'``; the ``pos_tags``, ``parse`` and ``ner``
     flags each default to ``False``.
     """
     language = params.pop('language', 'en_core_web_sm')
     # Popped in the same order the constructor receives them positionally.
     pos_tags, parse, ner = [params.pop_bool(flag_name, False)
                             for flag_name in ('pos_tags', 'parse', 'ner')]
     params.assert_empty(cls.__name__)
     return cls(language, pos_tags, parse, ner)
コード例 #30
0
 def from_params(cls, params: Params) -> 'MultiHeadedSimilarity':
     """
     Build a ``MultiHeadedSimilarity`` from ``params``.

     ``num_heads`` and ``tensor_1_dim`` are required.  ``tensor_1_projected_dim``,
     ``tensor_2_dim`` and ``tensor_2_projected_dim`` default to ``None``, and
     ``internal_similarity`` defaults to an empty configuration.
     """
     num_heads = params.pop_int("num_heads")
     tensor_1_dim = params.pop_int("tensor_1_dim")
     tensor_1_projected_dim = params.pop_int("tensor_1_projected_dim", None)
     tensor_2_dim = params.pop_int("tensor_2_dim", None)
     # Read the second tensor's own key.  Popping "tensor_1_projected_dim" a second time
     # always produced the default and left a configured "tensor_2_projected_dim" unconsumed.
     tensor_2_projected_dim = params.pop_int("tensor_2_projected_dim", None)
     internal_similarity = SimilarityFunction.from_params(params.pop("internal_similarity", {}))
     params.assert_empty(cls.__name__)
     return cls(num_heads=num_heads,
                tensor_1_dim=tensor_1_dim,
                tensor_1_projected_dim=tensor_1_projected_dim,
                tensor_2_dim=tensor_2_dim,
                tensor_2_projected_dim=tensor_2_projected_dim,
                internal_similarity=internal_similarity)
コード例 #31
0
def load_config_file_print_params(params_fname: str, param_depth: int = -1):
    """
    Load the configuration stored at ``params_fname`` and print its parameters.

    ``param_depth`` is forwarded to ``print_params_at_depth`` as ``depth_cap``;
    printing starts at depth 1.
    """
    print_params_at_depth(Params.from_file(params_fname), 1, depth_cap=param_depth)
コード例 #32
0
    def test_can_build_from_params(self):
        """A reader built from an empty config has a ``SingleIdTokenIndexer`` under "tokens"."""
        empty_config = Params({})
        reader = QangarooReader.from_params(empty_config)

        tokens_indexer = reader._token_indexers["tokens"]
        assert tokens_indexer.__class__.__name__ == "SingleIdTokenIndexer"
コード例 #33
0
ファイル: quac_test.py プロジェクト: ha-lins/medical_dialog
    def test_read(self, lazy):
        """
        Read the QuAC sample fixture with ``num_context_answers=2`` and check the
        fields of the first instance.

        ``lazy`` is passed straight into the reader configuration.
        """
        reader = QuACReader.from_params(
            Params({"lazy": lazy, "num_context_answers": 2}))
        fixture_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "quac_sample.json"
        instances = ensure_list(reader.read(str(fixture_path)))

        fields = instances[0].fields

        question_field = fields["question"]
        assert question_field.sequence_length() == 6
        assert fields["yesno_list"].sequence_length() == 6
        leading_question_tokens = question_field.field_list[0].tokens[:3]
        assert [token.text for token in leading_question_tokens] == ["What", "was", "the"]

        assert len(instances) == 2
        passage_tokens = fields["passage"].tokens
        passage_length = len(passage_tokens)

        assert [token.text for token in passage_tokens[:3]] == ["DJ", "Kool", "Herc"]
        yesno_labels = [item.label for item in fields["yesno_list"].field_list]
        assert yesno_labels == ["x", "x", "y", "x", "x", "x"]
        followup_labels = [item.label for item in fields["followup_list"].field_list]
        assert followup_labels == ["y", "m", "m", "n", "m", "y"]
        # The marker list at index 0 of "p1_answer_marker" is expected to be all "O".
        assert fields["p1_answer_marker"].field_list[0].labels == ["O"] * passage_length

        # Check the previous answer marking here: the span of the first question
        # should be tagged in later marker lists.
        span_start = fields["span_start"].field_list[0].sequence_index
        span_end = fields["span_end"].field_list[0].sequence_index

        def expected_markers(distance):
            """Expected label list with the first answer span tagged for ``distance``."""
            def tag(position):
                return "<{0:d}_{1:s}>".format(distance, position)

            labels = ["O"] * passage_length
            labels[span_start] = tag("start")
            labels[span_end] = tag("end")
            for inner_index in range(span_start + 1, span_end):
                labels[inner_index] = tag("in")
            return labels

        assert fields["p1_answer_marker"].field_list[1].labels == expected_markers(1)
        assert fields["p2_answer_marker"].field_list[2].labels == expected_markers(2)
コード例 #34
0
 def from_params(cls, params: Params) -> 'BeamSearch':
     """
     Build an instance from ``params``.

     The ``beam_size`` key is popped without a default and passed to the
     constructor as ``beam_size``.
     """
     return cls(beam_size=params.pop('beam_size'))
コード例 #35
0
ファイル: elmo_token_embedder.py プロジェクト: wgc20/GrailQA
 def from_params(cls, vocab: Vocabulary,
                 params: Params) -> 'ElmoTokenEmbedder':  # type: ignore
     # pylint: disable=arguments-differ
     """
     Build an ``ElmoTokenEmbedder`` from a ``Params`` object.

     ``options_file`` and ``weight_file`` are registered with
     ``params.add_file_to_archive`` and then popped without defaults.  Optional keys
     and their defaults: ``requires_grad`` (``False``), ``do_layer_norm``
     (``False``), ``dropout`` (``0.5``), ``namespace_to_cache`` (``None``),
     ``projection_dim`` (``None``) and ``scalar_mix_parameters`` (``None``).

     When ``namespace_to_cache`` is given, the tokens of that vocabulary namespace
     are collected into a list and passed on as ``vocab_to_cache``; otherwise
     ``vocab_to_cache`` is ``None``.
     """
     for file_key in ('options_file', 'weight_file'):
         params.add_file_to_archive(file_key)
     options_file = params.pop('options_file')
     weight_file = params.pop('weight_file')
     requires_grad = params.pop('requires_grad', False)
     do_layer_norm = params.pop_bool('do_layer_norm', False)
     dropout = params.pop_float("dropout", 0.5)

     vocab_to_cache = None
     namespace_to_cache = params.pop("namespace_to_cache", None)
     if namespace_to_cache is not None:
         token_to_index = vocab.get_token_to_index_vocabulary(namespace_to_cache)
         vocab_to_cache = list(token_to_index.keys())

     projection_dim = params.pop_int("projection_dim", None)
     scalar_mix_parameters = params.pop('scalar_mix_parameters', None)
     params.assert_empty(cls.__name__)

     constructor_kwargs = dict(options_file=options_file,
                               weight_file=weight_file,
                               do_layer_norm=do_layer_norm,
                               dropout=dropout,
                               requires_grad=requires_grad,
                               projection_dim=projection_dim,
                               vocab_to_cache=vocab_to_cache,
                               scalar_mix_parameters=scalar_mix_parameters)
     return cls(**constructor_kwargs)
コード例 #36
0
ファイル: trainer.py プロジェクト: zhyq/allennlp
    def from_params(cls,
                    model: Model,
                    serialization_dir: str,
                    iterator: DataIterator,
                    train_data: Iterable[Instance],
                    validation_data: Optional[Iterable[Instance]],
                    params: Params) -> 'Trainer':
        """
        Build a ``Trainer`` from a ``Params`` object.

        The model is moved to ``cuda_device`` (when it is ``>= 0``) before the
        optimizer is created from the parameters that have ``requires_grad`` set.
        A learning rate scheduler is only created when the
        ``learning_rate_scheduler`` entry is truthy.  ``params.assert_empty`` is
        called once every key has been read.

        Note that the result is constructed with ``Trainer`` directly, not ``cls``.
        """
        trainer_kwargs = {
            "patience": params.pop_int("patience", None),
            "validation_metric": params.pop("validation_metric", "-loss"),
            "num_epochs": params.pop_int("num_epochs", 20),
            "cuda_device": params.pop_int("cuda_device", -1),
            "grad_norm": params.pop_float("grad_norm", None),
            "grad_clipping": params.pop_float("grad_clipping", None),
        }
        scheduler_config = params.pop("learning_rate_scheduler", None)

        device = trainer_kwargs["cuda_device"]
        if device >= 0:
            model = model.cuda(device)
        trainable = [[name, tensor] for name, tensor in model.named_parameters()
                     if tensor.requires_grad]
        optimizer = Optimizer.from_params(trainable, params.pop("optimizer"))

        trainer_kwargs["learning_rate_scheduler"] = (
            LearningRateScheduler.from_params(optimizer, scheduler_config)
            if scheduler_config else None)

        trainer_kwargs.update(
            num_serialized_models_to_keep=params.pop_int("num_serialized_models_to_keep", None),
            keep_serialized_model_every_num_seconds=params.pop_int(
                "keep_serialized_model_every_num_seconds", None),
            model_save_interval=params.pop_float("model_save_interval", None),
            summary_interval=params.pop_int("summary_interval", 100),
            histogram_interval=params.pop_int("histogram_interval", None))

        params.assert_empty(cls.__name__)
        return Trainer(model, optimizer, iterator,
                       train_data, validation_data,
                       serialization_dir=serialization_dir,
                       **trainer_kwargs)
コード例 #37
0
ファイル: data_iterator.py プロジェクト: uwnlp/allennlp
    def from_params(cls, params: Params) -> 'DataIterator':
        """
        Dispatch to the ``from_params`` of the iterator class named by ``"type"``.

        The ``"type"`` entry is popped with ``pop_choice`` against the names returned
        by ``cls.list_available()``; the remaining ``params`` go to the chosen class.
        """
        # TODO(Mark): The adaptive iterator will need a bit of work here,
        # to retrieve the scaling function etc.
        registered_names = cls.list_available()
        chosen_name = params.pop_choice("type", registered_names)
        iterator_class = cls.by_name(chosen_name)
        return iterator_class.from_params(params)
コード例 #38
0
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'ProGlobal':
        """
        Build a ``ProGlobal`` model from a ``Params`` object.

        Three embedder configurations (``text_field_embedder``,
        ``pos_field_embedder``, ``sent_pos_field_embedder``) are popped first and
        then turned into ``TextFieldEmbedder`` objects.  Four ``Seq2SeqEncoder``
        configurations follow: ``modeling_layer``, ``span_end_encoder_bef``,
        ``span_start_encoder_aft`` and ``span_end_encoder_aft``.  ``dropout``
        defaults to ``0.2``; a default ``InitializerApplicator()`` is used when no
        ``initializer`` entry is present.
        """
        embedder_keys = ("text_field_embedder",
                         "pos_field_embedder",
                         "sent_pos_field_embedder")
        # Pop every embedder configuration before constructing any embedder.
        embedder_configs = [params.pop(key) for key in embedder_keys]
        embedders = {key: TextFieldEmbedder.from_params(vocab, config)
                     for key, config in zip(embedder_keys, embedder_configs)}

        modeling_layer = Seq2SeqEncoder.from_params(params.pop("modeling_layer"))
        end_before = Seq2SeqEncoder.from_params(params.pop("span_end_encoder_bef"))
        start_after = Seq2SeqEncoder.from_params(params.pop("span_start_encoder_aft"))
        end_after = Seq2SeqEncoder.from_params(params.pop("span_end_encoder_aft"))
        dropout = params.pop('dropout', 0.2)

        init_params = params.pop('initializer', None)
        if init_params is None:
            initializer = InitializerApplicator()
        else:
            initializer = InitializerApplicator.from_params(init_params)

        params.assert_empty(cls.__name__)
        return cls(vocab=vocab,
                   modeling_layer=modeling_layer,
                   span_start_encoder_after=start_after,
                   span_end_encoder_before=end_before,
                   span_end_encoder_after=end_after,
                   dropout=dropout,
                   initializer=initializer,
                   **embedders)
コード例 #39
0
ファイル: embedding.py プロジェクト: zyjcs/allennlp
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding':  # type: ignore
        """
        Build an ``Embedding`` from a ``Params`` object.

        The number of rows is taken from the ``num_embeddings`` key when it is
        given; otherwise it is the size of the vocabulary namespace named by
        ``vocab_namespace`` (default ``"tokens"``).  Pass ``num_embeddings``
        directly when the count is known up front or no vocabulary mapping is
        involved.

        Pretrained embeddings can be requested with the ``"pretrained_file"`` key,
        which may be a local path or the URL of a (cached) remote file.  Two formats
        are supported:

            * hdf5 file - holding the embedding matrix as a torch.Tensor;

            * text file - utf-8 encoded, with space separated fields::

                    [word] [dim 1] [dim 2] ...

              The text file may be compressed with gzip, bz2, lzma or zip.  A single
              file inside a multi-file archive can be selected with the URI::

                    "(archive_uri)#file_path_inside_the_archive"

              where ``archive_uri`` is a file system path or a URL, for example::

                    "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt"
        """
        # pylint: disable=arguments-differ
        explicit_size = params.pop_int('num_embeddings', None)
        vocab_namespace = params.pop("vocab_namespace", "tokens")
        num_embeddings = (vocab.get_vocab_size(vocab_namespace)
                          if explicit_size is None else explicit_size)
        embedding_dim = params.pop_int('embedding_dim')
        pretrained_file = params.pop("pretrained_file", None)
        module_options = {
            "projection_dim": params.pop_int("projection_dim", None),
            "trainable": params.pop_bool("trainable", True),
            "padding_index": params.pop_int('padding_index', None),
            "max_norm": params.pop_float('max_norm', None),
            "norm_type": params.pop_float('norm_type', 2.),
            "scale_grad_by_freq": params.pop_bool('scale_grad_by_freq', False),
            "sparse": params.pop_bool('sparse', False),
        }
        params.assert_empty(cls.__name__)

        # Only read the embedding file when one was configured.  When a saved model
        # is being loaded the embeddings are already in the saved weights, and the
        # original embedding file may no longer exist.
        weight = None
        if pretrained_file:
            weight = _read_pretrained_embeddings_file(pretrained_file,
                                                      embedding_dim,
                                                      vocab,
                                                      vocab_namespace)

        return cls(num_embeddings=num_embeddings,
                   embedding_dim=embedding_dim,
                   weight=weight,
                   **module_options)
コード例 #40
0
ファイル: trainer.py プロジェクト: richinkabra/CoVe-BCN
    def from_params(
            cls,  # type: ignore
            model: Model,
            serialization_dir: str,
            iterator: DataIterator,
            train_data: Iterable[Instance],
            validation_data: Optional[Iterable[Instance]],
            params: Params,
            validation_iterator: DataIterator = None) -> 'Trainer':
        # pylint: disable=arguments-differ
        """
        Build a trainer from a ``Params`` object.

        The model is moved to the GPU (first entry when ``cuda_device`` parses to a
        list) before the optimizer is created.  Optional components - moving
        average, learning rate scheduler, momentum scheduler - are only built when
        configured.  The checkpointer comes either from a ``checkpointer`` entry or
        from the keys ``num_serialized_models_to_keep`` and
        ``keep_serialized_model_every_num_seconds``; supplying both styles raises a
        ``ConfigurationError``.
        """
        trainer_kwargs = {
            "patience": params.pop_int("patience", None),
            "validation_metric": params.pop("validation_metric", "-loss"),
            "shuffle": params.pop_bool("shuffle", True),
            "num_epochs": params.pop_int("num_epochs", 20),
            "cuda_device": parse_cuda_device(params.pop("cuda_device", -1)),
            "grad_norm": params.pop_float("grad_norm", None),
            "grad_clipping": params.pop_float("grad_clipping", None),
        }
        lr_scheduler_config = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_config = params.pop("momentum_scheduler", None)

        cuda_device = trainer_kwargs["cuda_device"]
        primary_device = cuda_device[0] if isinstance(cuda_device, list) else cuda_device
        if primary_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(primary_device)

        trainable = [[name, tensor] for name, tensor in model.named_parameters()
                     if tensor.requires_grad]
        optimizer = Optimizer.from_params(trainable, params.pop("optimizer"))
        trainer_kwargs["moving_average"] = (
            MovingAverage.from_params(params.pop("moving_average"), parameters=trainable)
            if "moving_average" in params else None)

        trainer_kwargs["learning_rate_scheduler"] = (
            LearningRateScheduler.from_params(optimizer, lr_scheduler_config)
            if lr_scheduler_config else None)
        trainer_kwargs["momentum_scheduler"] = (
            MomentumScheduler.from_params(optimizer, momentum_scheduler_config)
            if momentum_scheduler_config else None)

        if 'checkpointer' not in params:
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=params.pop_int(
                    "num_serialized_models_to_keep", 20),
                keep_serialized_model_every_num_seconds=params.pop_int(
                    "keep_serialized_model_every_num_seconds", None))
        else:
            legacy_keys = ('keep_serialized_model_every_num_seconds',
                           'num_serialized_models_to_keep')
            if any(key in params for key in legacy_keys):
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods.")
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        trainer_kwargs["checkpointer"] = checkpointer

        trainer_kwargs.update(
            model_save_interval=params.pop_float("model_save_interval", None),
            summary_interval=params.pop_int("summary_interval", 100),
            histogram_interval=params.pop_int("histogram_interval", None),
            should_log_parameter_statistics=params.pop_bool(
                "should_log_parameter_statistics", True),
            should_log_learning_rate=params.pop_bool("should_log_learning_rate", False),
            log_batch_size_period=params.pop_int("log_batch_size_period", None))

        params.assert_empty(cls.__name__)
        return cls(model,
                   optimizer,
                   iterator,
                   train_data,
                   validation_data,
                   validation_iterator=validation_iterator,
                   serialization_dir=serialization_dir,
                   **trainer_kwargs)
コード例 #41
0
 def from_params(cls, params: Params) -> 'SimilarityFunction':
     """
     Dispatch to the ``from_params`` of the similarity function named by ``'type'``.

     ``'type'`` is popped with ``pop_choice`` against ``cls.list_available()`` with
     ``default_to_first_choice=True``; the remaining ``params`` are handed to the
     selected class.
     """
     registered_names = cls.list_available()
     selected_name = params.pop_choice('type', registered_names,
                                       default_to_first_choice=True)
     return cls.by_name(selected_name).from_params(params)
コード例 #42
0
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    Run an archived model over an evaluation set and write gold and prediction
    files through ``write_to_conll_eval_file``.

    serialization_directory : str, required.
        The directory containing ``config.json`` and ``model.tar.gz``.  The output
        files ``<prefix>_predictions.txt`` and ``<prefix>_gold.txt`` are written
        here as well.
    device : int, required.
        The device to run the evaluation on; passed to ``load_archive`` as
        ``cuda_device`` and to ``move_to_device``.
    data : str, required.
        The data to evaluate on.  When falsy (``None`` or ``""``), the
        ``validation_data_path`` from the original experiment's config is used.
    prefix : str, required.
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain : str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
        The domain is also prepended to ``prefix``.
    """
    config = Params.from_file(
        os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    evaluation_data_path = data if data else config["validation_data_path"]

    archive = load_archive(os.path.join(serialization_directory,
                                        "model.tar.gz"),
                           cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory,
                                        prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory,
                                  prefix + "_gold.txt")

    # Context managers guarantee both files are closed even when reading the data
    # or running the model raises part-way through.
    with open(prediction_file_path, "w+") as prediction_file, \
            open(gold_file_path, "w+") as gold_file:
        # Load the evaluation data and index it.
        print("reading evaluation data from {}".format(evaluation_data_path))
        dataset = dataset_reader.read(evaluation_data_path)

        with torch.autograd.no_grad():
            loader = DataLoader(dataset,
                                sampler=SequentialSampler(dataset),
                                batch_size=32)
            model_predictions: List[List[str]] = []
            for batch in Tqdm.tqdm(loader):
                batch = move_to_device(batch, device)
                result = model(**batch)
                predictions = model.decode(result)
                model_predictions.extend(predictions["tags"])

            # The sequential sampler keeps predictions aligned with dataset order.
            for instance, prediction in zip(dataset, model_predictions):
                metadata = instance.fields["metadata"]
                write_to_conll_eval_file(prediction_file, gold_file,
                                         metadata["verb_index"],
                                         metadata["words"], prediction,
                                         metadata["gold_tags"])
コード例 #43
0
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'BidirectionalAttentionFlow':
        """
        Build a ``BidirectionalAttentionFlow`` model from a ``Params`` object.

        Keys popped without a default: ``text_field_embedder``,
        ``num_highway_layers``, ``phrase_layer``, ``similarity_function``,
        ``modeling_layer`` and ``span_end_encoder``.  Optional keys: ``dropout``
        (``0.2``), ``initializer`` and ``regularizer`` (both fall back to an empty
        list) and ``mask_lstms`` (``True``).  ``params.assert_empty`` is called
        before the model is constructed.
        """
        text_field_embedder = TextFieldEmbedder.from_params(
            vocab, params.pop("text_field_embedder"))
        highway_count = params.pop_int("num_highway_layers")
        phrase_encoder = Seq2SeqEncoder.from_params(params.pop("phrase_layer"))
        attention_similarity = SimilarityFunction.from_params(
            params.pop("similarity_function"))
        modeling_encoder = Seq2SeqEncoder.from_params(params.pop("modeling_layer"))
        end_encoder = Seq2SeqEncoder.from_params(params.pop("span_end_encoder"))
        dropout_rate = params.pop_float('dropout', 0.2)

        initializer_config = params.pop('initializer', [])
        initializer = InitializerApplicator.from_params(initializer_config)
        regularizer_config = params.pop('regularizer', [])
        regularizer = RegularizerApplicator.from_params(regularizer_config)

        use_lstm_mask = params.pop_bool('mask_lstms', True)
        params.assert_empty(cls.__name__)

        model_kwargs = dict(vocab=vocab,
                            text_field_embedder=text_field_embedder,
                            num_highway_layers=highway_count,
                            phrase_layer=phrase_encoder,
                            attention_similarity_function=attention_similarity,
                            modeling_layer=modeling_encoder,
                            span_end_encoder=end_encoder,
                            dropout=dropout_rate,
                            mask_lstms=use_lstm_mask,
                            initializer=initializer,
                            regularizer=regularizer)
        return cls(**model_kwargs)
コード例 #44
0
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'ModelMSMARCO':
        """
        Build a ``ModelMSMARCO`` from a ``Params`` object.

        Keys popped without a default: ``text_field_embedder``, ``phrase_layer``,
        ``similarity_function``, ``residual_encoder``, ``span_start_encoder`` and
        ``span_end_encoder``.  Optional keys: ``dropout`` (``0.2``), ``initializer``
        and ``regularizer`` (both fall back to an empty list) and ``mask_lstms``
        (``True``).

        NOTE: ``num_highway_layers`` and ``feed_forward`` are not read here, so
        such entries would still be present when ``params.assert_empty`` runs.
        """
        model_kwargs = {"vocab": vocab}
        model_kwargs["text_field_embedder"] = TextFieldEmbedder.from_params(
            vocab, params.pop("text_field_embedder"))
        model_kwargs["phrase_layer"] = Seq2SeqEncoder.from_params(
            params.pop("phrase_layer"))
        model_kwargs["attention_similarity_function"] = SimilarityFunction.from_params(
            params.pop("similarity_function"))
        # The three remaining encoders map one-to-one onto constructor arguments.
        for encoder_key in ("residual_encoder", "span_start_encoder", "span_end_encoder"):
            model_kwargs[encoder_key] = Seq2SeqEncoder.from_params(params.pop(encoder_key))
        model_kwargs["dropout"] = params.pop_float('dropout', 0.2)

        model_kwargs["initializer"] = InitializerApplicator.from_params(
            params.pop('initializer', []))
        model_kwargs["regularizer"] = RegularizerApplicator.from_params(
            params.pop('regularizer', []))

        model_kwargs["mask_lstms"] = params.pop_bool('mask_lstms', True)
        params.assert_empty(cls.__name__)
        return cls(**model_kwargs)
コード例 #45
0
ファイル: train.py プロジェクト: mdunnlp/allennlp
def create_serialization_dir(params: Params, serialization_dir: str,
                             recover: bool) -> None:
    """
    This function creates the serialization directory if it doesn't exist.  If it already exists,
    then it verifies that we're recovering from a training with an identical configuration.

    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: ``str``
        The directory in which to save results and logs.
    recover: ``bool``
        If ``True``, we will try to recover from an existing serialization directory, and crash if
        the directory doesn't exist, or doesn't match the configuration we're given.

    Raises
    ------
    ConfigurationError
        If the directory exists and ``recover`` is ``False``; if it exists but holds no
        recovered config file; if the recovered configuration differs from ``params``; or if
        ``recover`` is ``True`` and the directory does not exist.
    """
    if not os.path.exists(serialization_dir):
        if recover:
            raise ConfigurationError(
                f"--recover specified but serialization_dir ({serialization_dir}) "
                "does not exist.  There is nothing to recover from.")
        os.makedirs(serialization_dir)
        return

    if serialization_dir == '/output':
        # Special-casing the beaker output directory, which will already exist when training
        # starts.
        return
    if not recover:
        raise ConfigurationError(
            f"Serialization directory ({serialization_dir}) already exists.  "
            f"Specify --recover to recover training from existing output.")

    logger.info(f"Recovering from prior training at {serialization_dir}.")

    recovered_config_file = os.path.join(serialization_dir, CONFIG_NAME)
    if not os.path.exists(recovered_config_file):
        raise ConfigurationError(
            "The serialization directory already exists but doesn't "
            "contain a config.json. You probably gave the wrong directory."
        )
    loaded_params = Params.from_file(recovered_config_file)

    # Check whether any of the training configuration differs from the configuration we are
    # resuming.  Every difference is logged before failing so the user sees them all at once.
    fail = False
    flat_params = params.as_flat_dict()
    flat_loaded = loaded_params.as_flat_dict()
    for key in flat_params.keys() - flat_loaded.keys():
        logger.error(
            f"Key '{key}' found in training configuration but not in the serialization "
            f"directory we're recovering from.")
        fail = True
    for key in flat_loaded.keys() - flat_params.keys():
        logger.error(
            f"Key '{key}' found in the serialization directory we're recovering from "
            f"but not in the training config.")
        fail = True
    for key, value in flat_params.items():
        # Only compare keys present on both sides: keys missing from the recovered
        # config were reported above, and indexing ``flat_loaded`` with them would
        # raise a KeyError instead of the intended ConfigurationError.
        if key in flat_loaded and value != flat_loaded[key]:
            logger.error(
                f"Value for '{key}' in training configuration does not match that the value in "
                f"the serialization directory we're recovering from: "
                f"{value} != {flat_loaded[key]}")
            fail = True
    if fail:
        raise ConfigurationError(
            "Training configuration does not match the configuration we're "
            "recovering from.")
コード例 #46
0
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run Learning rate finder.
    linear_steps: ``bool``
        Forwarded to ``search_learning_rate``.  Presumably ``True`` increases the learning
        rate linearly and ``False`` exponentially - confirm against that function.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.

    Raises
    ------
    ConfigurationError
        If the serialization directory exists and is not empty (after any ``force``
        removal), if ``datasets_for_vocab_creation`` names an unknown dataset, or if the
        trainer ``type`` is not ``"default"``.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(
            f'Serialization directory {serialization_dir} already exists and is '
            f'not empty.')
    os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    # Filter on the dataset key before descending into its instances, so datasets
    # excluded from vocabulary creation are never iterated at all.
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance
         for key, dataset in all_datasets.items()
         if key in datasets_for_vocab_creation
         for instance in dataset))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the "no_grad" regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer_choice = trainer_params.pop("type", "default")
    if trainer_choice != "default":
        raise ConfigurationError(
            "currently find-learning-rate only works with the default Trainer")
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=None,
                                  params=trainer_params,
                                  validation_iterator=None)

    logger.info(
        f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.'
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')
    # Smooth the recorded losses before plotting them against the learning rates.
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses,
               os.path.join(serialization_dir, 'lr-losses.png'))
コード例 #47
0
ファイル: train.py プロジェクト: zyxdSTU/allennlp
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.

    Raises
    ------
    ValueError
        If ``evaluate_on_test`` is set in ``params`` while the configured trainer type is not
        ``"default"``.
    KeyboardInterrupt
        Re-raised after an archive attempt when training is interrupted by the user.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging)
    prepare_environment(params)

    # Read the device without popping it: the trainer section is consumed later on.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    # Write the configuration out before any of its keys are popped below.
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params,  # pylint: disable=no-member
                                           serialization_dir,
                                           recover,
                                           cache_directory,
                                           cache_prefix)
        trainer = Trainer.from_params(
                model=pieces.model,
                serialization_dir=serialization_dir,
                iterator=pieces.iterator,
                train_data=pieces.train_dataset,
                validation_data=pieces.validation_dataset,
                params=pieces.params,
                validation_iterator=pieces.validation_iterator)

        # Fall back to the training iterator when no validation iterator was configured.
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    else:
        if evaluate_on_test:
            raise ValueError("--evaluate-on-test only works with the default Trainer. "
                             "If you're using the CallbackTrainer you can use a callback "
                             "to evaluate at Events.TRAINING_END; otherwise you'll have "
                             "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        # No test-set evaluation on this path, so ``evaluation_iterator`` is never needed.
        evaluation_dataset = None

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # Use the module logger (as everywhere else in this function) rather than the
            # root logger, so the record carries this module's name.
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Evaluate
    if evaluation_dataset and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(trainer.model, evaluation_dataset, evaluation_iterator,
                                cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
                                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                                batch_weight_key="")

        # Namespace the test metrics so they sit next to the training metrics in one dict.
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif evaluation_dataset:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    # We count on the trainer to have the model with best weights
    return trainer.model
コード例 #48
0
ファイル: train.py プロジェクト: siddsach/allennlp
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    model : ``Model``
        The model object that was built from ``params`` and handed to the trainer.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    # Mirror stdout/stderr into log files inside the serialization directory.
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Serialize a copy of the config, since the pops below consume the original.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    # When the key is absent, every loaded dataset contributes to the vocabulary.
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            # Name the real config key so the user can find the offending entry.
            raise ConfigurationError(f"invalid 'datasets_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    # Validation and test data are optional; ``get`` yields ``None`` when they are absent.
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        # Prefix the test metrics so they can live in the same dict as the training metrics.
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
コード例 #49
0
class ArchivalTest(AllenNlpTestCase):
    """Tests that a trained model can be archived and loaded back."""

    def setUp(self):
        """Build the small tagger experiment configuration shared by the tests."""
        super().setUp()

        fixture_path = 'tests/fixtures/data/sequence_tagging.tsv'
        model_config = {
            "type": "simple_tagger",
            "text_field_embedder": {
                "tokens": {"type": "embedding", "embedding_dim": 5}
            },
            "stacked_encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            }
        }
        self.params = Params({
            "model": model_config,
            "dataset_reader": {"type": "sequence_tagging"},
            "train_data_path": fixture_path,
            "validation_data_path": fixture_path,
            "iterator": {"type": "basic", "batch_size": 2},
            "trainer": {"num_epochs": 2, "optimizer": "adam"}
        })

    def test_archiving(self):
        """The archive written by ``train_model`` reproduces weights, vocab and config."""
        # Training consumes the params, so take a snapshot to compare against later.
        expected_config = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        trained = train_model(self.params, serialization_dir=self.TEST_DIR)

        # load from the archive
        archive = load_archive(os.path.join(self.TEST_DIR, "model.tar.gz"))
        restored = archive.model

        # check that model weights are the same
        trained_state = trained.state_dict()
        restored_state = restored.state_dict()
        weight_names = set(trained_state.keys())

        assert weight_names == set(restored_state.keys())

        for weight_name in weight_names:
            assert torch.equal(trained_state[weight_name], restored_state[weight_name])

        # check that vocabularies are the same
        trained_vocab = trained.vocab
        restored_vocab = restored.vocab

        # pylint: disable=protected-access
        assert trained_vocab._token_to_index == restored_vocab._token_to_index
        assert trained_vocab._index_to_token == restored_vocab._index_to_token
        # pylint: enable=protected-access

        # check that params are the same
        assert archive.config.as_dict() == expected_config

    def test_extra_files(self):
        """Files passed to ``archive_model`` get their config paths rewritten on load."""
        serialization_dir = os.path.join(self.TEST_DIR, 'serialization')

        # Train a model
        train_model(self.params, serialization_dir=serialization_dir)

        # Archive model, and also archive the training data
        archive_model(
            serialization_dir=serialization_dir,
            files_to_archive={"train_data_path": 'tests/fixtures/data/sequence_tagging.tsv'})

        loaded_config = load_archive(os.path.join(serialization_dir, 'model.tar.gz')).config

        # The param in the data should have been replaced with a temporary path
        # (which we don't know, but we know what it ends with).
        train_path = loaded_config.get('train_data_path')
        assert train_path.endswith('/fta/train_data_path')

        # The validation data path should be the same though.
        validation_path = loaded_config.get('validation_data_path')
        assert validation_path == 'tests/fixtures/data/sequence_tagging.tsv'
コード例 #50
0
    print(infos["extra.walkthrough"])

    cnt = 0
    while not all(dones) and cnt < 50:
        infos["gamefile"] = game_file
        commands = actor.act(obs, cumulative_rewards, dones, infos)
        obs, cumulative_rewards, dones, infos = env.step(commands)
        cnt += 1
    infos["gamefile"] = game_file
    actor.act(obs, cumulative_rewards, dones, infos)
    print(get_sample_history_trace(actor.history, game_file))


if __name__ == "__main__":
    # Script entry point: takes one positional argument, the name of a game file that is
    # looked up under ``games/train/``, and runs ``check_agent`` on it with a network
    # restored from the checkpoint named in the config.
    parser = ArgumentParser()
    parser.add_argument("game_file", type=str)
    args = parser.parse_args()
    params = Params.from_file("configs/config.jsonnet")
    agent = SimpleNet(
        config=params["network"],
        device="cpu",
        vocab_size=params["training"]["vocab_size"],
    )
    # The network is built for the CPU, so remap the checkpoint's tensors to the CPU as
    # well; without this a checkpoint saved from a GPU fails to load on a CPU-only host.
    agent.load_state_dict(
        torch.load(params["training"]["model_path"], map_location="cpu"))

    game_file = f"games/train/{args.game_file}"
    # The "training" section is popped only here, after the lookups above have read it.
    check_agent(game_file=game_file,
                agent_net=agent,
                train_params=params.pop("training"))
コード例 #51
0
    def ensure_model_can_train_save_and_load(
        self,
        param_file: str,
        tolerance: float = 1e-4,
        cuda_device: int = -1,
        gradients_to_ignore: Set[str] = None,
        overrides: str = "",
        disable_dropout: bool = True,
    ):
        """
        Train a model from a config file, reload it from its archive, and assert that the
        trained and reloaded models have the same weights, batches and predictions.

        # Parameters

        param_file : `str`
            Path to the training configuration file used to train the model for this test.
        tolerance : `float`, optional (default=1e-4)
            Tolerance forwarded to `assert_fields_equal` when the predictions of the two
            models are compared.
        cuda_device : `int`, optional (default=-1)
            Device forwarded to `load_archive` when the saved model is loaded.
        gradients_to_ignore : `Set[str]`, optional (default=None)
            Forwarded to `check_model_computes_gradients_correctly`; parameter names that
            the gradient check should skip.
        overrides : `str`, optional (default = "")
            A JSON string used to override values in the input parameter file, both for
            training and when the file is re-read for the data checks.
        disable_dropout : `bool`, optional (default = True)
            Forwarded to `check_model_computes_gradients_correctly`.

        # Returns

        A `(model, loaded_model)` pair: the freshly trained model and the one restored
        from the archive.
        """
        output_dir = self.TEST_DIR / "save_and_load_test"
        trained = train_model_from_file(param_file, output_dir, overrides=overrides)
        restored = load_archive(output_dir / "model.tar.gz", cuda_device=cuda_device).model

        # The parameters stored in the two state dicts must match name for name...
        trained_keys = trained.state_dict().keys()
        restored_keys = restored.state_dict().keys()
        assert trained_keys == restored_keys
        # ...and value for value.
        for weight_name in trained_keys:
            assert_allclose(
                trained.state_dict()[weight_name].cpu().numpy(),
                restored.state_dict()[weight_name].cpu().numpy(),
                err_msg=weight_name,
            )

        params = Params.from_file(param_file, params_overrides=overrides)
        reader = DatasetReader.from_params(params["dataset_reader"])

        # `from_params` consumes its argument, so copy the iterator config before the
        # first iterator is built from it.
        first_iterator_params = params["iterator"]
        second_iterator_params = Params(copy.deepcopy(first_iterator_params.as_dict()))
        trained_iterator = DataIterator.from_params(first_iterator_params)
        restored_iterator = DataIterator.from_params(second_iterator_params)

        validation_path = params["validation_data_path"]

        # Index the same data once per model's vocabulary; the batches should still agree.
        print("Reading with original model")
        trained_dataset = reader.read(validation_path)
        trained_iterator.index_with(trained.vocab)
        trained_batch = next(trained_iterator(trained_dataset, shuffle=False))

        print("Reading with loaded model")
        restored_dataset = reader.read(validation_path)
        restored_iterator.index_with(restored.vocab)
        restored_batch = next(restored_iterator(restored_dataset, shuffle=False))

        # Gradient sanity check on the trained model, run before switching to eval mode.
        self.check_model_computes_gradients_correctly(
            trained, trained_batch, gradients_to_ignore, disable_dropout)

        # The two batches must contain the same fields with (nearly) the same contents.
        assert trained_batch.keys() == restored_batch.keys()
        for field_name in trained_batch:
            self.assert_fields_equal(
                trained_batch[field_name], restored_batch[field_name], field_name, 1e-6)

        # Eval mode turns off things like dropout before predictions are compared.
        trained.eval()
        restored.eval()
        # Stateful modules need their states reset to behave consistently after loading.
        for candidate in (trained, restored):
            for module in candidate.modules():
                if getattr(module, "stateful", False):
                    module.reset_states()

        print("Predicting with original model")
        trained_predictions = trained(**trained_batch)
        print("Predicting with loaded model")
        restored_predictions = restored(**restored_batch)

        # The reloaded model must yield a loss we can backpropagate through, so that
        # training could be continued from the archive.
        restored_loss = restored_predictions["loss"]
        assert restored_loss is not None
        restored_loss.backward()

        # Every output of the trained model must be matched by the reloaded model.
        for output_name in trained_predictions:
            self.assert_fields_equal(
                trained_predictions[output_name],
                restored_predictions[output_name],
                name=output_name,
                tolerance=tolerance,
            )

        return trained, restored
コード例 #52
0
ファイル: model_test_case.py プロジェクト: feiga/bilm-tf
    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        """
        Train a model from ``param_file``, reload it from the resulting archive, and assert
        that both models share weights, produce identical batches and agree on predictions.

        ``tolerance`` is forwarded to ``assert_fields_equal`` for the prediction comparison;
        ``cuda_device`` is forwarded to ``load_archive`` and to both iterators.

        Returns the ``(model, loaded_model)`` pair.
        """
        output_dir = self.TEST_DIR / "save_and_load_test"
        trained = train_model_from_file(param_file, output_dir)
        restored = load_archive(output_dir / "model.tar.gz", cuda_device=cuda_device).model

        # Same parameter names on both sides, then same values for each name.
        trained_keys = trained.state_dict().keys()
        restored_keys = restored.state_dict().keys()
        assert trained_keys == restored_keys
        for weight_name in trained_keys:
            expected = trained.state_dict()[weight_name].cpu().numpy()
            actual = restored.state_dict()[weight_name].cpu().numpy()
            assert_allclose(expected, actual, err_msg=weight_name)

        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # `from_params` consumes its argument, so duplicate the iterator config first.
        first_iterator_params = params['iterator']
        second_iterator_params = Params(copy.deepcopy(first_iterator_params.as_dict()))
        trained_iterator = DataIterator.from_params(first_iterator_params)
        restored_iterator = DataIterator.from_params(second_iterator_params)

        validation_path = params['validation_data_path']

        # Index the data separately with each model's vocabulary; results should not differ.
        trained_dataset = reader.read(validation_path)
        trained_iterator.index_with(trained.vocab)
        trained_batches = trained_iterator(trained_dataset,
                                           shuffle=False,
                                           cuda_device=cuda_device)
        trained_batch = next(trained_batches)

        restored_dataset = reader.read(validation_path)
        restored_iterator.index_with(restored.vocab)
        restored_batches = restored_iterator(restored_dataset,
                                             shuffle=False,
                                             cuda_device=cuda_device)
        restored_batch = next(restored_batches)

        # Gradient sanity check on the trained model, done before eval mode is set.
        self.check_model_computes_gradients_correctly(trained, trained_batch)

        # Both batches must hold the same fields with (nearly) the same contents.
        assert trained_batch.keys() == restored_batch.keys()
        for field_name in trained_batch:
            self.assert_fields_equal(
                trained_batch[field_name], restored_batch[field_name], field_name, 1e-6)

        # Eval mode turns off things like dropout before predictions are compared.
        trained.eval()
        restored.eval()
        # Stateful modules need their states reset to behave consistently after loading.
        for candidate in (trained, restored):
            for module in candidate.modules():
                if getattr(module, 'stateful', False):
                    module.reset_states()
        trained_predictions = trained(**trained_batch)
        restored_predictions = restored(**restored_batch)

        # The reloaded model must yield a loss that supports a backward pass.
        restored_loss = restored_predictions["loss"]
        assert restored_loss is not None
        restored_loss.backward()

        # Every output of the trained model must be matched by the reloaded model.
        for output_name in trained_predictions:
            self.assert_fields_equal(trained_predictions[output_name],
                                     restored_predictions[output_name],
                                     name=output_name,
                                     tolerance=tolerance)

        return trained, restored
コード例 #53
0
    def test_train_model(self):
        """Exercise ``train_model`` against new, empty, dirty and reused serialization dirs."""

        def fresh_params():
            # Each training call gets its own Params object built from the same settings.
            tagger = {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {"type": "embedding", "embedding_dim": 5}
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2,
                },
            }
            return Params({
                "model": tagger,
                "dataset_reader": {"type": "sequence_tagging"},
                "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
                "data_loader": {"batch_size": 2},
                "trainer": {"num_epochs": 2, "optimizer": "adam"},
            })

        trained_dir = os.path.join(self.TEST_DIR, "test_train_model")
        train_model(fresh_params(), serialization_dir=trained_dir)

        # It's OK if serialization dir exists but is empty:
        empty_dir = os.path.join(self.TEST_DIR, "empty_directory")
        assert not os.path.exists(empty_dir)
        os.makedirs(empty_dir)
        train_model(fresh_params(), serialization_dir=empty_dir)

        # It's not OK if serialization dir exists and already has unrelated files in it:
        dirty_dir = os.path.join(self.TEST_DIR, "non_empty_directory")
        assert not os.path.exists(dirty_dir)
        os.makedirs(dirty_dir)
        with open(os.path.join(dirty_dir, "README.md"), "w") as readme:
            readme.write("TEST")

        with pytest.raises(ConfigurationError):
            train_model(fresh_params(), serialization_dir=dirty_dir)

        # It's also not OK if serialization dir is a real serialization dir:
        with pytest.raises(ConfigurationError):
            train_model(fresh_params(), serialization_dir=trained_dir)

        # But it's OK if serialization dir exists and --recover is specified:
        train_model(fresh_params(), serialization_dir=trained_dir, recover=True)

        # It's ok serialization dir exists and --force is specified (it will be deleted):
        train_model(fresh_params(), serialization_dir=trained_dir, force=True)

        # But --force and --recover cannot both be specified
        with pytest.raises(ConfigurationError):
            train_model(fresh_params(),
                        serialization_dir=trained_dir,
                        force=True,
                        recover=True)
コード例 #54
0
    def test_train_model_distributed_with_sharded_reader(self):
        """Train on two devices with a sharded reader and inspect the per-worker output."""

        def build_params():
            tagger = {
                "type": "simple_tagger",
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {"type": "embedding", "embedding_dim": 5}
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 5,
                    "hidden_size": 7,
                    "num_layers": 2,
                },
            }
            sharded_reader = {
                "type": "sharded",
                "base_reader": {"type": "sequence_tagging"},
                "lazy": True,
            }
            return Params({
                "model": tagger,
                "dataset_reader": sharded_reader,
                "train_data_path": SEQUENCE_TAGGING_SHARDS_PATH,
                "validation_data_path": SEQUENCE_TAGGING_SHARDS_PATH,
                "data_loader": {"batch_size": 2},
                "trainer": {"num_epochs": 2, "optimizer": "adam"},
                "distributed": {"cuda_devices": [0, 1]},
            })

        out_dir = os.path.join(self.TEST_DIR, "test_distributed_train")
        train_model(build_params(), serialization_dir=out_dir)

        # Check that some logs specific to distributed
        # training are where we expect.
        produced_files = os.listdir(out_dir)
        assert "stderr_worker0.log" in produced_files
        assert "stdout_worker0.log" in produced_files
        assert "stderr_worker1.log" in produced_files
        assert "stdout_worker1.log" in produced_files
        assert "model.tar.gz" in produced_files

        # Check we can load the serialized model
        archive = load_archive(out_dir)
        assert archive.model

        # Check that we created a vocab from all the shards.
        expected_tokens = {
            "@@PADDING@@",
            "@@UNKNOWN@@",
            "are",
            ".",
            "animals",
            "plants",
            "vehicles",
            "cats",
            "dogs",
            "snakes",
            "birds",
            "ferns",
            "trees",
            "flowers",
            "vegetables",
            "cars",
            "buses",
            "planes",
            "rockets",
        }
        vocab_tokens = archive.model.vocab._token_to_index["tokens"].keys()
        assert vocab_tokens == expected_tokens

        # TODO: This is somewhat brittle. Make these constants in trainer.py.
        train_early = "finishing training early!"
        validation_early = "finishing validation early!"
        train_complete = "completed its entire epoch (training)."
        validation_complete = "completed its entire epoch (validation)."

        # There are three shards, but only two workers, so the first worker will have to
        # discard some data.
        with open(os.path.join(out_dir, "stdout_worker0.log")) as log_file:
            worker0_log = log_file.read()
        assert train_early in worker0_log
        assert validation_early in worker0_log
        assert train_complete not in worker0_log
        assert validation_complete not in worker0_log

        with open(os.path.join(out_dir, "stdout_worker1.log")) as log_file:
            worker1_log = log_file.read()
        assert train_early not in worker1_log
        assert validation_early not in worker1_log
        assert train_complete in worker1_log
        assert validation_complete in worker1_log
コード例 #55
0
    def from_params(cls, params: Params) -> 'Elmo':
        """
        Build an instance of ``cls`` from ``params``.

        ``options_file``, ``weight_file`` and ``num_output_representations`` are popped
        without a default; the remaining keys fall back to the defaults given below.
        ``params.assert_empty`` is called once every key has been popped.
        """
        # Register both files with the archive before their keys are popped.
        for file_key in ('options_file', 'weight_file'):
            params.add_file_to_archive(file_key)

        # Entries are evaluated top to bottom, so the keys are popped in this order.
        constructor_kwargs = {
            'options_file': params.pop('options_file'),
            'weight_file': params.pop('weight_file'),
            'requires_grad': params.pop('requires_grad', False),
            'num_output_representations': params.pop('num_output_representations'),
            'do_layer_norm': params.pop_bool('do_layer_norm', False),
            'keep_sentence_boundaries': params.pop_bool('keep_sentence_boundaries', False),
            'dropout': params.pop_float('dropout', 0.5),
            'scalar_mix_parameters': params.pop('scalar_mix_parameters', None),
        }
        params.assert_empty(cls.__name__)

        return cls(**constructor_kwargs)
コード例 #56
0
 def test_can_build_from_params(self):
     """An empty config yields a reader with the expected tokenizer and indexer classes."""
     reader = SquadReaderV2.from_params(Params({}))
     # pylint: disable=protected-access
     tokenizer_class = reader._tokenizer.__class__
     assert tokenizer_class.__name__ == 'WordTokenizer'
     indexer_class = reader._token_indexers["tokens"].__class__
     assert indexer_class.__name__ == 'SingleIdTokenIndexer'
コード例 #57
0
def debug_vocab(parameter_filename: str,
                serialization_dir: str,
                overrides: str = "",
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Builds the vocabulary and model described by a parameter file and prints a short
    report about the ``tokens`` and ``labels`` namespaces of the model's vocabulary.

    No training is run here: the function loads the datasets, creates the vocabulary,
    instantiates the model, and then prints the OOV token, the non-padded namespaces, and,
    for each of the two namespaces, its size, whether it counts as padded, and whether the
    OOV token is present in it.

    Parameters
    ----------
    parameter_filename : ``str``
        A json parameter file specifying an AllenNLP experiment.
    serialization_dir : ``str``
        The directory passed to :func:`prepare_global_logging`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        Passed along to :func:`prepare_global_logging`.
    recover : ``bool``, optional (default=False)
        Not used by this function; kept so that existing callers keep working.
    force : ``bool``, optional (default=False)
        Not used by this function; kept so that existing callers keep working.

    Returns
    -------
    model : ``Model``
        The model built from the ``model`` section of the parameters.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    params = Params.from_file(parameter_filename, overrides)

    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.get('trainer').get('cuda_device', -1))

    all_datasets = datasets_from_params(params)
    # When the key is absent, every loaded dataset contributes to the vocabulary.
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'datasets_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Inspect the vocabulary the model actually holds, not the local one built above.
    # pylint: disable=protected-access
    vocab = model.vocab
    vocab_namespace_dict = vocab._token_to_index
    vocab_oov_token = vocab._oov_token
    vocab_non_padded_namespaces = vocab._non_padded_namespaces  # this is a set

    # Both namespaces are looked up before anything is printed.
    vocab_tokens_dict = vocab_namespace_dict['tokens']
    vocab_labels_dict = vocab_namespace_dict['labels']

    def _report_namespace(namespace, namespace_dict, oov_expected):
        """Print size, padding status and OOV membership for one namespace."""
        print(f"Number of words in vocab's {namespace} dict: " +
              str(len(namespace_dict)))
        # A namespace counts as padded unless it matches a non-padded pattern.
        is_padded = not any(
            namespace_match(pattern, namespace)
            for pattern in vocab_non_padded_namespaces)
        print(f"{namespace} will return True for is_padded: " + str(is_padded))
        print(f"Vocab's OOV token is in its {namespace} dict "
              f"(should be {oov_expected}): " +
              str(vocab_oov_token in namespace_dict))

    print()
    print("Vocab's OOV token: " + vocab_oov_token)
    print("Non-padded namespaces in vocab: " +
          str(list(vocab_non_padded_namespaces)))
    print()

    _report_namespace('tokens', vocab_tokens_dict, True)
    print()
    _report_namespace('labels', vocab_labels_dict, False)

    # The signature promises a ``Model``; hand back the one that was inspected.
    return model
コード例 #58
0
ファイル: train.py プロジェクト: mdunnlp/allennlp
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Besides training, this function writes the configuration, the vocabulary, a model archive
    and a ``metrics.json`` file into ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    KeyboardInterrupt
        Re-raised after an attempt to archive the current best weights, if any exist.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Snapshot the full configuration now: the ``pop`` calls below consume ``params``.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    # When the key is absent, every loaded dataset contributes to the vocabulary.
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'datasets_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    # Validation and test data are optional, hence ``get``.
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # Every key should have been consumed by now; leftovers are reported here.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # Use the module logger, like every other message in this function.
            logger.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    # The best weights are loaded into the same model object that was trained.
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(best_model,
                                test_data,
                                iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        # Prefix test metrics so they cannot collide with the training metrics.
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return best_model
コード例 #59
0
ファイル: train.py プロジェクト: voidforall/DialSummar
import torch
import torch.optim as optim

# Sizes shared by the embedder configuration and both encoders.
EMBEDDING_DIM = 128
HIDDEN_DIM = 256


def _build_bilstm_encoder():
    """Wrap a new bidirectional, batch-first LSTM in a ``PytorchSeq2SeqWrapper``."""
    lstm = torch.nn.LSTM(input_size=EMBEDDING_DIM,
                         hidden_size=HIDDEN_DIM,
                         bidirectional=True,
                         batch_first=True)
    return PytorchSeq2SeqWrapper(lstm)


# Read the training and validation files with one shared reader.
reader = MultiwozReader()
print("Reading the delexiclaized data from training set and validation set...")
train_dataset = reader.read("./data/train_delex.json")
valid_dataset = reader.read("./data/valid_delex.json")

# The vocabulary is built from both splits together.
print("Building vocabulary from training set and validation set...")
vocab = Vocabulary.from_instances(train_dataset + valid_dataset)
print("Temporary vocabulary has been built.")

# Embedder configuration for the "tokens" key.
params = Params({
    "token_embedders": {
        "tokens": {
            "type": "embedding",
            "embedding_dim": EMBEDDING_DIM,
        },
    },
})
source_embedder = BasicTextFieldEmbedder.from_params(vocab, params=params)

# Two separately constructed encoders with the same configuration.
encoder1 = _build_bilstm_encoder()
encoder2 = _build_bilstm_encoder()
attention = DotProductAttention()

print("Use SPNet with default setting...")
model = SPNet(vocab, source_embedder, encoder1, encoder2, attention)
コード例 #60
0
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> "ElmoTokenEmbedder":  # type: ignore
        """
        Construct an instance of ``cls`` from a ``Params`` configuration.

        ``options_file`` and ``weight_file`` are registered through
        ``params.add_file_to_archive`` and then popped without a default; all other
        keys are optional.  When ``namespace_to_cache`` is given, the tokens of that
        vocabulary namespace are collected into a list and passed on as
        ``vocab_to_cache``; otherwise ``vocab_to_cache`` is ``None``.
        """
        # Register both files for archiving before their values are consumed.
        for archived_key in ("options_file", "weight_file"):
            params.add_file_to_archive(archived_key)

        options_file = params.pop("options_file")
        weight_file = params.pop("weight_file")
        requires_grad = params.pop("requires_grad", False)
        do_layer_norm = params.pop_bool("do_layer_norm", False)
        dropout = params.pop_float("dropout", 0.5)

        # Only consult the vocabulary when a namespace was actually requested.
        vocab_to_cache = None
        namespace_to_cache = params.pop("namespace_to_cache", None)
        if namespace_to_cache is not None:
            namespace_index = vocab.get_token_to_index_vocabulary(namespace_to_cache)
            vocab_to_cache = list(namespace_index.keys())

        projection_dim = params.pop_int("projection_dim", None)
        scalar_mix_parameters = params.pop("scalar_mix_parameters", None)
        params.assert_empty(cls.__name__)

        embedder = cls(
            options_file=options_file,
            weight_file=weight_file,
            do_layer_norm=do_layer_norm,
            dropout=dropout,
            requires_grad=requires_grad,
            projection_dim=projection_dim,
            vocab_to_cache=vocab_to_cache,
            scalar_mix_parameters=scalar_mix_parameters,
        )
        return embedder