Example #1
    @classmethod
    def from_partial_objects(
        cls,
        serialization_dir: str,
        local_rank: int,
        batch_weight_key: str,
        dataset_reader: DatasetReader,
        train_data_path: str,
        model: Lazy[Model],
        iterator: DataIterator,
        trainer: Lazy[TrainerBase],
        vocabulary: Lazy[Vocabulary] = None,
        datasets_for_vocab_creation: List[str] = None,
        validation_dataset_reader: DatasetReader = None,
        validation_data_path: str = None,
        validation_iterator: DataIterator = None,
        test_data_path: str = None,
        evaluate_on_test: bool = False,
    ) -> "TrainModel":
        """
        This method is intended for use with our `FromParams` logic, to construct a `TrainModel`
        object from a config file passed to the `allennlp train` command.  The arguments to this
        method are the allowed top-level keys in a configuration file (except for the first three,
        which are obtained separately).

        You *could* use this outside of our `FromParams` logic if you really want to, but there
        might be easier ways to accomplish your goal than instantiating `Lazy` objects.  If you are
        writing your own training loop, we recommend that you look at the implementation of this
        method for inspiration and possibly some utility functions you can call, but you very likely
        should not use this method directly.

        The `Lazy` type annotations here are a mechanism for building dependencies to an object
        sequentially - the `TrainModel` object needs data, a model, and a trainer, but the model
        needs to see the data before it's constructed (to create a vocabulary) and the trainer needs
        the data and the model before it's constructed.  Objects that have sequential dependencies
        like this are labeled as `Lazy` in their type annotations, and we pass the missing
        dependencies when we call their `construct()` method, which you can see in the code below.

        # Parameters
        serialization_dir: `str`
            The directory where logs and model archives will be saved.
        local_rank: `int`
            The process index that is initialized using the GPU device id.
        batch_weight_key: `str`
            The name of the metric used to weight the loss on a per-batch basis.
        dataset_reader: `DatasetReader`
            The `DatasetReader` that will be used for training and (by default) for validation.
        train_data_path: `str`
            The file (or directory) that will be passed to `dataset_reader.read()` to construct the
            training data.
        model: `Lazy[Model]`
            The model that we will train.  This is lazy because it depends on the `Vocabulary`;
            after constructing the vocabulary we call `model.construct(vocab=vocabulary)`.
        iterator: `DataIterator`
            The iterator we use to batch instances from the dataset reader at training and (by
            default) validation time.
        trainer: `Lazy[TrainerBase]`
            The `Trainer` that actually implements the training loop.  This is a lazy object because
            it depends on the model that's going to be trained.
        vocabulary: `Lazy[Vocabulary]`, optional (default=None)
            The `Vocabulary` that we will use to convert strings in the data to integer ids (and
            possibly set sizes of embedding matrices in the `Model`).  By default we construct the
            vocabulary from the instances that we read.
        datasets_for_vocab_creation: `List[str]`, optional (default=None)
            If you pass in more than one dataset but don't want to use all of them to construct a
            vocabulary, you can pass in this key to limit it.  Valid entries in the list are
            "train", "validation" and "test".
        validation_dataset_reader: `DatasetReader`, optional (default=None)
            If given, we will use this dataset reader for the validation data instead of
            `dataset_reader`.
        validation_data_path: `str`, optional (default=None)
            If given, we will use this data for computing validation metrics and early stopping.
        validation_iterator: `DataIterator`, optional (default=None)
            If given, we will use this iterator for batching and scheduling instances for the
            validation data, instead of `iterator`.
        test_data_path: `str`, optional (default=None)
            If given, we will use this as test data.  This makes it available for vocab creation by
            default, but nothing else.
        evaluate_on_test: `bool`, optional (default=False)
            If `True`, we will evaluate the final model on the test data at the end of training.
            Note that we do not recommend using this for actual test data in every-day
            experimentation; you should only very rarely evaluate your model on actual test data.
        """

        datasets = training_util.read_all_datasets(
            train_data_path=train_data_path,
            dataset_reader=dataset_reader,
            validation_dataset_reader=validation_dataset_reader,
            validation_data_path=validation_data_path,
            test_data_path=test_data_path,
        )

        if datasets_for_vocab_creation:
            for key in datasets_for_vocab_creation:
                if key not in datasets:
                    raise ConfigurationError(
                        f"invalid 'dataset_for_vocab_creation' {key}")

        instance_generator = (instance for key, dataset in datasets.items()
                              if not datasets_for_vocab_creation
                              or key in datasets_for_vocab_creation
                              for instance in dataset)

        vocabulary_ = vocabulary.construct(instances=instance_generator)
        if not vocabulary_:
            vocabulary_ = Vocabulary.from_instances(instance_generator)
        model_ = model.construct(vocab=vocabulary_)

        # Initializing the model can have the side effect of expanding the vocabulary.
        # Save the vocab only in the master. In the degenerate non-distributed
        # case, we're trivially the master.
        if common_util.is_master():
            vocabulary_path = os.path.join(serialization_dir, "vocabulary")
            vocabulary_.save_to_files(vocabulary_path)

        iterator.index_with(model_.vocab)
        validation_iterator = validation_iterator or iterator
        validation_iterator.index_with(
            model_.vocab)  # it is ok to call this twice

        # We don't need to pass serialization_dir and local_rank here, because they will have been
        # passed through the trainer by from_params already, because they were keyword arguments to
        # construct this class in the first place.
        trainer_ = trainer.construct(
            model=model_,
            iterator=iterator,
            train_data=datasets["train"],
            validation_iterator=validation_iterator,
            validation_data=datasets.get("validation"),
        )

        return cls(
            serialization_dir=serialization_dir,
            model=model_,
            trainer=trainer_,
            evaluation_dataset=datasets.get("test"),
            evaluation_iterator=validation_iterator,
            evaluate_on_test=evaluate_on_test,
            batch_weight_key=batch_weight_key,
        )
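
The docstring above explains the `Lazy` mechanism only in prose. Below is a minimal sketch, not part of the original example, of how a `Lazy` object defers construction until its missing dependency is supplied; `NeedsVocab` and its `hidden_dim` argument are hypothetical placeholders standing in for the `Lazy[Model]` used above.

from allennlp.common.lazy import Lazy
from allennlp.data import Vocabulary


class NeedsVocab:
    # Hypothetical component that cannot be built until a vocabulary exists,
    # playing the role that `Lazy[Model]` plays in `from_partial_objects`.
    def __init__(self, vocab: Vocabulary, hidden_dim: int = 10) -> None:
        self.vocab = vocab
        self.hidden_dim = hidden_dim


# `Lazy` stores the constructor; the missing dependency is supplied to `construct()`,
# just as `from_partial_objects` calls `model.construct(vocab=vocabulary_)`.
lazy_component = Lazy(NeedsVocab)
vocabulary = Vocabulary()
component = lazy_component.construct(vocab=vocabulary)
assert component.vocab is vocabulary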
Example #2
from typing import Dict, Iterable, List

import numpy
import torch

from allennlp.data import Instance
from allennlp.data.iterators import DataIterator
from allennlp.nn import util


def forward_on_instances(
        model, instances: Iterable[Instance],
        data_iterator: DataIterator) -> List[Dict[str, numpy.ndarray]]:
    """
    Basically a copy of Model.forward_on_instances, but also takes a DataIterator in order to be more efficient.


    Takes a list of  :class:`~allennlp.data.instance.Instance`s, converts that text into
    arrays using this model's :class:`Vocabulary`, passes those arrays through
    :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
    and returns the result.  Before returning the result, we convert any
    ``torch.Tensors`` into numpy arrays and separate the
    batched output into a list of individual dicts per instance. Note that typically
    this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
    :func:`forward_on_instance`.

    Parameters
    ----------
    model : AllenNLP model, required
        The model to run.
    instances : List[Instance], required
        The instances to run the model on.
    data_iterator: DataIterator, required
        The DataIterator used for iterating over the data (e.g. a BucketIterator).

    Returns
    -------
    A list of the model's outputs, one dict per instance.
    """
    data_iterator.index_with(model.vocab)
    with torch.no_grad():
        return_val: List[Dict[str, numpy.ndarray]] = []
        cuda_device = model._get_prediction_device()
        for dataset in data_iterator._create_batches(instances, shuffle=False):
            batch_size = len(dataset.instances)
            dataset.index_instances(model.vocab)
            model_input = util.move_to_device(dataset.as_tensor_dict(),
                                              cuda_device)
            outputs = model.decode(model(**model_input))
            instance_separated_output: List[Dict[str, numpy.ndarray]] = [
                {} for _ in dataset.instances
            ]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                    # This occurs with batch size 1, because we still want to include the loss in that case.
                    if output.dim() == 0:
                        output = output.unsqueeze(0)

                    if output.size(0) != batch_size:
                        model._maybe_warn_for_unseparable_batches(name)
                        continue
                    output = output.detach().cpu().numpy()
                elif len(output) != batch_size:
                    model._maybe_warn_for_unseparable_batches(name)
                    continue
                for instance_output, batch_element in zip(
                        instance_separated_output, output):
                    instance_output[name] = batch_element
            return_val.extend(instance_separated_output)
        return return_val
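
As a usage sketch (not part of the original snippet; the model, dataset reader, and data path are hypothetical placeholders), a trained model and its reader could be run over a file like this:

from allennlp.data.iterators import BasicIterator


def run_inference(model, reader, data_path: str):
    # Batch the instances with a simple fixed-size iterator and run the model over
    # them; `forward_on_instances` handles indexing and device placement itself.
    iterator = BasicIterator(batch_size=32)
    instances = reader.read(data_path)
    return forward_on_instances(model, instances, iterator)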