def from_partial_objects(
    cls,
    serialization_dir: str,
    local_rank: int,
    batch_weight_key: str,
    dataset_reader: DatasetReader,
    train_data_path: str,
    model: Lazy[Model],
    iterator: DataIterator,
    trainer: Lazy[TrainerBase],
    vocabulary: Lazy[Vocabulary] = None,
    datasets_for_vocab_creation: List[str] = None,
    validation_dataset_reader: DatasetReader = None,
    validation_data_path: str = None,
    validation_iterator: DataIterator = None,
    test_data_path: str = None,
    evaluate_on_test: bool = False,
) -> "TrainModel":
    """
    This method is intended for use with our `FromParams` logic, to construct a `TrainModel`
    object from a config file passed to the `allennlp train` command.  The arguments to this
    method are the allowed top-level keys in a configuration file (except for the first three,
    which are obtained separately).

    You *could* use this outside of our `FromParams` logic if you really want to, but there
    might be easier ways to accomplish your goal than instantiating `Lazy` objects.  If you are
    writing your own training loop, we recommend that you look at the implementation of this
    method for inspiration and possibly some utility functions you can call, but you very likely
    should not use this method directly.

    The `Lazy` type annotations here are a mechanism for building dependencies to an object
    sequentially - the `TrainModel` object needs data, a model, and a trainer, but the model
    needs to see the data before it's constructed (to create a vocabulary) and the trainer needs
    the data and the model before it's constructed.  Objects that have sequential dependencies
    like this are labeled as `Lazy` in their type annotations, and we pass the missing
    dependencies when we call their `construct()` method, which you can see in the code below.

    # Parameters

    serialization_dir: `str`
        The directory where logs and model archives will be saved.
    local_rank: `int`
        The process index that is initialized using the GPU device id.
    batch_weight_key: `str`
        The name of metric used to weight the loss on a per-batch basis.
    dataset_reader: `DatasetReader`
        The `DatasetReader` that will be used for training and (by default) for validation.
    train_data_path: `str`
        The file (or directory) that will be passed to `dataset_reader.read()` to construct the
        training data.
    model: `Lazy[Model]`
        The model that we will train.  This is lazy because it depends on the `Vocabulary`;
        after constructing the vocabulary we call `model.construct(vocab=vocabulary)`.
    iterator: `DataIterator`
        The iterator we use to batch instances from the dataset reader at training and (by
        default) validation time.
    trainer: `Lazy[TrainerBase]`
        The `Trainer` that actually implements the training loop.  This is a lazy object because
        it depends on the model that's going to be trained.
    vocabulary: `Lazy[Vocabulary]`, optional (default=None)
        The `Vocabulary` that we will use to convert strings in the data to integer ids (and
        possibly set sizes of embedding matrices in the `Model`).  By default (that is, when
        this is `None` or when constructing it yields a falsy value) we construct the
        vocabulary from the instances that we read.
    datasets_for_vocab_creation: `List[str]`, optional (default=None)
        If you pass in more than one dataset but don't want to use all of them to construct a
        vocabulary, you can pass in this key to limit it.  Valid entries in the list are
        "train", "validation" and "test".
    validation_dataset_reader: `DatasetReader`, optional (default=None)
        If given, we will use this dataset reader for the validation data instead of
        `dataset_reader`.
    validation_data_path: `str`, optional (default=None)
        If given, we will use this data for computing validation metrics and early stopping.
    validation_iterator: `DataIterator`, optional (default=None)
        If given, we will use this iterator for batching and scheduling instances for the
        validation data, instead of `iterator`.
    test_data_path: `str`, optional (default=None)
        If given, we will use this as test data.  This makes it available for vocab creation by
        default, but nothing else.
    evaluate_on_test: `bool`, optional (default=False)
        If given, we will evaluate the final model on this data at the end of training.  Note
        that we do not recommend using this for actual test data in every-day experimentation;
        you should only very rarely evaluate your model on actual test data.

    # Returns

    A `TrainModel` built from the constructed model and trainer.

    # Raises

    ConfigurationError
        If an entry of `datasets_for_vocab_creation` is not a key of the datasets that were
        actually read.
    """
    datasets = training_util.read_all_datasets(
        train_data_path=train_data_path,
        dataset_reader=dataset_reader,
        validation_dataset_reader=validation_dataset_reader,
        validation_data_path=validation_data_path,
        test_data_path=test_data_path,
    )

    if datasets_for_vocab_creation:
        for key in datasets_for_vocab_creation:
            if key not in datasets:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {key}")

    # Lazily stream the instances of every dataset selected for vocabulary creation
    # (all of them when no explicit selection was given).
    instance_generator = (
        instance
        for key, dataset in datasets.items()
        if not datasets_for_vocab_creation or key in datasets_for_vocab_creation
        for instance in dataset
    )

    # `vocabulary` defaults to `None`; only call `construct()` when a `Lazy` object was
    # actually supplied, otherwise fall straight through to building from instances.
    vocabulary_ = None
    if vocabulary is not None:
        vocabulary_ = vocabulary.construct(instances=instance_generator)
    if not vocabulary_:
        vocabulary_ = Vocabulary.from_instances(instance_generator)
    model_ = model.construct(vocab=vocabulary_)

    # Initializing the model can have side effect of expanding the vocabulary.
    # Save the vocab only in the master. In the degenerate non-distributed
    # case, we're trivially the master.
    if common_util.is_master():
        vocabulary_path = os.path.join(serialization_dir, "vocabulary")
        vocabulary_.save_to_files(vocabulary_path)

    iterator.index_with(model_.vocab)
    validation_iterator = validation_iterator or iterator
    validation_iterator.index_with(model_.vocab)  # it is ok to call this twice

    # We don't need to pass serialization_dir and local_rank here, because they will have been
    # passed through the trainer by from_params already, because they were keyword arguments to
    # construct this class in the first place.
    trainer_ = trainer.construct(
        model=model_,
        iterator=iterator,
        train_data=datasets["train"],
        validation_iterator=validation_iterator,
        validation_data=datasets.get("validation"),
    )

    return cls(
        serialization_dir=serialization_dir,
        model=model_,
        trainer=trainer_,
        evaluation_dataset=datasets.get("test"),
        evaluation_iterator=validation_iterator,
        evaluate_on_test=evaluate_on_test,
        batch_weight_key=batch_weight_key,
    )
def forward_on_instances(
    model, instances: Iterable[Instance], data_iterator: DataIterator
) -> List[Dict[str, numpy.ndarray]]:
    """
    Run ``model`` over ``instances`` in batches produced by ``data_iterator``.

    This is essentially ``Model.forward_on_instances`` with the batching delegated to a
    :class:`DataIterator`, in order to be more efficient.  Every batch is indexed with the
    model's vocabulary, converted to tensors, moved to the model's prediction device and fed
    through ``model(...)`` followed by ``model.decode(...)``.  Tensor outputs are converted to
    numpy arrays, and each output is split along its first dimension so that every instance
    gets its own dictionary.  Outputs whose leading dimension does not equal the batch size
    cannot be split; they are reported via ``model._maybe_warn_for_unseparable_batches`` and
    left out of the result.

    Parameters
    ----------
    model : AllenNLP model, required
        The model to run.
    instances : Iterable[Instance], required
        The instances to run the model on.
    data_iterator : DataIterator, required
        The DataIterator used for going over the data (e.g. BucketIterator).

    Returns
    -------
    A list holding one output dictionary per instance, in the order the iterator batched them.
    """
    data_iterator.index_with(model.vocab)
    results: List[Dict[str, numpy.ndarray]] = []

    with torch.no_grad():
        device = model._get_prediction_device()
        for batch in data_iterator._create_batches(instances, shuffle=False):
            expected = len(batch.instances)
            batch.index_instances(model.vocab)
            tensors = util.move_to_device(batch.as_tensor_dict(), device)
            decoded = model.decode(model(**tensors))

            per_instance: List[Dict[str, numpy.ndarray]] = [{} for _ in batch.instances]
            for key, value in list(decoded.items()):
                is_tensor = isinstance(value, torch.Tensor)
                # 0-dim tensors (e.g. the loss) are not iterable, so give them a leading
                # dimension; with a batch size of 1 they can then still be included.
                if is_tensor and value.dim() == 0:
                    value = value.unsqueeze(0)

                leading = value.size(0) if is_tensor else len(value)
                if leading != expected:
                    model._maybe_warn_for_unseparable_batches(key)
                    continue

                if is_tensor:
                    value = value.detach().cpu().numpy()
                for slot, element in zip(per_instance, value):
                    slot[key] = element

            results.extend(per_instance)

    return results