Example #1

    def __init__(
        self,
        serialization_dir: str,
        cuda_device: int = -1,
        distributed: bool = False,
        rank: int = 0,
        world_size: int = 1,
    ) -> None:

        check_for_gpu(cuda_device)
        self._serialization_dir = serialization_dir

        if isinstance(cuda_device, list):
            raise ConfigurationError(
                "In allennlp 1.0, the Trainer can only be assigned a single `cuda_device`. "
                "Instead, we use torch's DistributedDataParallel at the command level, meaning "
                "our Trainer always uses a single GPU per process.")

        if not isinstance(cuda_device, int):
            raise ConfigurationError(
                "Expected an int for cuda_device, got {}".format(cuda_device))

        if distributed and world_size <= 1:
            raise ConfigurationError(
                "Distributed training can be performed only with more than 1 GPU device. Check "
                "`cuda_device` key in the experiment configuration.")

        self.cuda_device = cuda_device

        self._distributed = distributed
        self._rank = rank
        self._master = is_master()
        self._world_size = world_size
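
The checks above fail fast on misconfiguration. A minimal, self-contained sketch of how they behave, using a hypothetical `FakeTrainer` stand-in that copies the same validation logic (minus the GPU check), so no allennlp import is needed:

class ConfigurationError(Exception):
    pass

class FakeTrainer:
    # Same guards as the `__init__` above, without `check_for_gpu`.
    def __init__(self, serialization_dir, cuda_device=-1,
                 distributed=False, rank=0, world_size=1):
        if isinstance(cuda_device, list):
            raise ConfigurationError("only a single `cuda_device` is allowed")
        if distributed and world_size <= 1:
            raise ConfigurationError("distributed training needs world_size > 1")

FakeTrainer("runs/exp1", cuda_device=-1)          # fine: a single device
try:
    FakeTrainer("runs/exp1", cuda_device=[0, 1])  # lists were valid before 1.0
except ConfigurationError as err:
    print(err)
try:
    FakeTrainer("runs/exp1", distributed=True, world_size=1)
except ConfigurationError as err:
    print(err)
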
Example #2

    @classmethod
    def from_partial_objects(
        cls,
        serialization_dir: str,
        local_rank: int,
        batch_weight_key: str,
        dataset_reader: DatasetReader,
        train_data_path: str,
        model: Lazy[Model],
        iterator: DataIterator,
        trainer: Lazy[TrainerBase],
        vocabulary: Lazy[Vocabulary] = None,
        datasets_for_vocab_creation: List[str] = None,
        validation_dataset_reader: DatasetReader = None,
        validation_data_path: str = None,
        validation_iterator: DataIterator = None,
        test_data_path: str = None,
        evaluate_on_test: bool = False,
    ) -> "TrainModel":
        """
        This method is intended for use with our `FromParams` logic, to construct a `TrainModel`
        object from a config file passed to the `allennlp train` command.  The arguments to this
        method are the allowed top-level keys in a configuration file (except for the first three,
        which are obtained separately).

        You *could* use this outside of our `FromParams` logic if you really want to, but there
        might be easier ways to accomplish your goal than instantiating `Lazy` objects.  If you are
        writing your own training loop, we recommend that you look at the implementation of this
        method for inspiration and possibly some utility functions you can call, but you very likely
        should not use this method directly.

        The `Lazy` type annotations here are a mechanism for building dependencies to an object
        sequentially - the `TrainModel` object needs data, a model, and a trainer, but the model
        needs to see the data before it's constructed (to create a vocabulary) and the trainer needs
        the data and the model before it's constructed.  Objects that have sequential dependencies
        like this are labeled as `Lazy` in their type annotations, and we pass the missing
        dependencies when we call their `construct()` method, which you can see in the code below.

        # Parameters
        serialization_dir: `str`
            The directory where logs and model archives will be saved.
        local_rank: `int`
            The process index that is initialized using the GPU device id.
        batch_weight_key: `str`
            The name of the metric used to weight the loss on a per-batch basis.
        dataset_reader: `DatasetReader`
            The `DatasetReader` that will be used for training and (by default) for validation.
        train_data_path: `str`
            The file (or directory) that will be passed to `dataset_reader.read()` to construct the
            training data.
        model: `Lazy[Model]`
            The model that we will train.  This is lazy because it depends on the `Vocabulary`;
            after constructing the vocabulary we call `model.construct(vocab=vocabulary)`.
        iterator: `DataIterator`
            The iterator we use to batch instances from the dataset reader at training and (by
            default) validation time.
        trainer: `Lazy[TrainerBase]`
            The `Trainer` that actually implements the training loop.  This is a lazy object because
            it depends on the model that's going to be trained.
        vocabulary: `Lazy[Vocabulary]`, optional (default=None)
            The `Vocabulary` that we will use to convert strings in the data to integer ids (and
            possibly set sizes of embedding matrices in the `Model`).  By default we construct the
            vocabulary from the instances that we read.
        datasets_for_vocab_creation: `List[str]`, optional (default=None)
            If you pass in more than one dataset but don't want to use all of them to construct a
            vocabulary, you can pass in this key to limit it.  Valid entries in the list are
            "train", "validation" and "test".
        validation_dataset_reader: `DatasetReader`, optional (default=None)
            If given, we will use this dataset reader for the validation data instead of
            `dataset_reader`.
        validation_data_path: `str`, optional (default=None)
            If given, we will use this data for computing validation metrics and early stopping.
        validation_iterator: `DataIterator`, optional (default=None)
            If given, we will use this iterator for batching and scheduling instances for the
            validation data, instead of `iterator`.
        test_data_path: `str`, optional (default=None)
            If given, we will use this as test data.  This makes it available for vocab creation by
            default, but nothing else.
        evaluate_on_test: `bool`, optional (default=False)
            If `True`, we will evaluate the final model on the test data at the end of
            training.  Note that we do not recommend doing this in every-day
            experimentation; you should only very rarely evaluate your model on actual
            test data.
        """

        datasets = training_util.read_all_datasets(
            train_data_path=train_data_path,
            dataset_reader=dataset_reader,
            validation_dataset_reader=validation_dataset_reader,
            validation_data_path=validation_data_path,
            test_data_path=test_data_path,
        )

        if datasets_for_vocab_creation:
            for key in datasets_for_vocab_creation:
                if key not in datasets:
                    raise ConfigurationError(
                        f"invalid 'datasets_for_vocab_creation' {key}")

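        # Lazily yield instances from only the datasets selected for
        # vocabulary creation (all of them if no restriction was given).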
        instance_generator = (instance for key, dataset in datasets.items()
                              if not datasets_for_vocab_creation
                              or key in datasets_for_vocab_creation
                              for instance in dataset)

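        # Build the vocab via the `Lazy` wrapper; if no vocabulary params were
        # given, fall back to building it directly from the instances.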
        vocabulary_ = vocabulary.construct(instances=instance_generator)
        if not vocabulary_:
            vocabulary_ = Vocabulary.from_instances(instance_generator)
        model_ = model.construct(vocab=vocabulary_)

        # Initializing the model can have the side effect of expanding the
        # vocabulary.  Save the vocab only in the master process; in the
        # degenerate non-distributed case, we are trivially the master.
        if common_util.is_master():
            vocabulary_path = os.path.join(serialization_dir, "vocabulary")
            vocabulary_.save_to_files(vocabulary_path)

        iterator.index_with(model_.vocab)
        validation_iterator = validation_iterator or iterator
        validation_iterator.index_with(model_.vocab)  # it is ok to call this twice

        # We don't need to pass serialization_dir and local_rank here: they were
        # keyword arguments used to construct this class in the first place, so
        # `from_params` has already passed them through to the trainer.
        trainer_ = trainer.construct(
            model=model_,
            iterator=iterator,
            train_data=datasets["train"],
            validation_iterator=validation_iterator,
            validation_data=datasets.get("validation"),
        )

        return cls(
            serialization_dir=serialization_dir,
            model=model_,
            trainer=trainer_,
            evaluation_dataset=datasets.get("test"),
            evaluation_iterator=validation_iterator,
            evaluate_on_test=evaluate_on_test,
            batch_weight_key=batch_weight_key,
        )
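
The docstring above describes `Lazy` as a way to build objects with sequential dependencies: the model cannot be constructed until the vocabulary exists, so it stays wrapped until `construct()` supplies the missing piece. A self-contained toy sketch of that pattern (a simplified stand-in, not AllenNLP's actual `Lazy` implementation):

from typing import Callable, Generic, TypeVar

T = TypeVar("T")

class Lazy(Generic[T]):
    """Defer construction until dependencies built earlier are available."""
    def __init__(self, constructor: Callable[..., T]):
        self._constructor = constructor

    def construct(self, **kwargs) -> T:
        return self._constructor(**kwargs)

class Vocab:
    def __init__(self, instances):
        self.tokens = sorted(set(instances))

class Model:
    def __init__(self, vocab):
        self.vocab = vocab

lazy_model = Lazy(Model)        # cannot build yet: no vocab exists
vocab = Vocab(["a", "b", "a"])  # first, build the vocab from the data
model = lazy_model.construct(vocab=vocab)  # now supply the dependency
print(model.vocab.tokens)       # ['a', 'b']
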
Example #3

    @classmethod
    def from_params(
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        cache_directory: str = None,
        cache_prefix: str = None,
    ) -> "TrainerPieces":
        all_datasets = training_util.meta_dataset_from_params(params, cache_directory, cache_prefix)
        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(f"invalid 'datasets_for_vocab_creation' {dataset}")

        logger.info(
            "From dataset instances, %s will be considered for vocabulary creation.",
            ", ".join(datasets_for_vocab_creation),
        )

        if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
            vocab_params = params.pop("vocabulary", {})
            vocab = Vocabulary.from_files(
                os.path.join(serialization_dir, "vocabulary"),
                vocab_params.get("padding_token", None),
                vocab_params.get("oov_token", None),
            )
        else:
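            # The "train" entry of a meta-dataset holds a list of sub-datasets,
            # so its instances sit one level deeper than validation/test.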
            instance_train = (
                instance
                for key, dataset in all_datasets.items()
                if key == 'train'
                for subdata in dataset
                for instance in subdata
            )
            instance_valid_test = (
                instance
                for key, dataset in all_datasets.items()
                if key != 'train'
                for instance in dataset
            )
            instances = chain(instance_train, instance_valid_test)
            vocab = Vocabulary.from_params(
                params.pop("vocabulary", {}),
                # Passing a lazy generator here is important: it lets us avoid
                # iterating over the datasets when `directory_path` is
                # specified and the vocabulary is loaded from files instead.
                instances,
            )

        model = Model.from_params(vocab=vocab, params=params.pop("model"))

        # If vocab extension is on for training, embedding extension should also be
        # done; if vocab and embeddings are already in sync, this is a no-op.
        model.extend_embedder_vocab()

        # Initializing the model can have the side effect of expanding the
        # vocabulary.  Save the vocab only in the master process.
        if not is_distributed() or is_master():
            vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(model.vocab)
        validation_iterator_params = params.pop("validation_iterator", None)
        if validation_iterator_params:
            validation_iterator = DataIterator.from_params(validation_iterator_params)
            validation_iterator.index_with(model.vocab)
        else:
            validation_iterator = None

        train_data = all_datasets["train"]
        validation_data = all_datasets.get("validation")
        test_data = all_datasets.get("test")

        trainer_params = params.pop("trainer")
        no_grad_regexes = trainer_params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(
            model
        )
        logger.info("The following parameters are frozen (without gradient):")
        for name in frozen_parameter_names:
            logger.info(name)
        logger.info("The following parameters are tunable (with gradient):")
        for name in tunable_parameter_names:
            logger.info(name)

        return cls(
            model,
            iterator,
            train_data,
            validation_data,
            test_data,
            validation_iterator,
            trainer_params,
        )

Example #4

    @classmethod
    def from_params(
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        model: Model = None,
        embedding_sources_mapping: Dict[str, str] = None,
        extend_vocab: bool = False,
    ) -> "MetaTrainerPieces":
        all_datasets = training_util.datasets_from_params(params)

        vocabulary_params = params.pop("vocabulary", {})

        if model:
            if params.pop("model", None):
                logger.warning(
                    "You passed parameters for the model in your configuration file, but we "
                    "are ignoring them, using instead the loaded model parameters."
                )

            # TODO(mattg): This should be updated now that directory_path no longer exists.
            if vocabulary_params.get("directory_path", None):
                logger.warning(
                    "You passed `directory_path` in parameters for the vocabulary in "
                    "your configuration file, but it will be ignored because we already "
                    "have a model with a vocabulary.")

            vocab = model.vocab
        else:
            vocab = None

        vocabulary_path = os.path.join(serialization_dir, "vocabulary")

        if not vocab or extend_vocab:
            vocab = MetaTrainerPieces.create_or_extend_vocab(
                datasets=all_datasets,
                params=params,
                recover=recover,
                vocab=vocab,
                vocabulary_params=vocabulary_params,
                vocabulary_path=vocabulary_path,
            )

            if not model:
                model = Model.from_params(vocab=vocab, params=params.pop("model"))

            # If vocab extension is on for training, embedding extension should also
            # be done; if vocab and embeddings are already in sync, this is a no-op.
            model.extend_embedder_vocab(embedding_sources_mapping)

        # Initializing the model can have the side effect of expanding the
        # vocabulary.  Save the vocab only in the master process; in the
        # degenerate non-distributed case, we are trivially the master.
        if is_master():
            vocab.save_to_files(vocabulary_path)

        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(model.vocab)
        validation_iterator_params = params.pop("validation_iterator", None)
        if validation_iterator_params:
            validation_iterator = DataIterator.from_params(validation_iterator_params)
            validation_iterator.index_with(model.vocab)
        else:
            validation_iterator = None

        train_datasets = all_datasets["train"]
        validation_datasets = all_datasets.get("validation")
        test_datasets = all_datasets.get("test")

        trainer_params = params.pop("trainer")
        no_grad_regexes = trainer_params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        log_frozen_and_tunable_parameter_names(model)

        return cls(
            model=model,
            iterator=iterator,
            train_datasets=train_datasets,
            validation_datasets=validation_datasets,
            test_datasets=test_datasets,
            validation_iterator=validation_iterator,
            params=trainer_params,
        )
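
Both `from_params` variants freeze parameters whose names match any regex in the trainer's `no_grad` list. A self-contained sketch of that mechanism, using a throwaway two-layer model (only `torch` is assumed):

import re

import torch

model = torch.nn.Sequential(
    torch.nn.Linear(4, 8),  # parameters named "0.weight" and "0.bias"
    torch.nn.Linear(8, 2),  # parameters named "1.weight" and "1.bias"
)

no_grad_regexes = [r"^0\."]  # freeze everything in the first layer

for name, parameter in model.named_parameters():
    if any(re.search(regex, name) for regex in no_grad_regexes):
        parameter.requires_grad_(False)

frozen = [name for name, p in model.named_parameters() if not p.requires_grad]
print(frozen)  # ['0.weight', '0.bias']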