Example #1
    def from_archive(
        cls,
        archive: Archive,
        *,
        interpreter_name: Optional[str] = None,
        train_data_path: Optional[DatasetReaderInput] = None,
        train_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        test_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        params_to_freeze: Optional[List[str]] = None,
        cuda_device: int = -1,
        **extras,
    ) -> "InfluenceInterpreter":
        """
        Load an `InfluenceInterpreter` from an `Archive`.

        The other parameters are the same as `.from_path()`.
        """
        interpreter_cls = cls.by_name(interpreter_name
                                      or cls.default_implementation)
        return interpreter_cls(
            model=archive.model,
            train_data_path=train_data_path
            or archive.config["train_data_path"],
            train_dataset_reader=archive.dataset_reader,
            test_dataset_reader=archive.validation_dataset_reader,
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            params_to_freeze=params_to_freeze,
            cuda_device=cuda_device,
            **extras,
        )
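A hypothetical usage sketch of the classmethod above. The archive path, the registered name "simple-influence", the freeze regex, and the import paths are assumptions for illustration, not something stated in this listing.

from allennlp.interpret.influence_interpreters import InfluenceInterpreter
from allennlp.models.archival import load_archive

# Load an archive once and build an interpreter from it (paths and names are placeholders).
archive = load_archive("/path/to/model.tar.gz", cuda_device=-1)
interpreter = InfluenceInterpreter.from_archive(
    archive,
    interpreter_name="simple-influence",           # None falls back to cls.default_implementation
    params_to_freeze=[r"_text_field_embedder\."],  # regexes for parameters to freeze
)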
Example #2
File: t5.py Project: himkt/allennlp
    def basic_encoder(
        cls,
        token_embeddings: nn.Embedding,
        num_blocks: int = 6,
        block_self_attention: Lazy[T5Attention] = Lazy(T5Attention),
        final_layer_norm: Optional[T5LayerNorm] = None,
        block_ff: Lazy[T5LayerFF] = Lazy(T5LayerFF),
        dropout: float = 0.1,
        ddp_accelerator: Optional[DdpAccelerator] = None,
        checkpoint_wrapper: Optional[CheckpointWrapper] = None,
    ) -> "T5EncoderStack":
        if ddp_accelerator is not None:
            logger.info("Initializing T5 encoder with DdpAccelerator %s", ddp_accelerator)
        blocks: List[T5Block] = []
        for i in range(num_blocks):
            block = T5Block(
                attention=T5LayerSelfAttention(
                    self_attention=block_self_attention.construct(
                        is_decoder=False, has_relative_attention_bias=(i == 0)
                    )
                ),
                cross_attention=None,
                ff=block_ff.construct(),
            )
            if checkpoint_wrapper is not None:
                block = checkpoint_wrapper.wrap_module(block)
            if ddp_accelerator is not None:
                block = ddp_accelerator.wrap_module(block)
            blocks.append(block)
        return cls(token_embeddings, blocks, final_layer_norm=final_layer_norm, dropout=dropout)
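A minimal sketch of calling `basic_encoder`. It assumes the default attention and feed-forward hyperparameters line up with a 512-dimensional embedding (t5-small-like sizes) and that the classes live in `allennlp.modules.transformer.t5`; treat it as illustrative rather than a tested recipe.

import torch.nn as nn
from allennlp.modules.transformer.t5 import T5EncoderStack

# Token embeddings whose dimension matches the default attention/FF hyperparameters.
token_embeddings = nn.Embedding(32128, 512)

# Each block's self-attention is built lazily inside `basic_encoder`, which is what lets it
# pass per-block arguments such as `has_relative_attention_bias=(i == 0)`.
encoder = T5EncoderStack.basic_encoder(token_embeddings, num_blocks=2)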
Example #3
    def from_path(
        cls,
        archive_path: Union[str, PathLike],
        *,
        interpreter_name: Optional[str] = None,
        train_data_path: Optional[DatasetReaderInput] = None,
        train_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        test_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        params_to_freeze: Optional[List[str]] = None,
        cuda_device: int = -1,
        import_plugins: bool = True,
        overrides: Union[str, Dict[str, Any]] = "",
        **extras,
    ) -> "InfluenceInterpreter":
        """
        Load an `InfluenceInterpreter` from an archive path.

        # Parameters

        archive_path : `Union[str, PathLike]`, required
            The path to the archive file.
        interpreter_name : `Optional[str]`, optional (default = `None`)
            The registered name of an interpreter class. If not specified,
            the default implementation (`SimpleInfluence`) will be used.
        train_data_path : `Optional[DatasetReaderInput]`, optional (default = `None`)
            If not specified, `train_data_path` will be taken from the archive's config.
        train_data_loader : `Lazy[DataLoader]`, optional (default = `Lazy(SimpleDataLoader)`)
        test_data_loader : `Lazy[DataLoader]`, optional (default = `Lazy(SimpleDataLoader)`)
        params_to_freeze : `Optional[List[str]]`, optional (default = `None`)
        cuda_device : `int`, optional (default = `-1`)
        import_plugins : `bool`, optional (default = `True`)
            If `True`, we attempt to import plugins before loading the `InfluenceInterpreter`.
            This comes with additional overhead, but means you don't need to explicitly
            import the modules that your implementation depends on as long as those modules
            can be found by `allennlp.common.plugins.import_plugins()`.
        overrides : `Union[str, Dict[str, Any]]`, optional (default = `""`)
            JSON overrides to apply to the unarchived `Params` object.
        **extras : `Any`
            Extra parameters to pass to the interpreter's `__init__()` method.

        """
        if import_plugins:
            plugins.import_plugins()
        return cls.from_archive(
            load_archive(archive_path,
                         cuda_device=cuda_device,
                         overrides=overrides),
            interpreter_name=interpreter_name,
            train_data_path=train_data_path,
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            params_to_freeze=params_to_freeze,
            cuda_device=cuda_device,
            **extras,
        )
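A hypothetical call exercising the parameters documented above; the paths and the override key are placeholders.

from allennlp.interpret.influence_interpreters import InfluenceInterpreter

interpreter = InfluenceInterpreter.from_path(
    "/path/to/model.tar.gz",
    train_data_path="/path/to/train.jsonl",           # otherwise taken from the archive's config
    overrides={"dataset_reader.max_instances": 100},  # JSON/dict overrides applied to the archived Params
    cuda_device=-1,
)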
Example #4
    def test_from_params_in_trainer(self):
        # This is more of an integration test, making sure that a bunch of pieces fit together
        # correctly, but it matters most for this learning rate scheduler, so we're testing it here.
        params = Params({
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        })
        # The method called in the logic below only checks the length of this list, not its
        # contents, so this should be safe.
        instances = [1] * 40
        optim = self._get_optimizer()
        trainer = TrainerBase.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            iterator=BasicIterator(batch_size=10),
            train_data=instances,
        )
        assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular)

        # This is what we wrote this test for: to be sure that num_epochs is passed correctly, and
        # that num_steps_per_epoch is computed and passed correctly.  This logic happens inside of
        # `Trainer.from_partial_objects`.
        assert trainer._learning_rate_scheduler.num_epochs == 5
        assert trainer._learning_rate_scheduler.num_steps_per_epoch == 4

        # And we'll do one more to make sure that we can override num_epochs in the scheduler if we
        # really want to.  Not sure why you would ever want to in this case; this is just testing
        # the functionality.
        params = Params({
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "num_epochs": 3,
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        })
        trainer = TrainerBase.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            iterator=BasicIterator(batch_size=10),
            train_data=instances,
        )
        assert trainer._learning_rate_scheduler.num_epochs == 3
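The `optimizer=Lazy(lambda **kwargs: optim)` trick above relies on `Lazy` wrapping any callable, not just a class: `construct()` simply invokes it. A standalone sketch with a stand-in object instead of a real optimizer:

from allennlp.common import Lazy

prebuilt = object()                               # stand-in for an already-constructed optimizer
lazy_optimizer = Lazy(lambda **kwargs: prebuilt)
assert lazy_optimizer.construct(model_parameters=[]) is prebuilt  # injected kwargs are ignored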
Example #5
    def __init__(
        self,
        lazy1: Lazy[ConstructedObject],
        lazy2: Lazy[ConstructedObject] = Lazy(ConstructedObject),
        lazy3: Lazy[ConstructedObject] = None,
        lazy4: Optional[Lazy[ConstructedObject]] = Lazy(ConstructedObject),
    ) -> None:
        self.lazy1 = lazy1.construct()
        self.lazy2 = lazy2.construct(a=2)
        self.lazy3 = None if lazy3 is None else lazy3.construct()
        self.lazy4 = None if lazy4 is None else lazy4.construct(a=1)
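A standalone sketch of the `Lazy` behaviour this constructor exercises; `ConstructedObject` here is a local stand-in class, not the one used in the original test.

from allennlp.common import Lazy

class ConstructedObject:
    def __init__(self, a: int = 0) -> None:
        self.a = a

wrapper = Lazy(ConstructedObject)   # nothing is built yet
obj = wrapper.construct(a=2)        # kwargs are forwarded to ConstructedObject.__init__
assert obj.a == 2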
Example #6
File: t5.py Project: himkt/allennlp
    def __init__(
        self,
        token_embeddings: Optional[nn.Embedding] = None,
        encoder: Lazy[T5EncoderStack] = Lazy(T5EncoderStack),
        decoder: Lazy[T5DecoderStack] = Lazy(T5DecoderStack),
        decoder_start_token_id: int = 0,
        pad_token_id: int = 0,  # These are both 0 in t5-(small|base|large). Go figure.
        eos_token_id: int = 1,
        vocab_size: int = 32128,
        model_dim: int = 512,
        output_attentions: bool = False,
        output_all_hidden_states: bool = False,
        beam_search: Lazy[BeamSearch] = Lazy(BeamSearch,
                                             beam_size=3,
                                             max_steps=100),
        ddp_accelerator: Optional[DdpAccelerator] = None,
        checkpoint_wrapper: Optional[CheckpointWrapper] = None,
        tie_word_embeddings: bool = True,
    ):
        super().__init__()
        self._tie_word_embeddings = tie_word_embeddings

        self.model_dim = model_dim
        self.token_embeddings = token_embeddings or nn.Embedding(
            vocab_size, model_dim)
        if token_embeddings is None:
            self.token_embeddings.weight.data.normal_(mean=0.0, std=1.0)
        self.encoder: T5EncoderStack = encoder.construct(
            token_embeddings=self.token_embeddings,
            ddp_accelerator=ddp_accelerator,
            checkpoint_wrapper=checkpoint_wrapper,
        )
        self.decoder: T5DecoderStack = decoder.construct(
            token_embeddings=self.token_embeddings,
            ddp_accelerator=ddp_accelerator,
            checkpoint_wrapper=checkpoint_wrapper,
        )
        self.lm_head = nn.Linear(self.decoder.hidden_size,
                                 self.token_embeddings.num_embeddings,
                                 bias=False)
        if self._tie_word_embeddings:
            self.lm_head.weight = self.token_embeddings.weight

        self.loss_fct = CrossEntropyLoss(ignore_index=-100)

        self.decoder_start_token_id = decoder_start_token_id
        self.pad_token_id = pad_token_id
        self.eos_token_id = eos_token_id
        self.output_attentions = output_attentions
        self.output_all_hidden_states = output_all_hidden_states

        self.beam_search = beam_search.construct(end_index=self.eos_token_id)
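A sketch of how the `beam_search` default above mixes eager and deferred arguments: `beam_size` and `max_steps` are stored on the `Lazy` wrapper, while `end_index` is supplied at `construct()` time (the value 1 mirrors the default `eos_token_id` and is only illustrative).

from allennlp.common import Lazy
from allennlp.nn.beam_search import BeamSearch

lazy_beam_search = Lazy(BeamSearch, beam_size=3, max_steps=100)
beam_search = lazy_beam_search.construct(end_index=1)
assert beam_search.beam_size == 3 and beam_search.max_steps == 100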
Example #7
    def __init__(
        self,
        model: Model,
        train_data_path: DatasetReaderInput,
        train_dataset_reader: DatasetReader,
        *,
        test_dataset_reader: Optional[DatasetReader] = None,
        train_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        test_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        params_to_freeze: List[str] = None,
        cuda_device: int = -1,
        lissa_batch_size: int = 8,
        damping: float = 3e-3,
        num_samples: int = 1,
        recursion_depth: Union[float, int] = 0.25,
        scale: float = 1e4,
    ) -> None:
        super().__init__(
            model=model,
            train_data_path=train_data_path,
            train_dataset_reader=train_dataset_reader,
            test_dataset_reader=test_dataset_reader,
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            params_to_freeze=params_to_freeze,
            cuda_device=cuda_device,
        )

        self._lissa_dataloader = SimpleDataLoader(
            list(self._train_loader.iter_instances()),
            lissa_batch_size,
            shuffle=True,
            vocab=self.vocab,
        )
        self._lissa_dataloader.set_target_device(self.device)
        if isinstance(recursion_depth, float) and recursion_depth > 0.0:
            self._lissa_dataloader.batches_per_epoch = int(
                len(self._lissa_dataloader) * recursion_depth)
        elif isinstance(recursion_depth, int) and recursion_depth > 0:
            self._lissa_dataloader.batches_per_epoch = recursion_depth
        else:
            raise ValueError(
                "'recursion_depth' should be a positive int or float")

        self._damping = damping
        self._num_samples = num_samples
        self._recursion_depth = recursion_depth
        self._scale = scale
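A standalone restatement of the `recursion_depth` handling above with worked numbers; the helper function name is made up for the sketch.

def lissa_batches_per_epoch(num_lissa_batches: int, recursion_depth) -> int:
    # A float is interpreted as a fraction of one pass over the LiSSA data loader;
    # an int is used directly as the number of LiSSA update steps.
    if isinstance(recursion_depth, float) and recursion_depth > 0.0:
        return int(num_lissa_batches * recursion_depth)
    elif isinstance(recursion_depth, int) and recursion_depth > 0:
        return recursion_depth
    raise ValueError("'recursion_depth' should be a positive int or float")

assert lissa_batches_per_epoch(400, 0.25) == 100
assert lissa_batches_per_epoch(400, 50) == 50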
Example #8
    def __init__(
        self,
        model: Model,
        train_data_path: DatasetReaderInput,
        train_dataset_reader: DatasetReader,
        *,
        test_dataset_reader: Optional[DatasetReader] = None,
        train_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        test_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        params_to_freeze: Optional[List[str]] = None,
        cuda_device: int = -1,
    ) -> None:
        self.model = model
        self.vocab = model.vocab
        self.device = int_to_device(cuda_device)

        self._train_data_path = train_data_path
        self._train_loader = train_data_loader.construct(
            reader=train_dataset_reader,
            data_path=train_data_path,
            batch_size=1,
        )
        self._train_loader.set_target_device(self.device)
        self._train_loader.index_with(self.vocab)

        self._test_dataset_reader = test_dataset_reader or train_dataset_reader
        self._lazy_test_data_loader = test_data_loader

        self.model.to(self.device)
        if params_to_freeze is not None:
            for name, param in self.model.named_parameters():
                if any(re.match(pattern, name) for pattern in params_to_freeze):
                    param.requires_grad = False

        # These variables are set when the corresponding public properties are accessed.
        # They are not set until we actually run the calculation, since some parameters might not be used.
        self._used_params: Optional[List[torch.nn.Parameter]] = None
        self._used_param_names: Optional[List[str]] = None
        self._train_instances: Optional[List[InstanceWithGrads]] = None
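A standalone sketch of the `params_to_freeze` behaviour above: each entry is treated as a regex and checked with `re.match` against parameter names. The tiny model and the pattern are placeholders.

import re
import torch.nn as nn

model = nn.Sequential(nn.Embedding(10, 4), nn.Linear(4, 2))
params_to_freeze = [r"0\."]   # freeze everything under the first submodule ("0.weight")
for name, param in model.named_parameters():
    if any(re.match(pattern, name) for pattern in params_to_freeze):
        param.requires_grad = False

assert not model[0].weight.requires_grad and model[1].weight.requires_grad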
Example #9
    def test_auto_regressive_seq_decoder_init(self):
        decoder_inout_dim = 4
        vocab, decoder_net = create_vocab_and_decoder_net(decoder_inout_dim)

        AutoRegressiveSeqDecoder(
            vocab,
            decoder_net,
            Embedding(num_embeddings=vocab.get_vocab_size(),
                      embedding_dim=decoder_inout_dim),
            beam_search=Lazy(BeamSearch, constructor_extras={"max_steps": 10}),
        )

        with pytest.raises(ConfigurationError):
            AutoRegressiveSeqDecoder(
                vocab,
                decoder_net,
                Embedding(num_embeddings=vocab.get_vocab_size(),
                          embedding_dim=decoder_inout_dim + 1),
                beam_search=Lazy(BeamSearch,
                                 constructor_extras={"max_steps": 10}),
            )
Example #10
    def test_auto_regressive_seq_decoder_indices_to_tokens(self):
        decoder_inout_dim = 4
        vocab, decoder_net = create_vocab_and_decoder_net(decoder_inout_dim)

        auto_regressive_seq_decoder = AutoRegressiveSeqDecoder(
            vocab,
            decoder_net,
            Embedding(num_embeddings=vocab.get_vocab_size(),
                      embedding_dim=decoder_inout_dim),
            beam_search=Lazy(BeamSearch, constructor_extras={"max_steps": 10}),
        )

        predictions = torch.tensor([[3, 2, 5, 0, 0], [2, 2, 3, 5, 0]])

        tokens_ground_truth = [["B", "A"], ["A", "A", "B"]]
        predicted_tokens = auto_regressive_seq_decoder.indices_to_tokens(
            predictions.numpy())
        assert predicted_tokens == tokens_ground_truth
Example #11
    def test_auto_regressive_seq_decoder_tensor_and_token_based_metric(self):
        # Set all seeds to a fixed value (torch, numpy, etc.). This enables deterministic
        # behavior of the `auto_regressive_seq_decoder` below (i.e., parameter initialization
        # and `encoded_state = torch.randn(..)`).
        prepare_environment(Params({}))

        batch_size, time_steps, decoder_inout_dim = 2, 3, 4
        vocab, decoder_net = create_vocab_and_decoder_net(decoder_inout_dim)

        auto_regressive_seq_decoder = AutoRegressiveSeqDecoder(
            vocab,
            decoder_net,
            Embedding(num_embeddings=vocab.get_vocab_size(),
                      embedding_dim=decoder_inout_dim),
            beam_search=Lazy(BeamSearch,
                             constructor_extras={
                                 "max_steps": 10,
                                 "beam_size": 4
                             }),
            tensor_based_metric=BLEU(),
            token_based_metric=DummyMetric(),
        ).eval()

        encoded_state = torch.randn(batch_size, time_steps, decoder_inout_dim)
        source_mask = torch.ones(batch_size, time_steps).bool()
        target_tokens = {
            "tokens": {
                "tokens": torch.ones(batch_size, time_steps).long()
            }
        }
        source_mask[0, 1:] = False
        encoder_out = {
            "source_mask": source_mask,
            "encoder_outputs": encoded_state
        }

        auto_regressive_seq_decoder.forward(encoder_out, target_tokens)
        assert auto_regressive_seq_decoder.get_metrics()["BLEU"] == 1.388809517005903e-11
        assert auto_regressive_seq_decoder.get_metrics()["em"] == 0.0
        assert auto_regressive_seq_decoder.get_metrics()["f1"] == 1 / 3
Example #12
    def test_auto_regressive_seq_decoder_forward(self):
        batch_size, time_steps, decoder_inout_dim = 2, 3, 4
        vocab, decoder_net = create_vocab_and_decoder_net(decoder_inout_dim)

        auto_regressive_seq_decoder = AutoRegressiveSeqDecoder(
            vocab,
            decoder_net,
            Embedding(num_embeddings=vocab.get_vocab_size(),
                      embedding_dim=decoder_inout_dim),
            beam_search=Lazy(BeamSearch,
                             constructor_extras={
                                 "max_steps": 10,
                                 "beam_size": 4
                             }),
        )

        encoded_state = torch.rand(batch_size, time_steps, decoder_inout_dim)
        source_mask = torch.ones(batch_size, time_steps).bool()
        target_tokens = {
            "tokens": {
                "tokens": torch.ones(batch_size, time_steps).long()
            }
        }
        source_mask[0, 1:] = False
        encoder_out = {
            "source_mask": source_mask,
            "encoder_outputs": encoded_state
        }

        assert auto_regressive_seq_decoder.forward(encoder_out) == {}
        loss = auto_regressive_seq_decoder.forward(encoder_out,
                                                   target_tokens)["loss"]
        assert loss.shape == torch.Size([]) and loss.requires_grad
        auto_regressive_seq_decoder.eval()
        assert "predictions" in auto_regressive_seq_decoder.forward(
            encoder_out)
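The tests above pass `BeamSearch` settings through `constructor_extras`; with the `Lazy` used elsewhere in these listings that appears interchangeable with plain keyword arguments. A quick check (the `end_index` value is a placeholder):

from allennlp.common import Lazy
from allennlp.nn.beam_search import BeamSearch

via_extras = Lazy(BeamSearch, constructor_extras={"max_steps": 10, "beam_size": 4})
via_kwargs = Lazy(BeamSearch, max_steps=10, beam_size=4)
assert via_extras.construct(end_index=1).beam_size == via_kwargs.construct(end_index=1).beam_size == 4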
Example #13
    def from_partial_objects(
        cls,
        serialization_dir: str,
        local_rank: int,
        dataset_reader: DatasetReader,
        train_data_path: Any,
        model: Lazy[Model],
        data_loader: Lazy[DataLoader],
        trainer: Lazy[Trainer],
        vocabulary: Lazy[Vocabulary] = Lazy(Vocabulary),
        datasets_for_vocab_creation: List[str] = None,
        validation_dataset_reader: DatasetReader = None,
        validation_data_path: Any = None,
        validation_data_loader: Lazy[DataLoader] = None,
        test_data_path: Any = None,
        evaluate_on_test: bool = False,
        batch_weight_key: str = "",
        ddp_accelerator: Optional[DdpAccelerator] = None,
    ) -> "TrainModel":
        """
        This method is intended for use with our `FromParams` logic, to construct a `TrainModel`
        object from a config file passed to the `allennlp train` command.  The arguments to this
        method are the allowed top-level keys in a configuration file (except for the first two,
        which are obtained separately).

        You *could* use this outside of our `FromParams` logic if you really want to, but there
        might be easier ways to accomplish your goal than instantiating `Lazy` objects.  If you are
        writing your own training loop, we recommend that you look at the implementation of this
        method for inspiration and possibly some utility functions you can call, but you very likely
        should not use this method directly.

        The `Lazy` type annotations here are a mechanism for building dependencies to an object
        sequentially - the `TrainModel` object needs data, a model, and a trainer, but the model
        needs to see the data before it's constructed (to create a vocabulary) and the trainer needs
        the data and the model before it's constructed.  Objects that have sequential dependencies
        like this are labeled as `Lazy` in their type annotations, and we pass the missing
        dependencies when we call their `construct()` method, which you can see in the code below.

        # Parameters

        serialization_dir: `str`
            The directory where logs and model archives will be saved.

            In a typical AllenNLP configuration file, this parameter does not get an entry as a
            top-level key, it gets passed in separately.

        local_rank: `int`
            The process index that is initialized using the GPU device id.

            In a typical AllenNLP configuration file, this parameter does not get an entry as a
            top-level key, it gets passed in separately.

        dataset_reader: `DatasetReader`
            The `DatasetReader` that will be used for training and (by default) for validation.

        train_data_path: `str`
            The file (or directory) that will be passed to `dataset_reader.read()` to construct the
            training data.

        model: `Lazy[Model]`
            The model that we will train.  This is lazy because it depends on the `Vocabulary`;
            after constructing the vocabulary we call `model.construct(vocab=vocabulary)`.

        data_loader: `Lazy[DataLoader]`
            The data_loader we use to batch instances from the dataset reader at training and (by
            default) validation time. This is lazy because it takes a dataset in its constructor.

        trainer: `Lazy[Trainer]`
            The `Trainer` that actually implements the training loop.  This is a lazy object because
            it depends on the model that's going to be trained.

        vocabulary: `Lazy[Vocabulary]`, optional (default=`Lazy(Vocabulary)`)
            The `Vocabulary` that we will use to convert strings in the data to integer ids (and
            possibly set sizes of embedding matrices in the `Model`).  By default we construct the
            vocabulary from the instances that we read.

        datasets_for_vocab_creation: `List[str]`, optional (default=`None`)
            If you pass in more than one dataset but don't want to use all of them to construct a
            vocabulary, you can pass in this key to limit it.  Valid entries in the list are
            "train", "validation" and "test".

        validation_dataset_reader: `DatasetReader`, optional (default=`None`)
            If given, we will use this dataset reader for the validation data instead of
            `dataset_reader`.

        validation_data_path: `str`, optional (default=`None`)
            If given, we will use this data for computing validation metrics and early stopping.

        validation_data_loader: `Lazy[DataLoader]`, optional (default=`None`)
            If given, the data_loader we use to batch instances from the dataset reader at
            validation and test time. This is lazy because it takes a dataset in its constructor.

        test_data_path: `str`, optional (default=`None`)
            If given, we will use this as test data.  This makes it available for vocab creation by
            default, but nothing else.

        evaluate_on_test: `bool`, optional (default=`False`)
            If given, we will evaluate the final model on this data at the end of training.  Note
            that we do not recommend using this for actual test data in every-day experimentation;
            you should only very rarely evaluate your model on actual test data.

        batch_weight_key: `str`, optional (default=`""`)
            The name of metric used to weight the loss on a per-batch basis.  This is only used
            during evaluation on final test data, if you've specified `evaluate_on_test=True`.

        ddp_accelerator : `Optional[DdpAccelerator]`, optional (default = `None`)
            A `DdpAccelerator` to use in distributed trainer. Passed to the model and the trainer.

        """
        # Train data loader.
        data_loaders: Dict[str, DataLoader] = {
            "train": data_loader.construct(reader=dataset_reader, data_path=train_data_path)
        }

        # Validation data loader.
        if validation_data_path is not None:
            validation_dataset_reader = validation_dataset_reader or dataset_reader
            if validation_data_loader is not None:
                data_loaders["validation"] = validation_data_loader.construct(
                    reader=validation_dataset_reader,
                    data_path=validation_data_path)
            else:
                data_loaders["validation"] = data_loader.construct(
                    reader=validation_dataset_reader,
                    data_path=validation_data_path)
                if getattr(data_loaders["validation"], "batches_per_epoch",
                           None) is not None:
                    warnings.warn(
                        "Using 'data_loader' params to construct validation data loader since "
                        "'validation_data_loader' params not specified, but you have "
                        "'data_loader.batches_per_epoch' set which may result in different "
                        "validation datasets for each epoch.",
                        UserWarning,
                    )

        # Test data loader.
        if test_data_path is not None:
            test_dataset_reader = validation_dataset_reader or dataset_reader
            if validation_data_loader is not None:
                data_loaders["test"] = validation_data_loader.construct(
                    reader=test_dataset_reader, data_path=test_data_path)
            else:
                data_loaders["test"] = data_loader.construct(
                    reader=test_dataset_reader, data_path=test_data_path)

        if datasets_for_vocab_creation:
            for key in datasets_for_vocab_creation:
                if key not in data_loaders:
                    raise ConfigurationError(
                        f"invalid 'dataset_for_vocab_creation' {key}")

            logger.info(
                "From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation),
            )

        instance_generator = (instance
                              for key, data_loader in data_loaders.items()
                              if datasets_for_vocab_creation is None
                              or key in datasets_for_vocab_creation
                              for instance in data_loader.iter_instances())

        vocabulary_ = vocabulary.construct(instances=instance_generator)

        model_ = model.construct(vocab=vocabulary_,
                                 serialization_dir=serialization_dir,
                                 ddp_accelerator=ddp_accelerator)

        # Initializing the model can have side effect of expanding the vocabulary.
        # Save the vocab only in the primary. In the degenerate non-distributed
        # case, we're trivially the primary. In the distributed case this is safe
        # to do without worrying about race conditions since saving and loading
        # the vocab involves acquiring a file lock.
        if local_rank == 0:
            vocabulary_path = os.path.join(serialization_dir, "vocabulary")
            vocabulary_.save_to_files(vocabulary_path)

        for data_loader_ in data_loaders.values():
            data_loader_.index_with(model_.vocab)

        trainer_ = trainer.construct(
            serialization_dir=serialization_dir,
            model=model_,
            data_loader=data_loaders["train"],
            validation_data_loader=data_loaders.get("validation"),
            local_rank=local_rank,
            ddp_accelerator=ddp_accelerator,
        )
        assert trainer_ is not None

        return cls(
            serialization_dir=serialization_dir,
            model=model_,
            trainer=trainer_,
            evaluation_data_loader=data_loaders.get("test"),
            evaluate_on_test=evaluate_on_test,
            batch_weight_key=batch_weight_key,
        )
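A standalone sketch of the sequential-dependency pattern the docstring above describes, using stand-in classes rather than real AllenNLP components:

from allennlp.common import Lazy

class Vocab:
    def __init__(self, instances):
        self.instances = list(instances)

class Model:
    def __init__(self, vocab: Vocab):
        self.vocab = vocab

lazy_model = Lazy(Model)                    # can't be built yet: it still needs a vocabulary
vocab = Vocab(instances=["a", "b"])         # the data-dependent piece is built first
model = lazy_model.construct(vocab=vocab)   # the missing dependency is supplied at construct() time
assert model.vocab is vocab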
Example #14
def test_run_steps_programmatically(step_cache_class):
    from allennlp.data.dataset_readers import SequenceTaggingDatasetReader
    from allennlp.tango.dataset import DatasetReaderAdapterStep
    from allennlp.tango import TrainingStep
    from allennlp.common import Lazy
    from allennlp.training.optimizers import AdamOptimizer
    from allennlp.tango.dataloader import BatchSizeDataLoader
    from allennlp.models import SimpleTagger
    from allennlp.tango import EvaluationStep

    dataset_step = DatasetReaderAdapterStep(
        reader=SequenceTaggingDatasetReader(),
        splits={
            "train": "test_fixtures/data/sequence_tagging.tsv",
            "validation": "test_fixtures/data/sequence_tagging.tsv",
        },
    )
    training_step = TrainingStep(
        model=Lazy(
            SimpleTagger,
            Params({
                "text_field_embedder": {
                    "token_embedders": {
                        "tokens": {
                            "type": "embedding",
                            "projection_dim": 2,
                            "pretrained_file":
                            "test_fixtures/embeddings/glove.6B.100d.sample.txt.gz",
                            "embedding_dim": 100,
                            "trainable": True,
                        }
                    }
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": 2,
                    "hidden_size": 4,
                    "num_layers": 1
                },
            }),
        ),
        dataset=dataset_step,
        data_loader=Lazy(BatchSizeDataLoader, Params({"batch_size": 2})),
        optimizer=Lazy(AdamOptimizer),
    )
    evaluation_step = EvaluationStep(dataset=dataset_step,
                                     model=training_step,
                                     step_name="evaluation")

    with TemporaryDirectory(prefix="test_run_steps_programmatically-") as d:
        if step_cache_class == DirectoryStepCache:
            cache = DirectoryStepCache(d)
        else:
            cache = step_cache_class()

        assert "random object" not in cache
        assert dataset_step not in cache
        assert training_step not in cache
        assert evaluation_step not in cache
        assert len(cache) == 0
        with pytest.raises(KeyError):
            _ = cache[evaluation_step]

        assert tango_dry_run(evaluation_step, cache) == [
            (dataset_step, False),
            (training_step, False),
            (evaluation_step, False),
        ]
        training_step.ensure_result(cache)
        assert tango_dry_run(evaluation_step, cache) == [
            (dataset_step, True),
            (training_step, True),
            (evaluation_step, False),
        ]

        assert "random object" not in cache
        assert dataset_step in cache
        assert training_step in cache
        assert evaluation_step not in cache
        assert len(cache) == 2
        with pytest.raises(KeyError):
            _ = cache[evaluation_step]
Example #15
File: t5.py Project: himkt/allennlp
    def _from_config(cls, config: "PretrainedConfig", **kwargs):
        attention_kwargs = {
            "hidden_size": config.d_model,
            "key_value_proj_dim": config.d_kv,
            "num_heads": config.num_heads,
            "relative_attention_num_buckets": config.relative_attention_num_buckets,
            "dropout": config.dropout_rate,
        }
        layer_norm_kwargs = {
            "hidden_size": config.d_model,
            "eps": config.layer_norm_epsilon,
        }
        block_ff = Lazy(
            T5LayerFF,
            params=Params({
                "ff_proj": {
                    "type": config.feed_forward_proj,
                    "hidden_size": config.d_model,
                    "ff_size": config.d_ff,
                    "dropout": config.dropout_rate,
                },
                "layer_norm": layer_norm_kwargs,
                "dropout": config.dropout_rate,
            }),
        )
        return cls(
            encoder=Lazy(
                T5EncoderStack.basic_encoder,
                constructor_extras={
                    "num_blocks": config.num_layers,
                    "block_self_attention": Lazy(T5Attention, constructor_extras=attention_kwargs),
                    "final_layer_norm": T5LayerNorm(**layer_norm_kwargs),
                    "block_ff": block_ff,
                    "dropout": config.dropout_rate,
                },
            ),
            decoder=Lazy(
                T5DecoderStack.basic_decoder,
                constructor_extras={
                    "num_blocks": config.num_decoder_layers,
                    "block_self_attention": Lazy(T5Attention, constructor_extras=attention_kwargs),
                    "block_cross_attention": Lazy(T5Attention, constructor_extras=attention_kwargs),
                    "final_layer_norm": T5LayerNorm(**layer_norm_kwargs),
                    "block_ff": block_ff,
                    "dropout": config.dropout_rate,
                },
            ),
            decoder_start_token_id=config.decoder_start_token_id,
            pad_token_id=config.pad_token_id,
            eos_token_id=config.eos_token_id,
            vocab_size=config.vocab_size,
            model_dim=config.d_model,
            tie_word_embeddings=kwargs.pop("tie_word_embeddings", config.tie_word_embeddings),
            **kwargs,
        )
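A sketch of the two ways the method above parameterises `Lazy`: `params` carries a `Params` blob that is resolved through `FromParams` at `construct()` time, while `constructor_extras` (or plain keyword arguments) go straight to the constructor. The sizes and the "relu" projection type stand in for the values read from `config` and are assumptions.

from allennlp.common import Lazy, Params
from allennlp.modules.transformer.t5 import T5LayerFF, T5LayerNorm

block_ff = Lazy(
    T5LayerFF,
    params=Params({
        "ff_proj": {"type": "relu", "hidden_size": 512, "ff_size": 2048, "dropout": 0.1},
        "layer_norm": {"hidden_size": 512, "eps": 1e-6},
        "dropout": 0.1,
    }),
)
ff_layer = block_ff.construct()   # resolved through T5LayerFF.from_params

final_norm = Lazy(T5LayerNorm, constructor_extras={"hidden_size": 512, "eps": 1e-6}).construct()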
Example #16
    def from_partial_objects(
            cls,
            model: Model,
            serialization_dir: str,
            data_loader: DataLoader,
            validation_data_loader: DataLoader = None,
            local_rank: int = 0,
            patience: int = None,
            validation_metric: Union[str, List[str]] = "-loss",
            num_epochs: int = 20,
            cuda_device: Optional[Union[int, torch.device]] = None,
            grad_norm: float = None,
            grad_clipping: float = None,
            distributed: bool = False,
            world_size: int = 1,
            num_gradient_accumulation_steps: int = 1,
            use_amp: bool = False,
            no_grad: List[str] = None,
            optimizer: Lazy[Optimizer] = Lazy(Optimizer.default),
            learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
            momentum_scheduler: Lazy[MomentumScheduler] = None,
            moving_average: Lazy[MovingAverage] = None,
            checkpointer: Lazy[Checkpointer] = Lazy(Checkpointer),
            callbacks: List[Lazy[TrainerCallback]] = None,
            enable_default_callbacks: bool = True,
            run_sanity_checks: bool = True,
    ) -> "Trainer":
        """
        This method exists so that we can have a documented method to construct this class using
        `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
        method.

        The reason we can't just use `__init__` with `FromParams` here is because there are
        sequential dependencies to this class's arguments.  Anything that has a `Lazy[]` type
        annotation needs something from one of the non-`Lazy` arguments.  The `Optimizer` needs to
        have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
        have the `Optimizer`. Because of this, the typical way we construct things with `FromParams`
        doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

        If you're not using `FromParams`, you can just construct these arguments in the right order
        yourself in your code and call the constructor directly.
        """
        if cuda_device is None:
            from torch import cuda

            if cuda.device_count() > 0:
                cuda_device = 0
            else:
                cuda_device = -1

        check_for_gpu(cuda_device)
        if cuda_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(cuda_device)

        if no_grad:
            for name, parameter in model.named_parameters():
                if any(re.search(regex, name) for regex in no_grad):
                    parameter.requires_grad_(False)

        parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
        optimizer_ = optimizer.construct(model_parameters=parameters)

        common_util.log_frozen_and_tunable_parameter_names(model)

        batches_per_epoch: Optional[int]
        try:
            batches_per_epoch = len(data_loader)
            batches_per_epoch = math.ceil(batches_per_epoch / num_gradient_accumulation_steps)
        except TypeError:
            batches_per_epoch = None

        moving_average_ = (
            None if moving_average is None else moving_average.construct(parameters=parameters)
        )
        learning_rate_scheduler_ = (
            None
            if learning_rate_scheduler is None
            else learning_rate_scheduler.construct(
                optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
            )
        )
        momentum_scheduler_ = (
            None
            if momentum_scheduler is None
            else momentum_scheduler.construct(optimizer=optimizer_)
        )
        checkpointer_ = checkpointer.construct(serialization_dir=serialization_dir)

        callbacks_: List[TrainerCallback] = []
        for callback_ in callbacks or []:
            callbacks_.append(callback_.construct(serialization_dir=serialization_dir))

        return cls(
            model,
            optimizer_,
            data_loader,
            patience=patience,
            validation_metric=validation_metric,
            validation_data_loader=validation_data_loader,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=learning_rate_scheduler_,
            momentum_scheduler=momentum_scheduler_,
            checkpointer=checkpointer_,
            moving_average=moving_average_,
            callbacks=callbacks_,
            distributed=distributed,
            local_rank=local_rank,
            world_size=world_size,
            num_gradient_accumulation_steps=num_gradient_accumulation_steps,
            use_amp=use_amp,
            enable_default_callbacks=enable_default_callbacks,
            run_sanity_checks=run_sanity_checks,
        )
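A standalone sketch of the ordering the docstring above describes: the optimizer needs the model's parameters and the scheduler needs the optimizer, so each `Lazy` is constructed only once its dependency exists. The tiny model and hyperparameters are placeholders.

import torch.nn as nn
from allennlp.common import Lazy
from allennlp.training.optimizers import AdamOptimizer
from allennlp.training.learning_rate_schedulers import SlantedTriangular

model = nn.Linear(4, 2)
parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]

optimizer = Lazy(AdamOptimizer).construct(model_parameters=parameters)
scheduler = Lazy(SlantedTriangular).construct(
    optimizer=optimizer, num_epochs=5, num_steps_per_epoch=100
)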
Example #17
    def __init__(
        self,
        vocab: Vocabulary,
        decoder_net: DecoderNet,
        target_embedder: Embedding,
        target_namespace: str = "tokens",
        beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
        tie_output_embedding: bool = False,
        scheduled_sampling_ratio: float = 0,
        label_smoothing_ratio: Optional[float] = None,
        tensor_based_metric: Metric = None,
        token_based_metric: Metric = None,
        **kwargs
    ) -> None:
        super().__init__(target_embedder)

        self._vocab = vocab

        # Decodes the sequence of encoded hidden states into a new sequence of hidden states.
        self._decoder_net = decoder_net
        self._target_namespace = target_namespace
        self._label_smoothing_ratio = label_smoothing_ratio

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self._vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self._vocab.get_token_index(END_SYMBOL, self._target_namespace)
        # For backwards compatibility, check if beam_size or max_decoding_steps were passed in as
        # kwargs. If so, update the BeamSearch object before constructing and raise a DeprecationWarning
        deprecation_warning = (
            "The parameter {} has been deprecated."
            " Provide this parameter as argument to beam_search instead."
        )
        beam_search_extras = {}
        if "beam_size" in kwargs:
            beam_search_extras["beam_size"] = kwargs["beam_size"]
            warnings.warn(deprecation_warning.format("beam_size"), DeprecationWarning)
        if "max_decoding_steps" in kwargs:
            beam_search_extras["max_steps"] = kwargs["max_decoding_steps"]
            warnings.warn(deprecation_warning.format("max_decoding_steps"), DeprecationWarning)
        self._beam_search = beam_search.construct(
            end_index=self._end_index, vocab=self._vocab, **beam_search_extras
        )

        target_vocab_size = self._vocab.get_vocab_size(self._target_namespace)

        if self.target_embedder.get_output_dim() != self._decoder_net.target_embedding_dim:
            raise ConfigurationError(
                "Target Embedder output_dim doesn't match decoder module's input."
            )

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(
            self._decoder_net.get_output_dim(), target_vocab_size
        )

        if tie_output_embedding:
            if self._output_projection_layer.weight.shape != self.target_embedder.weight.shape:
                raise ConfigurationError(
                    "Can't tie embeddings with output linear layer, due to shape mismatch"
                )
            self._output_projection_layer.weight = self.target_embedder.weight

        # These metrics will be updated during training and validation
        self._tensor_based_metric = tensor_based_metric
        self._token_based_metric = token_based_metric

        self._scheduled_sampling_ratio = scheduled_sampling_ratio
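A standalone sketch of the deprecation shim above: old-style `beam_size`/`max_decoding_steps` kwargs are collected into `beam_search_extras` and folded into the `BeamSearch` construction. The `end_index` value is a placeholder.

from allennlp.common import Lazy
from allennlp.nn.beam_search import BeamSearch

kwargs = {"beam_size": 5, "max_decoding_steps": 20}   # old-style arguments a caller might still pass
beam_search_extras = {}
if "beam_size" in kwargs:
    beam_search_extras["beam_size"] = kwargs["beam_size"]
if "max_decoding_steps" in kwargs:
    beam_search_extras["max_steps"] = kwargs["max_decoding_steps"]

beam_search = Lazy(BeamSearch).construct(end_index=1, **beam_search_extras)
assert beam_search.beam_size == 5 and beam_search.max_steps == 20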