def from_archive(
    cls,
    archive: Archive,
    *,
    interpreter_name: Optional[str] = None,
    train_data_path: Optional[DatasetReaderInput] = None,
    train_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    test_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    params_to_freeze: Optional[List[str]] = None,
    cuda_device: int = -1,
    **extras,
) -> "InfluenceInterpreter":
    """
    Load an `InfluenceInterpreter` from an `Archive`.

    The other parameters are the same as `.from_path()`.
    """
    interpreter_cls = cls.by_name(interpreter_name or cls.default_implementation)
    return interpreter_cls(
        model=archive.model,
        train_data_path=train_data_path or archive.config["train_data_path"],
        train_dataset_reader=archive.dataset_reader,
        test_dataset_reader=archive.validation_dataset_reader,
        train_data_loader=train_data_loader,
        test_data_loader=test_data_loader,
        params_to_freeze=params_to_freeze,
        cuda_device=cuda_device,
        **extras,
    )
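
# Illustrative usage sketch for the classmethod above (not from the original source):
# load an Archive first, then hand it to from_archive. The path "model.tar.gz" is a
# placeholder, "simple-influence" is assumed to be the registered name of the
# SimpleInfluence implementation, and the import path is assumed for AllenNLP >= 2.x.
from allennlp.interpret.influence_interpreters import InfluenceInterpreter
from allennlp.models.archival import load_archive

archive = load_archive("model.tar.gz", cuda_device=-1)
interpreter = InfluenceInterpreter.from_archive(archive, interpreter_name="simple-influence")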
def basic_encoder(
    cls,
    token_embeddings: nn.Embedding,
    num_blocks: int = 6,
    block_self_attention: Lazy[T5Attention] = Lazy(T5Attention),
    final_layer_norm: Optional[T5LayerNorm] = None,
    block_ff: Lazy[T5LayerFF] = Lazy(T5LayerFF),
    dropout: float = 0.1,
    ddp_accelerator: Optional[DdpAccelerator] = None,
    checkpoint_wrapper: Optional[CheckpointWrapper] = None,
) -> "T5EncoderStack":
    if ddp_accelerator is not None:
        logger.info("Initializing T5 encoder with DdpAccelerator %s", ddp_accelerator)
    blocks: List[T5Block] = []
    for i in range(num_blocks):
        block = T5Block(
            attention=T5LayerSelfAttention(
                self_attention=block_self_attention.construct(
                    is_decoder=False,
                    has_relative_attention_bias=(i == 0),
                )
            ),
            cross_attention=None,
            ff=block_ff.construct(),
        )
        if checkpoint_wrapper is not None:
            block = checkpoint_wrapper.wrap_module(block)
        if ddp_accelerator is not None:
            block = ddp_accelerator.wrap_module(block)
        blocks.append(block)
    return cls(token_embeddings, blocks, final_layer_norm=final_layer_norm, dropout=dropout)
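
# Illustrative usage sketch for basic_encoder (not from the original source). It assumes
# that the default hyperparameters of T5Attention and T5LayerFF mirror t5-small (model
# dim 512), so the embedding dimension below is chosen to match, and that the classes
# live in allennlp.modules.transformer.t5. Adjust sizes if those defaults differ.
import torch.nn as nn
from allennlp.common import Lazy
from allennlp.modules.transformer.t5 import T5Attention, T5EncoderStack

token_embeddings = nn.Embedding(32128, 512)  # (vocab_size, model_dim)
encoder = T5EncoderStack.basic_encoder(
    token_embeddings,
    num_blocks=2,                            # a small stack, just for illustration
    block_self_attention=Lazy(T5Attention),  # is_decoder / relative bias get filled in per block
    dropout=0.1,
)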
def from_path(
    cls,
    archive_path: Union[str, PathLike],
    *,
    interpreter_name: Optional[str] = None,
    train_data_path: Optional[DatasetReaderInput] = None,
    train_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    test_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    params_to_freeze: Optional[List[str]] = None,
    cuda_device: int = -1,
    import_plugins: bool = True,
    overrides: Union[str, Dict[str, Any]] = "",
    **extras,
) -> "InfluenceInterpreter":
    """
    Load an `InfluenceInterpreter` from an archive path.

    # Parameters

    archive_path : `Union[str, PathLike]`, required
        The path to the archive file.
    interpreter_name : `Optional[str]`, optional (default = `None`)
        The registered name of the interpreter class to use. If not specified,
        the default implementation (`SimpleInfluence`) will be used.
    train_data_path : `Optional[DatasetReaderInput]`, optional (default = `None`)
        If not specified, `train_data_path` will be taken from the archive's config.
    train_data_loader : `Lazy[DataLoader]`, optional (default = `Lazy(SimpleDataLoader.from_dataset_reader)`)
    test_data_loader : `Lazy[DataLoader]`, optional (default = `Lazy(SimpleDataLoader.from_dataset_reader)`)
    params_to_freeze : `Optional[List[str]]`, optional (default = `None`)
    cuda_device : `int`, optional (default = `-1`)
    import_plugins : `bool`, optional (default = `True`)
        If `True`, we attempt to import plugins before loading the `InfluenceInterpreter`.
        This comes with additional overhead, but means you don't need to explicitly import
        the modules that your implementation depends on as long as those modules can be
        found by `allennlp.common.plugins.import_plugins()`.
    overrides : `Union[str, Dict[str, Any]]`, optional (default = `""`)
        JSON overrides to apply to the unarchived `Params` object.
    **extras : `Any`
        Extra parameters to pass to the interpreter's `__init__()` method.
    """
    if import_plugins:
        plugins.import_plugins()
    return cls.from_archive(
        load_archive(archive_path, cuda_device=cuda_device, overrides=overrides),
        interpreter_name=interpreter_name,
        train_data_path=train_data_path,
        train_data_loader=train_data_loader,
        test_data_loader=test_data_loader,
        params_to_freeze=params_to_freeze,
        cuda_device=cuda_device,
        **extras,
    )
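
# Illustrative usage sketch for the typical entry point (not from the original source):
# load an interpreter straight from an archive file in one call. "model.tar.gz" is a
# placeholder path, "simple-influence" is assumed to be SimpleInfluence's registered
# name (it can be omitted to fall back to the default implementation), and the import
# path is assumed for AllenNLP >= 2.x.
from allennlp.interpret.influence_interpreters import InfluenceInterpreter

interpreter = InfluenceInterpreter.from_path(
    "model.tar.gz",
    interpreter_name="simple-influence",
    cuda_device=-1,
)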
def test_from_params_in_trainer(self):
    # This is more of an integration test, making sure that a bunch of pieces fit together
    # correctly, but it matters most for this learning rate scheduler, so we're testing it here.
    params = Params(
        {
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        }
    )
    # The method called in the logic below only checks the length of this list, not its
    # contents, so this should be safe.
    instances = [1] * 40  # noqa: E231, flake doesn't like what black does with this list
    optim = self._get_optimizer()
    trainer = TrainerBase.from_params(
        model=self.model,
        optimizer=Lazy(lambda **kwargs: optim),
        serialization_dir=self.TEST_DIR,
        params=params,
        iterator=BasicIterator(batch_size=10),
        train_data=instances,
    )
    assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular)

    # This is what we wrote this test for: to be sure that num_epochs is passed correctly, and
    # that num_steps_per_epoch is computed and passed correctly. This logic happens inside of
    # `Trainer.from_partial_objects`.
    assert trainer._learning_rate_scheduler.num_epochs == 5
    assert trainer._learning_rate_scheduler.num_steps_per_epoch == 4

    # And we'll do one more to make sure that we can override num_epochs in the scheduler if we
    # really want to. Not sure why you would ever want to in this case; this is just testing
    # the functionality.
    params = Params(
        {
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "num_epochs": 3,
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        }
    )
    trainer = TrainerBase.from_params(
        model=self.model,
        optimizer=Lazy(lambda **kwargs: optim),
        serialization_dir=self.TEST_DIR,
        params=params,
        iterator=BasicIterator(batch_size=10),
        train_data=instances,
    )
    assert trainer._learning_rate_scheduler.num_epochs == 3
def __init__(
    self,
    lazy1: Lazy[ConstructedObject],
    lazy2: Lazy[ConstructedObject] = Lazy(ConstructedObject),
    lazy3: Lazy[ConstructedObject] = None,
    lazy4: Optional[Lazy[ConstructedObject]] = Lazy(ConstructedObject),
) -> None:
    self.lazy1 = lazy1.construct()
    self.lazy2 = lazy2.construct(a=2)
    self.lazy3 = None if lazy3 is None else lazy3.construct()
    self.lazy4 = None if lazy4 is None else lazy4.construct(a=1)
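
# Illustrative sketch of the Lazy semantics relied on above (not from the original
# source): construction is deferred until .construct() is called, and keyword arguments
# supplied there are forwarded to the wrapped constructor. `Thing` is a hypothetical
# stand-in for ConstructedObject.
from allennlp.common import Lazy


class Thing:
    def __init__(self, a: int = 0) -> None:
        self.a = a


lazy_thing = Lazy(Thing)
thing = lazy_thing.construct(a=2)  # equivalent to Thing(a=2)
assert thing.a == 2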
def __init__(
    self,
    token_embeddings: Optional[nn.Embedding] = None,
    encoder: Lazy[T5EncoderStack] = Lazy(T5EncoderStack),
    decoder: Lazy[T5DecoderStack] = Lazy(T5DecoderStack),
    decoder_start_token_id: int = 0,
    pad_token_id: int = 0,  # These are both 0 in t5-(small|base|large). Go figure.
    eos_token_id: int = 1,
    vocab_size: int = 32128,
    model_dim: int = 512,
    output_attentions: bool = False,
    output_all_hidden_states: bool = False,
    beam_search: Lazy[BeamSearch] = Lazy(BeamSearch, beam_size=3, max_steps=100),
    ddp_accelerator: Optional[DdpAccelerator] = None,
    checkpoint_wrapper: Optional[CheckpointWrapper] = None,
    tie_word_embeddings: bool = True,
):
    super().__init__()
    self._tie_word_embeddings = tie_word_embeddings

    self.model_dim = model_dim
    self.token_embeddings = token_embeddings or nn.Embedding(vocab_size, model_dim)
    if token_embeddings is None:
        self.token_embeddings.weight.data.normal_(mean=0.0, std=1.0)
    self.encoder: T5EncoderStack = encoder.construct(
        token_embeddings=self.token_embeddings,
        ddp_accelerator=ddp_accelerator,
        checkpoint_wrapper=checkpoint_wrapper,
    )
    self.decoder: T5DecoderStack = decoder.construct(
        token_embeddings=self.token_embeddings,
        ddp_accelerator=ddp_accelerator,
        checkpoint_wrapper=checkpoint_wrapper,
    )
    self.lm_head = nn.Linear(
        self.decoder.hidden_size, self.token_embeddings.num_embeddings, bias=False
    )
    if self._tie_word_embeddings:
        self.lm_head.weight = self.token_embeddings.weight

    self.loss_fct = CrossEntropyLoss(ignore_index=-100)
    self.decoder_start_token_id = decoder_start_token_id
    self.pad_token_id = pad_token_id
    self.eos_token_id = eos_token_id
    self.output_attentions = output_attentions
    self.output_all_hidden_states = output_all_hidden_states

    self.beam_search = beam_search.construct(end_index=self.eos_token_id)
def __init__(
    self,
    model: Model,
    train_data_path: DatasetReaderInput,
    train_dataset_reader: DatasetReader,
    *,
    test_dataset_reader: Optional[DatasetReader] = None,
    train_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    test_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    params_to_freeze: List[str] = None,
    cuda_device: int = -1,
    lissa_batch_size: int = 8,
    damping: float = 3e-3,
    num_samples: int = 1,
    recursion_depth: Union[float, int] = 0.25,
    scale: float = 1e4,
) -> None:
    super().__init__(
        model=model,
        train_data_path=train_data_path,
        train_dataset_reader=train_dataset_reader,
        test_dataset_reader=test_dataset_reader,
        train_data_loader=train_data_loader,
        test_data_loader=test_data_loader,
        params_to_freeze=params_to_freeze,
        cuda_device=cuda_device,
    )

    self._lissa_dataloader = SimpleDataLoader(
        list(self._train_loader.iter_instances()),
        lissa_batch_size,
        shuffle=True,
        vocab=self.vocab,
    )
    self._lissa_dataloader.set_target_device(self.device)
    if isinstance(recursion_depth, float) and recursion_depth > 0.0:
        self._lissa_dataloader.batches_per_epoch = int(
            len(self._lissa_dataloader) * recursion_depth
        )
    elif isinstance(recursion_depth, int) and recursion_depth > 0:
        self._lissa_dataloader.batches_per_epoch = recursion_depth
    else:
        raise ValueError("'recursion_depth' should be a positive int or float")

    self._damping = damping
    self._num_samples = num_samples
    self._recursion_depth = recursion_depth
    self._scale = scale
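
# Worked example of the recursion_depth handling above (not from the original source):
# a float is a fraction of one pass over the LiSSA loader, an int is an absolute number
# of batches. With 1000 training instances and lissa_batch_size=8 there are 125 batches,
# so recursion_depth=0.25 caps each LiSSA pass at int(125 * 0.25) == 31 batches.
assert int((1000 // 8) * 0.25) == 31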
def __init__(
    self,
    model: Model,
    train_data_path: DatasetReaderInput,
    train_dataset_reader: DatasetReader,
    *,
    test_dataset_reader: Optional[DatasetReader] = None,
    train_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    test_data_loader: Lazy[DataLoader] = Lazy(SimpleDataLoader.from_dataset_reader),
    params_to_freeze: Optional[List[str]] = None,
    cuda_device: int = -1,
) -> None:
    self.model = model
    self.vocab = model.vocab
    self.device = int_to_device(cuda_device)

    self._train_data_path = train_data_path
    self._train_loader = train_data_loader.construct(
        reader=train_dataset_reader,
        data_path=train_data_path,
        batch_size=1,
    )
    self._train_loader.set_target_device(self.device)
    self._train_loader.index_with(self.vocab)

    self._test_dataset_reader = test_dataset_reader or train_dataset_reader
    self._lazy_test_data_loader = test_data_loader

    self.model.to(self.device)

    if params_to_freeze is not None:
        for name, param in self.model.named_parameters():
            if any([re.match(pattern, name) for pattern in params_to_freeze]):
                param.requires_grad = False

    # These variables are set when the corresponding public properties are accessed.
    # They are not populated until we actually run the calculation, since some
    # parameters might not be used.
    self._used_params: Optional[List[torch.nn.Parameter]] = None
    self._used_param_names: Optional[List[str]] = None
    self._train_instances: Optional[List[InstanceWithGrads]] = None
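
# Illustrative sketch of how params_to_freeze is interpreted above (not from the
# original source): each entry is a regex matched with re.match against the names
# returned by model.named_parameters(). The parameter names below are hypothetical
# placeholders for a transformer-based model.
params_to_freeze = [
    r"transformer_model\.embeddings\..*",
    r"transformer_model\.encoder\.layer\.0\..*",
]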
def test_auto_regressive_seq_decoder_init(self):
    decoder_inout_dim = 4
    vocab, decoder_net = create_vocab_and_decoder_net(decoder_inout_dim)
    AutoRegressiveSeqDecoder(
        vocab,
        decoder_net,
        Embedding(num_embeddings=vocab.get_vocab_size(), embedding_dim=decoder_inout_dim),
        beam_search=Lazy(BeamSearch, constructor_extras={"max_steps": 10}),
    )

    with pytest.raises(ConfigurationError):
        AutoRegressiveSeqDecoder(
            vocab,
            decoder_net,
            Embedding(num_embeddings=vocab.get_vocab_size(), embedding_dim=decoder_inout_dim + 1),
            beam_search=Lazy(BeamSearch, constructor_extras={"max_steps": 10}),
        )
def test_auto_regressive_seq_decoder_indices_to_tokens(self):
    decoder_inout_dim = 4
    vocab, decoder_net = create_vocab_and_decoder_net(decoder_inout_dim)
    auto_regressive_seq_decoder = AutoRegressiveSeqDecoder(
        vocab,
        decoder_net,
        Embedding(num_embeddings=vocab.get_vocab_size(), embedding_dim=decoder_inout_dim),
        beam_search=Lazy(BeamSearch, constructor_extras={"max_steps": 10}),
    )
    predictions = torch.tensor([[3, 2, 5, 0, 0], [2, 2, 3, 5, 0]])
    tokens_ground_truth = [["B", "A"], ["A", "A", "B"]]

    predicted_tokens = auto_regressive_seq_decoder.indices_to_tokens(predictions.numpy())
    assert predicted_tokens == tokens_ground_truth
def test_auto_regressive_seq_decoder_tensor_and_token_based_metric(self):
    # Set all seeds to a fixed value (torch, numpy, etc.).
    # This enables deterministic behavior of the `auto_regressive_seq_decoder` below
    # (i.e., parameter initialization and `encoded_state = torch.randn(..)`).
    prepare_environment(Params({}))

    batch_size, time_steps, decoder_inout_dim = 2, 3, 4
    vocab, decoder_net = create_vocab_and_decoder_net(decoder_inout_dim)
    auto_regressive_seq_decoder = AutoRegressiveSeqDecoder(
        vocab,
        decoder_net,
        Embedding(num_embeddings=vocab.get_vocab_size(), embedding_dim=decoder_inout_dim),
        beam_search=Lazy(BeamSearch, constructor_extras={"max_steps": 10, "beam_size": 4}),
        tensor_based_metric=BLEU(),
        token_based_metric=DummyMetric(),
    ).eval()

    encoded_state = torch.randn(batch_size, time_steps, decoder_inout_dim)
    source_mask = torch.ones(batch_size, time_steps).bool()
    target_tokens = {"tokens": {"tokens": torch.ones(batch_size, time_steps).long()}}
    source_mask[0, 1:] = False
    encoder_out = {"source_mask": source_mask, "encoder_outputs": encoded_state}

    auto_regressive_seq_decoder.forward(encoder_out, target_tokens)
    assert auto_regressive_seq_decoder.get_metrics()["BLEU"] == 1.388809517005903e-11
    assert auto_regressive_seq_decoder.get_metrics()["em"] == 0.0
    assert auto_regressive_seq_decoder.get_metrics()["f1"] == 1 / 3
def test_auto_regressive_seq_decoder_forward(self):
    batch_size, time_steps, decoder_inout_dim = 2, 3, 4
    vocab, decoder_net = create_vocab_and_decoder_net(decoder_inout_dim)
    auto_regressive_seq_decoder = AutoRegressiveSeqDecoder(
        vocab,
        decoder_net,
        Embedding(num_embeddings=vocab.get_vocab_size(), embedding_dim=decoder_inout_dim),
        beam_search=Lazy(BeamSearch, constructor_extras={"max_steps": 10, "beam_size": 4}),
    )

    encoded_state = torch.rand(batch_size, time_steps, decoder_inout_dim)
    source_mask = torch.ones(batch_size, time_steps).bool()
    target_tokens = {"tokens": {"tokens": torch.ones(batch_size, time_steps).long()}}
    source_mask[0, 1:] = False
    encoder_out = {"source_mask": source_mask, "encoder_outputs": encoded_state}

    assert auto_regressive_seq_decoder.forward(encoder_out) == {}
    loss = auto_regressive_seq_decoder.forward(encoder_out, target_tokens)["loss"]
    assert loss.shape == torch.Size([]) and loss.requires_grad
    auto_regressive_seq_decoder.eval()
    assert "predictions" in auto_regressive_seq_decoder.forward(encoder_out)
def from_partial_objects(
    cls,
    serialization_dir: str,
    local_rank: int,
    dataset_reader: DatasetReader,
    train_data_path: Any,
    model: Lazy[Model],
    data_loader: Lazy[DataLoader],
    trainer: Lazy[Trainer],
    vocabulary: Lazy[Vocabulary] = Lazy(Vocabulary),
    datasets_for_vocab_creation: List[str] = None,
    validation_dataset_reader: DatasetReader = None,
    validation_data_path: Any = None,
    validation_data_loader: Lazy[DataLoader] = None,
    test_data_path: Any = None,
    evaluate_on_test: bool = False,
    batch_weight_key: str = "",
    ddp_accelerator: Optional[DdpAccelerator] = None,
) -> "TrainModel":
    """
    This method is intended for use with our `FromParams` logic, to construct a `TrainModel`
    object from a config file passed to the `allennlp train` command. The arguments to this
    method are the allowed top-level keys in a configuration file (except for the first three,
    which are obtained separately).

    You *could* use this outside of our `FromParams` logic if you really want to, but there
    might be easier ways to accomplish your goal than instantiating `Lazy` objects. If you are
    writing your own training loop, we recommend that you look at the implementation of this
    method for inspiration and possibly some utility functions you can call, but you very
    likely should not use this method directly.

    The `Lazy` type annotations here are a mechanism for building dependencies to an object
    sequentially - the `TrainModel` object needs data, a model, and a trainer, but the model
    needs to see the data before it's constructed (to create a vocabulary) and the trainer
    needs the data and the model before it's constructed. Objects that have sequential
    dependencies like this are labeled as `Lazy` in their type annotations, and we pass the
    missing dependencies when we call their `construct()` method, which you can see in the
    code below.

    # Parameters

    serialization_dir: `str`
        The directory where logs and model archives will be saved.

        In a typical AllenNLP configuration file, this parameter does not get an entry as a
        top-level key, it gets passed in separately.

    local_rank: `int`
        The process index that is initialized using the GPU device id.

        In a typical AllenNLP configuration file, this parameter does not get an entry as a
        top-level key, it gets passed in separately.

    dataset_reader: `DatasetReader`
        The `DatasetReader` that will be used for training and (by default) for validation.

    train_data_path: `str`
        The file (or directory) that will be passed to `dataset_reader.read()` to construct
        the training data.

    model: `Lazy[Model]`
        The model that we will train. This is lazy because it depends on the `Vocabulary`;
        after constructing the vocabulary we call `model.construct(vocab=vocabulary)`.

    data_loader: `Lazy[DataLoader]`
        The data_loader we use to batch instances from the dataset reader at training and (by
        default) validation time. This is lazy because it takes a dataset in its constructor.

    trainer: `Lazy[Trainer]`
        The `Trainer` that actually implements the training loop. This is a lazy object
        because it depends on the model that's going to be trained.

    vocabulary: `Lazy[Vocabulary]`, optional (default=`Lazy(Vocabulary)`)
        The `Vocabulary` that we will use to convert strings in the data to integer ids (and
        possibly set sizes of embedding matrices in the `Model`). By default we construct the
        vocabulary from the instances that we read.

    datasets_for_vocab_creation: `List[str]`, optional (default=`None`)
        If you pass in more than one dataset but don't want to use all of them to construct a
        vocabulary, you can pass in this key to limit it. Valid entries in the list are
        "train", "validation" and "test".

    validation_dataset_reader: `DatasetReader`, optional (default=`None`)
        If given, we will use this dataset reader for the validation data instead of
        `dataset_reader`.

    validation_data_path: `str`, optional (default=`None`)
        If given, we will use this data for computing validation metrics and early stopping.

    validation_data_loader: `Lazy[DataLoader]`, optional (default=`None`)
        If given, the data_loader we use to batch instances from the dataset reader at
        validation and test time. This is lazy because it takes a dataset in its constructor.

    test_data_path: `str`, optional (default=`None`)
        If given, we will use this as test data. This makes it available for vocab creation
        by default, but nothing else.

    evaluate_on_test: `bool`, optional (default=`False`)
        If `True`, we will evaluate the final model on the test data at the end of training.
        Note that we do not recommend using this for actual test data in every-day
        experimentation; you should only very rarely evaluate your model on actual test data.

    batch_weight_key: `str`, optional (default=`""`)
        The name of the metric used to weight the loss on a per-batch basis. This is only
        used during evaluation on final test data, if you've specified `evaluate_on_test=True`.

    ddp_accelerator : `Optional[DdpAccelerator]`, optional (default = `None`)
        A `DdpAccelerator` to use in distributed training. Passed to the model and the trainer.
    """
    # Train data loader.
    data_loaders: Dict[str, DataLoader] = {
        "train": data_loader.construct(reader=dataset_reader, data_path=train_data_path)
    }

    # Validation data loader.
    if validation_data_path is not None:
        validation_dataset_reader = validation_dataset_reader or dataset_reader
        if validation_data_loader is not None:
            data_loaders["validation"] = validation_data_loader.construct(
                reader=validation_dataset_reader, data_path=validation_data_path
            )
        else:
            data_loaders["validation"] = data_loader.construct(
                reader=validation_dataset_reader, data_path=validation_data_path
            )
            if getattr(data_loaders["validation"], "batches_per_epoch", None) is not None:
                warnings.warn(
                    "Using 'data_loader' params to construct validation data loader since "
                    "'validation_data_loader' params not specified, but you have "
                    "'data_loader.batches_per_epoch' set which may result in different "
                    "validation datasets for each epoch.",
                    UserWarning,
                )

    # Test data loader.
    if test_data_path is not None:
        test_dataset_reader = validation_dataset_reader or dataset_reader
        if validation_data_loader is not None:
            data_loaders["test"] = validation_data_loader.construct(
                reader=test_dataset_reader, data_path=test_data_path
            )
        else:
            data_loaders["test"] = data_loader.construct(
                reader=test_dataset_reader, data_path=test_data_path
            )

    if datasets_for_vocab_creation:
        for key in datasets_for_vocab_creation:
            if key not in data_loaders:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {key}")

        logger.info(
            "From dataset instances, %s will be considered for vocabulary creation.",
            ", ".join(datasets_for_vocab_creation),
        )

    instance_generator = (
        instance
        for key, data_loader in data_loaders.items()
        if datasets_for_vocab_creation is None or key in datasets_for_vocab_creation
        for instance in data_loader.iter_instances()
    )

    vocabulary_ = vocabulary.construct(instances=instance_generator)

    model_ = model.construct(
        vocab=vocabulary_,
        serialization_dir=serialization_dir,
        ddp_accelerator=ddp_accelerator,
    )

    # Initializing the model can have the side effect of expanding the vocabulary.
    # Save the vocab only in the primary. In the degenerate non-distributed case, we're
    # trivially the primary. In the distributed case this is safe to do without worrying
    # about race conditions since saving and loading the vocab involves acquiring a file lock.
    if local_rank == 0:
        vocabulary_path = os.path.join(serialization_dir, "vocabulary")
        vocabulary_.save_to_files(vocabulary_path)

    for data_loader_ in data_loaders.values():
        data_loader_.index_with(model_.vocab)

    trainer_ = trainer.construct(
        serialization_dir=serialization_dir,
        model=model_,
        data_loader=data_loaders["train"],
        validation_data_loader=data_loaders.get("validation"),
        local_rank=local_rank,
        ddp_accelerator=ddp_accelerator,
    )
    assert trainer_ is not None

    return cls(
        serialization_dir=serialization_dir,
        model=model_,
        trainer=trainer_,
        evaluation_data_loader=data_loaders.get("test"),
        evaluate_on_test=evaluate_on_test,
        batch_weight_key=batch_weight_key,
    )
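
# Illustrative sketch of how the parameters above map onto a training config (not from
# the original source). Each top-level key corresponds to a parameter of
# from_partial_objects; serialization_dir and local_rank are passed separately by
# `allennlp train`. The paths and the tiny model below are placeholders modeled on the
# sequence-tagging test fixtures used elsewhere in this file.
from allennlp.common import Params

config = Params(
    {
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": "test_fixtures/data/sequence_tagging.tsv",
        "validation_data_path": "test_fixtures/data/sequence_tagging.tsv",
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 4}}
            },
            "encoder": {"type": "lstm", "input_size": 4, "hidden_size": 4},
        },
        "data_loader": {"batch_size": 2},
        "trainer": {"num_epochs": 1, "optimizer": {"type": "adam"}},
    }
)
# TrainModel.from_params(params=config, serialization_dir=..., local_rank=0) would then
# route each of these keys to the matching parameter of from_partial_objects.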
def test_run_steps_programmatically(step_cache_class):
    from allennlp.data.dataset_readers import SequenceTaggingDatasetReader
    from allennlp.tango.dataset import DatasetReaderAdapterStep
    from allennlp.tango import TrainingStep
    from allennlp.common import Lazy
    from allennlp.training.optimizers import AdamOptimizer
    from allennlp.tango.dataloader import BatchSizeDataLoader
    from allennlp.models import SimpleTagger
    from allennlp.tango import EvaluationStep

    dataset_step = DatasetReaderAdapterStep(
        reader=SequenceTaggingDatasetReader(),
        splits={
            "train": "test_fixtures/data/sequence_tagging.tsv",
            "validation": "test_fixtures/data/sequence_tagging.tsv",
        },
    )
    training_step = TrainingStep(
        model=Lazy(
            SimpleTagger,
            Params(
                {
                    "text_field_embedder": {
                        "token_embedders": {
                            "tokens": {
                                "type": "embedding",
                                "projection_dim": 2,
                                "pretrained_file": "test_fixtures/embeddings/glove.6B.100d.sample.txt.gz",
                                "embedding_dim": 100,
                                "trainable": True,
                            }
                        }
                    },
                    "encoder": {"type": "lstm", "input_size": 2, "hidden_size": 4, "num_layers": 1},
                }
            ),
        ),
        dataset=dataset_step,
        data_loader=Lazy(BatchSizeDataLoader, Params({"batch_size": 2})),
        optimizer=Lazy(AdamOptimizer),
    )
    evaluation_step = EvaluationStep(dataset=dataset_step, model=training_step, step_name="evaluation")

    with TemporaryDirectory(prefix="test_run_steps_programmatically-") as d:
        if step_cache_class == DirectoryStepCache:
            cache = DirectoryStepCache(d)
        else:
            cache = step_cache_class()

        assert "random object" not in cache
        assert dataset_step not in cache
        assert training_step not in cache
        assert evaluation_step not in cache
        assert len(cache) == 0
        with pytest.raises(KeyError):
            _ = cache[evaluation_step]

        assert tango_dry_run(evaluation_step, cache) == [
            (dataset_step, False),
            (training_step, False),
            (evaluation_step, False),
        ]

        training_step.ensure_result(cache)

        assert tango_dry_run(evaluation_step, cache) == [
            (dataset_step, True),
            (training_step, True),
            (evaluation_step, False),
        ]

        assert "random object" not in cache
        assert dataset_step in cache
        assert training_step in cache
        assert evaluation_step not in cache
        assert len(cache) == 2
        with pytest.raises(KeyError):
            _ = cache[evaluation_step]
def _from_config(cls, config: "PretrainedConfig", **kwargs):
    attention_kwargs = {
        "hidden_size": config.d_model,
        "key_value_proj_dim": config.d_kv,
        "num_heads": config.num_heads,
        "relative_attention_num_buckets": config.relative_attention_num_buckets,
        "dropout": config.dropout_rate,
    }
    layer_norm_kwargs = {
        "hidden_size": config.d_model,
        "eps": config.layer_norm_epsilon,
    }
    block_ff = Lazy(
        T5LayerFF,
        params=Params(
            {
                "ff_proj": {
                    "type": config.feed_forward_proj,
                    "hidden_size": config.d_model,
                    "ff_size": config.d_ff,
                    "dropout": config.dropout_rate,
                },
                "layer_norm": layer_norm_kwargs,
                "dropout": config.dropout_rate,
            }
        ),
    )
    return cls(
        encoder=Lazy(
            T5EncoderStack.basic_encoder,
            constructor_extras={
                "num_blocks": config.num_layers,
                "block_self_attention": Lazy(T5Attention, constructor_extras=attention_kwargs),
                "final_layer_norm": T5LayerNorm(**layer_norm_kwargs),
                "block_ff": block_ff,
                "dropout": config.dropout_rate,
            },
        ),
        decoder=Lazy(
            T5DecoderStack.basic_decoder,
            constructor_extras={
                "num_blocks": config.num_decoder_layers,
                "block_self_attention": Lazy(T5Attention, constructor_extras=attention_kwargs),
                "block_cross_attention": Lazy(T5Attention, constructor_extras=attention_kwargs),
                "final_layer_norm": T5LayerNorm(**layer_norm_kwargs),
                "block_ff": block_ff,
                "dropout": config.dropout_rate,
            },
        ),
        decoder_start_token_id=config.decoder_start_token_id,
        pad_token_id=config.pad_token_id,
        eos_token_id=config.eos_token_id,
        vocab_size=config.vocab_size,
        model_dim=config.d_model,
        tie_word_embeddings=kwargs.pop("tie_word_embeddings", config.tie_word_embeddings),
        **kwargs,
    )
def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    data_loader: DataLoader,
    validation_data_loader: DataLoader = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: Union[str, List[str]] = "-loss",
    num_epochs: int = 20,
    cuda_device: Optional[Union[int, torch.device]] = None,
    grad_norm: float = None,
    grad_clipping: float = None,
    distributed: bool = False,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    use_amp: bool = False,
    no_grad: List[str] = None,
    optimizer: Lazy[Optimizer] = Lazy(Optimizer.default),
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    moving_average: Lazy[MovingAverage] = None,
    checkpointer: Lazy[Checkpointer] = Lazy(Checkpointer),
    callbacks: List[Lazy[TrainerCallback]] = None,
    enable_default_callbacks: bool = True,
    run_sanity_checks: bool = True,
) -> "Trainer":
    """
    This method exists so that we can have a documented method to construct this class using
    `FromParams`. If you are not using `FromParams` or config files, you can safely ignore
    this method.

    The reason we can't just use `__init__` with `FromParams` here is because there are
    sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
    annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
    have the parameters from the `Model` before it's constructed, and the `Schedulers` need
    to have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
    doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

    If you're not using `FromParams`, you can just construct these arguments in the right
    order yourself in your code and call the constructor directly.
    """
    if cuda_device is None:
        from torch import cuda

        if cuda.device_count() > 0:
            cuda_device = 0
        else:
            cuda_device = -1

    check_for_gpu(cuda_device)

    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    if no_grad:
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)

    common_util.log_frozen_and_tunable_parameter_names(model)

    batches_per_epoch: Optional[int]
    try:
        batches_per_epoch = len(data_loader)
        batches_per_epoch = math.ceil(batches_per_epoch / num_gradient_accumulation_steps)
    except TypeError:
        batches_per_epoch = None

    moving_average_ = (
        None if moving_average is None else moving_average.construct(parameters=parameters)
    )
    learning_rate_scheduler_ = (
        None
        if learning_rate_scheduler is None
        else learning_rate_scheduler.construct(
            optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
        )
    )
    momentum_scheduler_ = (
        None if momentum_scheduler is None else momentum_scheduler.construct(optimizer=optimizer_)
    )
    checkpointer_ = checkpointer.construct(serialization_dir=serialization_dir)

    callbacks_: List[TrainerCallback] = []
    for callback_ in callbacks or []:
        callbacks_.append(callback_.construct(serialization_dir=serialization_dir))

    return cls(
        model,
        optimizer_,
        data_loader,
        patience=patience,
        validation_metric=validation_metric,
        validation_data_loader=validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
        checkpointer=checkpointer_,
        moving_average=moving_average_,
        callbacks=callbacks_,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        use_amp=use_amp,
        enable_default_callbacks=enable_default_callbacks,
        run_sanity_checks=run_sanity_checks,
    )
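
# Illustrative sketch of the "construct things yourself" route the docstring above
# mentions (not from the original source): build the optimizer from the model's
# parameters first, then pass concrete objects to the trainer directly.
# GradientDescentTrainer is assumed to be the concrete Trainer implementation, and the
# learning rate and epoch count are arbitrary placeholders.
import torch
from allennlp.data import DataLoader
from allennlp.models import Model
from allennlp.training import GradientDescentTrainer


def build_trainer(model: Model, train_loader: DataLoader, serialization_dir: str) -> GradientDescentTrainer:
    # The optimizer needs the model's parameters, so it is created after the model;
    # this is exactly the ordering that Lazy handles for the config-file path.
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(parameters, lr=1e-3)
    return GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_loader,
        num_epochs=5,
        serialization_dir=serialization_dir,
    )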
def __init__(
    self,
    vocab: Vocabulary,
    decoder_net: DecoderNet,
    target_embedder: Embedding,
    target_namespace: str = "tokens",
    beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
    tie_output_embedding: bool = False,
    scheduled_sampling_ratio: float = 0,
    label_smoothing_ratio: Optional[float] = None,
    tensor_based_metric: Metric = None,
    token_based_metric: Metric = None,
    **kwargs
) -> None:
    super().__init__(target_embedder)

    self._vocab = vocab

    # Decodes the sequence of encoded hidden states into a new sequence of hidden states.
    self._decoder_net = decoder_net
    self._target_namespace = target_namespace
    self._label_smoothing_ratio = label_smoothing_ratio

    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # the end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self._vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self._vocab.get_token_index(END_SYMBOL, self._target_namespace)

    # For backwards compatibility, check if beam_size or max_decoding_steps were passed in as
    # kwargs. If so, update the BeamSearch object before constructing it and raise a
    # DeprecationWarning.
    deprecation_warning = (
        "The parameter {} has been deprecated."
        " Provide this parameter as argument to beam_search instead."
    )
    beam_search_extras = {}
    if "beam_size" in kwargs:
        beam_search_extras["beam_size"] = kwargs["beam_size"]
        warnings.warn(deprecation_warning.format("beam_size"), DeprecationWarning)
    if "max_decoding_steps" in kwargs:
        beam_search_extras["max_steps"] = kwargs["max_decoding_steps"]
        warnings.warn(deprecation_warning.format("max_decoding_steps"), DeprecationWarning)
    self._beam_search = beam_search.construct(
        end_index=self._end_index, vocab=self._vocab, **beam_search_extras
    )

    target_vocab_size = self._vocab.get_vocab_size(self._target_namespace)

    if self.target_embedder.get_output_dim() != self._decoder_net.target_embedding_dim:
        raise ConfigurationError(
            "Target Embedder output_dim doesn't match decoder module's input."
        )

    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(
        self._decoder_net.get_output_dim(), target_vocab_size
    )

    if tie_output_embedding:
        if self._output_projection_layer.weight.shape != self.target_embedder.weight.shape:
            raise ConfigurationError(
                "Can't tie embeddings with output linear layer, due to shape mismatch"
            )
        self._output_projection_layer.weight = self.target_embedder.weight

    # These metrics will be updated during training and validation.
    self._tensor_based_metric = tensor_based_metric
    self._token_based_metric = token_based_metric

    self._scheduled_sampling_ratio = scheduled_sampling_ratio