Example #1
def build_data_loaders(
    train_data: List[Instance],
    dev_data: List[Instance],
) -> Tuple[DataLoader, DataLoader]:
    train_loader = SimpleDataLoader(train_data, 8, shuffle=True)
    dev_loader = SimpleDataLoader(dev_data, 8, shuffle=False)
    return train_loader, dev_loader
Example #2
    def __init__(self) -> None:
        self.config: Config = Config().parse_args(known_only=True)

        bert_token_indexers = PretrainedTransformerIndexer(
            model_name=self.config.model_name)
        bert_tokenizer = PretrainedTransformerTokenizer(
            model_name=self.config.model_name)
        reader = TextClassificationJsonReader(
            token_indexers={"tokens": bert_token_indexers},
            tokenizer=bert_tokenizer)

        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances,
                                                  self.config.batch_size,
                                                  shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances,
                                                self.config.batch_size,
                                                shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)

        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()
Example #3
def build_data_loaders(train_data: List[Instance], dev_data: List[Instance],
                       batch_size: int) -> Tuple[DataLoader, DataLoader]:
    """
    Creates data loaders that load data in batches of size batch_size for training and validation.
    Adapted from https://guide.allennlp.org/training-and-prediction
    """
    train_loader = SimpleDataLoader(train_data, batch_size, shuffle=True)
    dev_loader = SimpleDataLoader(dev_data, batch_size, shuffle=False)

    return train_loader, dev_loader
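As a minimal usage sketch of the helper above (the `vocab`, `model`, and `optimizer` names below are placeholders, not part of the original example, and are assumed to have been built beforehand): the returned loaders are indexed with a vocabulary and handed to a trainer.

# Hypothetical follow-up; vocab, model and optimizer are placeholders.
from allennlp.training import GradientDescentTrainer

train_loader, dev_loader = build_data_loaders(train_data, dev_data, batch_size=8)
train_loader.index_with(vocab)
dev_loader.index_with(vocab)
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_loader,
    validation_data_loader=dev_loader,
    num_epochs=5,
    serialization_dir="./output",
)
trainer.train()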
Example #4
    def test_from_params_in_trainer(self):
        # This is more of an integration test, making sure that a bunch of pieces fit together
        # correctly, but it matters most for this learning rate scheduler, so we're testing it here.
        params = Params({
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        })
        # The method called in the logic below only checks the length of this list, not its
        # contents, so this should be safe.
        instances = [1] * 40
        optim = self._get_optimizer()
        trainer = Trainer.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            data_loader=SimpleDataLoader(instances, batch_size=10),
        )
        assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular)

        # This is what we wrote this test for: to be sure that num_epochs is passed correctly, and
        # that num_steps_per_epoch is computed and passed correctly.  This logic happens inside of
        # `Trainer.from_partial_objects`.
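        # With 40 fake instances and a batch size of 10, num_steps_per_epoch should be 40 / 10 = 4.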
        assert trainer._learning_rate_scheduler.num_epochs == 5
        assert trainer._learning_rate_scheduler.num_steps_per_epoch == 4

        # And we'll do one more to make sure that we can override num_epochs in the scheduler if we
        # really want to.  Not sure why you would ever want to in this case; this is just testing
        # the functionality.
        params = Params({
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "num_epochs": 3,
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        })
        trainer = Trainer.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            data_loader=SimpleDataLoader(instances, batch_size=10),
        )
        assert trainer._learning_rate_scheduler.num_epochs == 3
Example #5
    def __init__(
        self,
        model: Model,
        train_data_path: DatasetReaderInput,
        train_dataset_reader: DatasetReader,
        *,
        test_dataset_reader: Optional[DatasetReader] = None,
        train_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        test_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        params_to_freeze: List[str] = None,
        cuda_device: int = -1,
        lissa_batch_size: int = 8,
        damping: float = 3e-3,
        num_samples: int = 1,
        recursion_depth: Union[float, int] = 0.25,
        scale: float = 1e4,
    ) -> None:
        super().__init__(
            model=model,
            train_data_path=train_data_path,
            train_dataset_reader=train_dataset_reader,
            test_dataset_reader=test_dataset_reader,
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            params_to_freeze=params_to_freeze,
            cuda_device=cuda_device,
        )

        self._lissa_dataloader = SimpleDataLoader(
            list(self._train_loader.iter_instances()),
            lissa_batch_size,
            shuffle=True,
            vocab=self.vocab,
        )
        self._lissa_dataloader.set_target_device(self.device)
        if isinstance(recursion_depth, float) and recursion_depth > 0.0:
            self._lissa_dataloader.batches_per_epoch = int(
                len(self._lissa_dataloader) * recursion_depth)
        elif isinstance(recursion_depth, int) and recursion_depth > 0:
            self._lissa_dataloader.batches_per_epoch = recursion_depth
        else:
            raise ValueError(
                "'recursion_depth' should be a positive int or float")

        self._damping = damping
        self._num_samples = num_samples
        self._recursion_depth = recursion_depth
        self._scale = scale
Example #6
    def test_can_optimise_model_with_dense_and_sparse_params(self):
        optimizer_params = Params({"type": "dense_sparse_adam"})
        parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
        optimizer = Optimizer.from_params(model_parameters=parameters, params=optimizer_params)
        for instance in self.instances:
            instance.index_fields(self.vocab)
        GradientDescentTrainer(self.model, optimizer, SimpleDataLoader(self.instances, 2)).train()
Example #7
    def test_console_log_callback(self):
        total_instances = 1000
        batch_size = 25

        reader = FakeDatasetReader(total_instances, batch_size)
        data_loader = SimpleDataLoader.from_dataset_reader(
            reader, "fake_path", batch_size=batch_size)
        instances = list(data_loader.iter_instances())
        vocab = Vocabulary.from_instances(instances)
        data_loader.index_with(vocab)
        model = FakeModel(vocab)
        optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)

        trainer = GradientDescentTrainer(
            model,
            optimizer,
            data_loader,
            num_epochs=3,
            serialization_dir=self.TEST_DIR,
            callbacks=[
                ConsoleLoggerCallback.from_params(
                    Params({"should_log_inputs": True}),
                    serialization_dir=self.TEST_DIR,
                )
            ],
        )
        trainer.train()
Example #8
    def test_trainer_can_log_batch_inputs(self):
        total_instances = 1000
        batch_size = 25

        reader = FakeDatasetReader(total_instances, batch_size)
        data_loader = SimpleDataLoader.from_dataset_reader(
            reader, "fake_path", batch_size=batch_size)
        instances = list(data_loader.iter_instances())
        vocab = Vocabulary.from_instances(instances)
        data_loader.index_with(vocab)
        model = FakeModel(vocab)
        optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)

        trainer = GradientDescentTrainer(
            model,
            optimizer,
            data_loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            callbacks=[
                TensorBoardCallback(
                    serialization_dir=self.TEST_DIR,
                    distribution_interval=2,
                )
            ],
        )
        trainer.train()
Example #9
def test_get_inverse_hvp_lissa():
    vs = [torch.tensor([1.0, 1.0])]
    # create a fake model
    vocab = Vocabulary()
    params = torch.tensor([1, 2]).float()
    model = DummyBilinearModelForTestingIF(vocab, params)
    used_params = list(model.parameters())

    # create a fake instance: just a matrix
    A = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    fake_instance = Instance({"tensors": TensorField(A)})

    # wrap fake instance into dataloader
    lissa_data_loader = SimpleDataLoader([fake_instance],
                                         batch_size=1,
                                         batches_per_epoch=1)

    inverse_hvp = get_inverse_hvp_lissa(
        vs=vs,
        model=model,
        used_params=used_params,
        lissa_data_loader=lissa_data_loader,
        damping=0.0,
        num_samples=1,
        scale=1.0,
    )
    # Increasing the recursion depth to better approximate the inverse Hessian-vector
    # product didn't help here, likely because this toy example has so few data points.
    ans = torch.tensor([-1.5, -4.5])
    assert torch.equal(inverse_hvp, ans)
Example #10
    def test_batch_of_entirely_empty_lists_works(self):
        instances = [self.empty_instance, self.empty_instance]
        model = DummyModel(self.vocab)
        model.eval()
        loader = SimpleDataLoader(instances, 2, vocab=self.vocab)
        batch = next(iter(loader))
        model.forward(**batch)
Example #11
def build_data_loaders(
        config, train_data: List[Instance], dev_data: List[Instance],
        test_data: List[Instance]
) -> Tuple[DataLoader, DataLoader, DataLoader]:

    train_loader = SimpleDataLoader(train_data,
                                    config.batch_size_for_train,
                                    shuffle=True)
    dev_loader = SimpleDataLoader(dev_data,
                                  config.batch_size_for_eval,
                                  shuffle=False)
    test_loader = SimpleDataLoader(test_data,
                                   config.batch_size_for_eval,
                                   shuffle=False)

    return train_loader, dev_loader, test_loader
Example #12
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {"character_ids": indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()
        # Now finally we can iterate through batches.
        loader = SimpleDataLoader(instances, 3)
        loader.index_with(vocab)
        for i, batch in enumerate(loader):
            lm_embeddings = elmo_bilm(
                batch["elmo"]["character_ids"]["elmo_tokens"])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            assert lengths.tolist() == expected_lengths

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                assert numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6,
                )
Example #13
    def test_trainer_saves_models_at_specified_interval(self):
        data_loader = SimpleDataLoader(self.instances, 4)

        trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            data_loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            checkpointer=Checkpointer(
                serialization_dir=self.TEST_DIR,
                model_save_interval=0.0001,
                num_serialized_models_to_keep=10,
            ),
        )

        trainer.train()

        # Now check the serialized files for models saved during the epoch.
        prefix = "model_state_epoch_*"
        file_names = sorted(glob.glob(os.path.join(self.TEST_DIR, prefix)))
        epochs = [
            re.search(r"_([0-9\.\-]+)\.th", fname).group(1)
            for fname in file_names
        ]
        # We should have checkpoints at the end of each epoch and during each, e.g.
        # [0.timestamp, 0, 1.timestamp, 1]
        assert len(epochs) == 4
        assert epochs[3] == "1"
        assert "." in epochs[0]

        # Now make certain we can restore from timestamped checkpoint.
        # To do so, remove the checkpoint from the end of epoch 1&2, so
        # that we are forced to restore from the timestamped checkpoints.
        for k in range(2):
            os.remove(
                os.path.join(self.TEST_DIR,
                             "model_state_epoch_{}.th".format(k)))
            os.remove(
                os.path.join(self.TEST_DIR,
                             "training_state_epoch_{}.th".format(k)))
        os.remove(os.path.join(self.TEST_DIR, "best.th"))

        restore_trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            self.data_loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                      model_save_interval=0.0001),
        )
        epoch = restore_trainer._restore_checkpoint()
        assert epoch == 2
        # One batch per epoch.
        assert restore_trainer._batch_num_total == 2
Example #14
def benchmark_xlmr_mdl():

    from allennlp.data import DataLoader
    from allennlp.training.util import evaluate

    xlmr = load_xlmr_coref_model()

    instances = xlmr.dataset_reader.load_dataset(testset)
    data_loader = SimpleDataLoader(instances, 1)
    data_loader.index_with(xlmr.model.vocab)

    start = time.time()

    metrics = evaluate(xlmr.model, data_loader)

    print('**XLM-R model**')
    print_speed_performance(start, num_sentences, num_tokens)
    print('Precision : ', metrics['coref_precision'])
    print('Recall : ', metrics['coref_recall'])
    print('F1 : ', metrics['coref_f1'])
    print('Mention Recall : ', metrics['mention_recall'])
Example #15
        class SlowDataLoader:
            data_loader = SimpleDataLoader(self.instances, batch_size=2)

            def __iter__(self):
                time.sleep(2.5)
                return iter(self.data_loader)

            def __len__(self):
                return len(self.data_loader)

            def set_target_device(self, _):
                pass
Example #16
    def test_sanity_check_callback(self):
        model_with_bias = FakeModelForTestingNormalizationBiasVerification(
            use_bias=True)
        inst = Instance({"x": TensorField(torch.rand(3, 1, 4))})
        data_loader = SimpleDataLoader([inst, inst], 2)
        trainer = GradientDescentTrainer(
            model_with_bias,
            self.optimizer,
            data_loader,
            num_epochs=1,
            serialization_dir=self.TEST_DIR,
            callbacks=[SanityChecksCallback(serialization_dir=self.TEST_DIR)],
        )
        with pytest.raises(SanityCheckError):
            trainer.train()
Example #17
    def test_trainer_can_log_learning_rates_tensorboard(self):
        data_loader = SimpleDataLoader(self.instances, 4)
        trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            data_loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
            callbacks=[
                TensorBoardCallback(
                    serialization_dir=self.TEST_DIR,
                    summary_interval=2,
                    should_log_learning_rate=True,
                )
            ],
        )

        trainer.train()
Example #18
    def test_regularization(self):
        penalty = self.model.get_regularization_penalty()
        assert penalty is None

        data_loader = SimpleDataLoader(self.instances, batch_size=32)
        trainer = GradientDescentTrainer(self.model, None,
                                         data_loader)  # optimizer,

        # You get a RuntimeError if you call `model.forward` twice on the same inputs.
        # The data and config are such that the whole dataset is one batch.
        training_batch = next(iter(data_loader))
        validation_batch = next(iter(data_loader))

        training_loss = trainer.batch_outputs(
            training_batch, for_training=True)["loss"].item()
        validation_loss = trainer.batch_outputs(
            validation_batch, for_training=False)["loss"].item()

        # Since the regularization penalty is None, training loss and validation loss should be equal.
        numpy.testing.assert_almost_equal(training_loss, validation_loss)
Example #19
    def test_trainer_respects_epoch_size_smaller_than_total(self):
        batches_per_epoch = 1
        num_epochs = 2
        data_loader_smaller_epoch = SimpleDataLoader(
            self.instances,
            2,
            batches_per_epoch=batches_per_epoch,
        )
        trainer = GradientDescentTrainer(
            self.model,
            self.optimizer,
            data_loader_smaller_epoch,
            validation_data_loader=self.validation_data_loader,
            num_epochs=num_epochs,
            serialization_dir=self.TEST_DIR,
        )
        assert trainer._batch_num_total == 0
        metrics = trainer.train()
        epoch = metrics["epoch"]
        assert epoch == num_epochs - 1
        assert trainer._batch_num_total == num_epochs * batches_per_epoch
Example #20
    def test_sanity_check_default(self):
        model_with_bias = FakeModelForTestingNormalizationBiasVerification(use_bias=True)
        inst = Instance({"x": TensorField(torch.rand(3, 1, 4))})
        data_loader = SimpleDataLoader([inst, inst], 2)
        trainer = GradientDescentTrainer.from_partial_objects(
            model_with_bias,
            serialization_dir=self.TEST_DIR,
            data_loader=data_loader,
            num_epochs=1,
        )
        with pytest.raises(SanityCheckError):
            trainer.train()

        trainer = GradientDescentTrainer.from_partial_objects(
            model_with_bias,
            serialization_dir=self.TEST_DIR,
            data_loader=data_loader,
            num_epochs=1,
            run_sanity_checks=False,
        )

        # Check is not run, so no failure.
        trainer.train()
Example #21
class SimpleInfluence(InfluenceInterpreter):
    """
    Registered as an `InfluenceInterpreter` with name "simple-influence".

    This goes through every example in the train set to calculate the influence score. It uses
    [LiSSA (Linear time Stochastic Second-Order Algorithm)](https://api.semanticscholar.org/CorpusID:10569090)
    to approximate the inverse of the Hessian used for the influence score calculation.

    # Parameters

    lissa_batch_size : `int`, optional (default = `8`)
        The batch size to use for LiSSA.
        According to [Koh, P.W., & Liang, P. (2017)](https://api.semanticscholar.org/CorpusID:13193974),
        using batched samples for the approximation gives better stability.

    damping : `float`, optional (default = `3e-3`)
        This is a hyperparameter for LiSSA.
        A damping term added in case the approximated Hessian (during LiSSA) has
        negative eigenvalues.

    num_samples : `int`, optional (default = `1`)
        This is a hyperparameter for LiSSA that determines how many rounds of the
        recursion process we would like to run for the approximation.

    recursion_depth : `Union[float, int]`, optional (default = `0.25`)
        This is a hyperparameter for LiSSA that
        determines the recursion depth we would like to go through.
        If a `float`, it means X% of the training examples.
        If an `int`, it means recurse X times.

    scale : `float`, optional, (default = `1e4`)
        This is a hyperparameter for LiSSA to tune such that the Taylor expansion converges.
        It is applied to scale down the loss during LiSSA to ensure that `H <= I`,
        where `H` is the Hessian and `I` is the identity matrix.

        See footnote 2 of [Koh, P.W., & Liang, P. (2017)](https://api.semanticscholar.org/CorpusID:13193974).

    !!! Note
        We choose the same default values for the LiSSA hyperparameters as
        [Han, Xiaochuang et al. (2020)](https://api.semanticscholar.org/CorpusID:218628619).
    """
    def __init__(
        self,
        model: Model,
        train_data_path: DatasetReaderInput,
        train_dataset_reader: DatasetReader,
        *,
        test_dataset_reader: Optional[DatasetReader] = None,
        train_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        test_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        params_to_freeze: List[str] = None,
        cuda_device: int = -1,
        lissa_batch_size: int = 8,
        damping: float = 3e-3,
        num_samples: int = 1,
        recursion_depth: Union[float, int] = 0.25,
        scale: float = 1e4,
    ) -> None:
        super().__init__(
            model=model,
            train_data_path=train_data_path,
            train_dataset_reader=train_dataset_reader,
            test_dataset_reader=test_dataset_reader,
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            params_to_freeze=params_to_freeze,
            cuda_device=cuda_device,
        )

        self._lissa_dataloader = SimpleDataLoader(
            list(self._train_loader.iter_instances()),
            lissa_batch_size,
            shuffle=True,
            vocab=self.vocab,
        )
        self._lissa_dataloader.set_target_device(self.device)
        if isinstance(recursion_depth, float) and recursion_depth > 0.0:
            self._lissa_dataloader.batches_per_epoch = int(
                len(self._lissa_dataloader) * recursion_depth)
        elif isinstance(recursion_depth, int) and recursion_depth > 0:
            self._lissa_dataloader.batches_per_epoch = recursion_depth
        else:
            raise ValueError(
                "'recursion_depth' should be a positive int or float")

        self._damping = damping
        self._num_samples = num_samples
        self._recursion_depth = recursion_depth
        self._scale = scale

    @overrides
    def _calculate_influence_scores(
            self, test_instance: Instance, test_loss: float,
            test_grads: Sequence[torch.Tensor]) -> List[float]:
        # Approximate the inverse of Hessian-Vector Product through LiSSA
        inv_hvp = get_inverse_hvp_lissa(
            test_grads,
            self.model,
            self.used_params,
            self._lissa_dataloader,
            self._damping,
            self._num_samples,
            self._scale,
        )
        return [
            # dL_test * d theta as in 2.2 of [https://arxiv.org/pdf/2005.06676.pdf]
            # TODO (epwalsh): should we divide `x.grads` by `self._scale`?
            torch.dot(inv_hvp, _flatten_tensors(x.grads)).item()
            for x in Tqdm.tqdm(self.train_instances,
                               desc="scoring train instances")
        ]
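A small worked example of the `recursion_depth` mapping described in the docstring above, written as a standalone sketch (the helper name and the numbers are illustrative, not from the original class; it simply mirrors the constructor logic):

# Hypothetical helper mirroring how batches_per_epoch is derived above.
def lissa_batches_per_epoch(num_train_batches: int, recursion_depth):
    # A positive float is a fraction of the training batches;
    # a positive int is used directly as the number of LiSSA steps.
    if isinstance(recursion_depth, float) and recursion_depth > 0.0:
        return int(num_train_batches * recursion_depth)
    elif isinstance(recursion_depth, int) and recursion_depth > 0:
        return recursion_depth
    raise ValueError("'recursion_depth' should be a positive int or float")

# E.g. 1000 training instances with lissa_batch_size=8 -> 125 LiSSA batches.
assert lissa_batches_per_epoch(125, 0.25) == 31  # float: 25% of the batches
assert lissa_batches_per_epoch(125, 50) == 50    # int: exactly 50 steps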
Example #22
    train_data, dev_data = read_data(dataset_reader)

    vocab = build_vocab(train_data + dev_data)
    model = build_model(vocab)

    train_loader, dev_loader = build_data_loaders(train_data, dev_data)
    train_loader.index_with(vocab)
    dev_loader.index_with(vocab)

    # You obviously won't want to create a temporary directory for your training
    # results, but for execution in binder for this guide, we need to do this.
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = build_trainer(model, serialization_dir, train_loader, dev_loader)
        trainer.train()

    return model, dataset_reader


# We've copied the training loop from an earlier example, with updated model
# code, above in the Setup section. We run the training loop to get a trained
# model.
model, dataset_reader = run_training_loop()

# Now we can evaluate the model on a new dataset.
test_data = list(dataset_reader.read("quick_start/data/movie_review/test.tsv"))
data_loader = SimpleDataLoader(test_data, 8)
data_loader.index_with(model.vocab)

results = evaluate(model, data_loader)
print(results)
Example #23
class TaggerTrainer:
    def __init__(self) -> None:
        self.config: Config = Config().parse_args(known_only=True)
        
        bert_token_indexers = PretrainedTransformerIndexer(model_name=self.config.model_name)
        reader = SequenceTaggingDatasetReader(token_indexers={"tokens": bert_token_indexers})

        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances, self.config.batch_size, shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances, self.config.batch_size, shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)
        
        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()
    
    def init_crf_model(self) -> Model:
        """init crf tagger model
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        tagger = SimpleTagger(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={
                    'tokens': bert_text_field_embedder
                }
            ),
            encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
            verbose_metrics=True,
            calculate_span_f1=True,
            label_encoding="BMES",
        )
        
        tagger.to(device=self.config.device)
        return tagger
    
    def init_model(self) -> Model:
        """build the model

        Args:
            vocab (Vocabulary): the vocabulary of corpus

        Returns:
            Model: the final models
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        tagger = SimpleTagger(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={
                    'tokens': bert_text_field_embedder
                }
            ),
            encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
            verbose_metrics=True,
            calculate_span_f1=True,
            label_encoding="BMES",
        )
        
        tagger.to(device=self.config.device)
        return tagger
    
    def init_trainer(self) -> Trainer:
        parameters = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
        optimizer = AdamOptimizer(parameters, lr=self.config.lr)  # type: ignore
        trainer = GradientDescentTrainer(
            model=self.model,
            serialization_dir='./output',
            data_loader=self.train_data_loader,
            validation_data_loader=self.dev_data_loader,
            num_epochs=self.config.epoch,
            optimizer=optimizer,
            cuda_device=self.config.device,
        )
        return trainer
    
    def train(self):
        self.trainer.train()
Example #24
                               vocab=vocab)
    tgt_char_encoder = TokenCharactersEncoder(embedding=tgt_char_embedding,
                                              encoder=GruSeq2VecEncoder(input_size=args.emb_dim,
                                                                        hidden_size=args.hid_dim))
    src_embedders = BasicTextFieldEmbedder({
        "tokens": src_embedding,
        "character_tokens": src_char_encoder
        })
    tgt_embedders = BasicTextFieldEmbedder({
        "tokens": tgt_embedding,
        "character_tokens": tgt_char_encoder
        })
    
    train_loader = SimpleDataLoader.from_dataset_reader(
                                                      reader=dataset_reader, 
                                                      data_path=args.train_file,
                                                      batch_size=args.bs,
                                                      shuffle=True)
    train_loader.index_with(vocab)
    val_loader = SimpleDataLoader.from_dataset_reader(reader=dataset_reader,
                                                      data_path=args.valid_file,
                                                      batch_size=args.bs)
    val_loader.index_with(vocab)
    model = create_seq2seqmodel(vocab, src_embedders=src_embedders, tgt_embedders=tgt_embedders, hidden_dim=args.hid_dim,
                                max_decoding_steps=args.maxlen, device=device)
    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"The model has {count_parameters(model)} parameters.")

    save_dir = None
    if args.save:
Example #25
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(
        os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    evaluation_data_path = data if data else config["validation_data_path"]

    archive = load_archive(os.path.join(serialization_directory,
                                        "model.tar.gz"),
                           cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory,
                                        prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory,
                                  prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    dataset = list(dataset_reader.read(evaluation_data_path))

    with torch.autograd.no_grad():
        loader = SimpleDataLoader(dataset, 32)
        model_predictions: List[List[str]] = []
        for batch in Tqdm.tqdm(loader):
            batch = move_to_device(batch, device)
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(dataset, model_predictions):
            fields = instance.fields
            verb_index = fields["metadata"]["verb_index"]
            gold_tags = fields["metadata"]["gold_tags"]
            sentence = fields["metadata"]["words"]
            write_to_conll_eval_file(prediction_file, gold_file, verb_index,
                                     sentence, prediction, gold_tags)
        prediction_file.close()
        gold_file.close()
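A hedged example of calling `main` above (the paths, device, and prefix below are placeholder values, not taken from the original script):

# Hypothetical invocation; replace the paths with a real serialization
# directory and evaluation file.
main(
    serialization_directory="/path/to/srl_model_dir",
    device=-1,            # CPU; pass a GPU id such as 0 to evaluate on GPU
    data="/path/to/evaluation_data",
    prefix="dev",
)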
Example #26
class TaggerTrainer:
    def __init__(self) -> None:
        self.config: Config = Config().parse_args(known_only=True)

        bert_token_indexers = PretrainedTransformerIndexer(
            model_name=self.config.model_name)
        bert_tokenizer = PretrainedTransformerTokenizer(
            model_name=self.config.model_name)
        reader = TextClassificationJsonReader(
            token_indexers={"tokens": bert_token_indexers},
            tokenizer=bert_tokenizer)

        train_instances = list(reader.read(self.config.train_file))
        dev_instances = list(reader.read(self.config.dev_file))
        test_instances = list(reader.read(self.config.test_file))

        self.vocab: Vocabulary = Vocabulary.from_instances(train_instances)

        # 2. init the data loader
        self.train_data_loader = SimpleDataLoader(train_instances,
                                                  self.config.batch_size,
                                                  shuffle=True)
        self.dev_data_loader = SimpleDataLoader(dev_instances,
                                                self.config.batch_size,
                                                shuffle=False)
        self.train_data_loader.index_with(self.vocab)
        self.dev_data_loader.index_with(self.vocab)

        # 3. init the model
        self.model = self.init_model()
        self.trainer = self.init_trainer()

    def init_model(self) -> Model:
        """build the model

        Args:
            vocab (Vocabulary): the vocabulary of corpus

        Returns:
            Model: the final models
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(
            model_name=self.config.model_name)
        tagger = BasicClassifier(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={'tokens': bert_text_field_embedder}),
            seq2vec_encoder=ClsPooler(
                embedding_dim=bert_text_field_embedder.get_output_dim()),
        )
        tagger.to(device=self.config.device)
        return tagger

    def init_trainer(self) -> Trainer:
        parameters = [(n, p) for n, p in self.model.named_parameters()
                      if p.requires_grad]

        group_parameter_group = [
            (['_text_field_embedder.*'], {'lr': self.config.lr}),
            (['_classification_layer.*'], {'lr': self.config.classifier_lr}),
        ]

        optimizer = AdamOptimizer(parameters,
                                  parameter_groups=group_parameter_group,
                                  lr=self.config.lr)  # type: ignore

        trainer = GradientDescentTrainer(
            model=self.model,
            serialization_dir='./output',
            data_loader=self.train_data_loader,
            validation_data_loader=self.dev_data_loader,
            num_epochs=self.config.epoch,
            optimizer=optimizer,
            cuda_device=self.config.device,
        )
        return trainer

    def train(self):
        self.trainer.train()
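A minimal entry point sketch for the class above, assuming the `Config` object supplies the fields referenced in the snippet (`model_name`, `train_file`, `dev_file`, `test_file`, `batch_size`, `lr`, `classifier_lr`, `epoch`, `device`):

# Hypothetical entry point; TaggerTrainer reads all of its settings from Config.
if __name__ == "__main__":
    TaggerTrainer().train()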