Example #1
def create_dataloader(
    dataset: InstancesDataset,
    batch_size: int,
    data_bucketing: bool = False,
    batches_per_epoch: Optional[int] = None,
) -> PyTorchDataLoader:
    """Returns a pytorch DataLoader for AllenNLP

    Parameters
    ----------
    dataset
        The data set for the DataLoader
    batch_size
        Size of the batch.
    data_bucketing
        If enabled, try to apply data bucketing over training batches.
    batches_per_epoch
        Determines the number of batches after which an epoch ends.
        If the number is smaller than the total amount of batches in your data,
        the second "epoch" will take off where the first "epoch" ended.
        If this is `None`, then an epoch is set to be one full pass through your data.

    Returns
    -------
    data_loader
    """
    if data_bucketing and not isinstance(dataset, IterableDataset):
        return PyTorchDataLoader(
            dataset,
            batch_sampler=BucketBatchSampler(data_source=dataset, batch_size=batch_size),
            batches_per_epoch=batches_per_epoch,
        )
    return PyTorchDataLoader(
        dataset, batch_size=batch_size, batches_per_epoch=batches_per_epoch
    )
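A minimal usage sketch (not part of the original snippet), assuming train_data is an InstancesDataset that has already been indexed with a vocabulary; the variable names and sizes are illustrative only:

train_loader = create_dataloader(train_data, batch_size=16, data_bucketing=True)
# With batches_per_epoch set, an "epoch" covers only that many batches, and the
# next epoch picks up where the previous one stopped.
short_epoch_loader = create_dataloader(train_data, batch_size=16, batches_per_epoch=100)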
Example #2
    def test_from_params_in_trainer(self):
        # This is more of an integration test, making sure that a bunch of pieces fit together
        # correctly, but it matters most for this learning rate scheduler, so we're testing it here.
        params = Params(
            {
                "num_epochs": 5,
                "learning_rate_scheduler": {
                    "type": "slanted_triangular",
                    "gradual_unfreezing": True,
                    "discriminative_fine_tuning": True,
                    "decay_factor": 0.5,
                },
            }
        )
        # The method called in the logic below only checks the length of this list, not its
        # contents, so this should be safe.
        instances = AllennlpDataset([1] * 40)
        optim = self._get_optimizer()
        trainer = Trainer.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            data_loader=PyTorchDataLoader(instances, batch_size=10),
        )
        assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular)

        # This is what we wrote this test for: to be sure that num_epochs is passed correctly, and
        # that num_steps_per_epoch is computed and passed correctly.  This logic happens inside of
        # `Trainer.from_partial_objects`.
        assert trainer._learning_rate_scheduler.num_epochs == 5
        assert trainer._learning_rate_scheduler.num_steps_per_epoch == 4

        # And we'll do one more to make sure that we can override num_epochs in the scheduler if we
        # really want to.  Not sure why you would ever want to in this case; this is just testing
        # the functionality.
        params = Params(
            {
                "num_epochs": 5,
                "learning_rate_scheduler": {
                    "type": "slanted_triangular",
                    "num_epochs": 3,
                    "gradual_unfreezing": True,
                    "discriminative_fine_tuning": True,
                    "decay_factor": 0.5,
                },
            }
        )
        trainer = Trainer.from_params(
            model=self.model,
            optimizer=Lazy(lambda **kwargs: optim),
            serialization_dir=self.TEST_DIR,
            params=params,
            data_loader=PyTorchDataLoader(instances, batch_size=10),
        )
        assert trainer._learning_rate_scheduler.num_epochs == 3
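Why the test expects num_steps_per_epoch == 4, as a sketch of the arithmetic only (not of the library internals): 40 instances batched with batch_size=10 yield 4 batches per epoch, and Trainer.from_partial_objects hands that count to the scheduler.

import math
num_instances, batch_size = 40, 10
assert math.ceil(num_instances / batch_size) == 4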
Example #3
def build_data_loaders(
    train_data: torch.utils.data.Dataset, dev_data: torch.utils.data.Dataset
) -> Tuple[allennlp.data.PyTorchDataLoader, allennlp.data.PyTorchDataLoader]:
    # NOTE: batch_size and num_virtual_models are assumed to be defined at
    # module level in the original script.
    train_loader = PyTorchDataLoader(train_data,
                                     batch_size=batch_size,
                                     shuffle=True)
    dev_loader = PyTorchDataLoader(dev_data,
                                   batch_size=num_virtual_models,
                                   shuffle=False)
    return train_loader, dev_loader
Example #4
def build_data_loaders(
        train_data: torch.utils.data.Dataset,
        dev_data: torch.utils.data.Dataset
) -> Tuple[allennlp.data.DataLoader, allennlp.data.DataLoader]:
    # Note that DataLoader is imported from allennlp above, *not* torch.
    # We need to get the allennlp-specific collate function, which is
    # what actually does indexing and batching.
    batch_size = 8
    train_loader = PyTorchDataLoader(train_data, batch_size=batch_size, shuffle=True)
    dev_loader = PyTorchDataLoader(dev_data, batch_size=batch_size, shuffle=False)
    return train_loader, dev_loader
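A sketch of how these loaders are typically wired up; the dataset reader and file paths below are assumptions, not part of the original snippet:

reader = ClassificationTsvReader()  # hypothetical DatasetReader
train_data = reader.read("train.tsv")
dev_data = reader.read("dev.tsv")

vocab = Vocabulary.from_instances(train_data + dev_data)
train_data.index_with(vocab)  # instances must be indexed before batching
dev_data.index_with(vocab)

train_loader, dev_loader = build_data_loaders(train_data, dev_data)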
Example #5
    def test_can_optimise_model_with_dense_and_sparse_params(self):
        optimizer_params = Params({"type": "dense_sparse_adam"})
        parameters = [[n, p] for n, p in self.model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(model_parameters=parameters,
                                          params=optimizer_params)
        self.instances.index_with(self.vocab)
        GradientDescentTrainer(self.model, optimizer,
                               PyTorchDataLoader(self.instances, 2)).train()
Example #6
    def test_regularization(self):
        penalty = self.model.get_regularization_penalty()
        assert penalty is None

        data_loader = PyTorchDataLoader(self.instances, batch_size=32)
        trainer = GradientDescentTrainer(self.model, None, data_loader)  # no optimizer is needed here

        # You get a RuntimeError if you call `model.forward` twice on the same inputs.
        # The data and config are such that the whole dataset is one batch.
        training_batch = next(iter(data_loader))
        validation_batch = next(iter(data_loader))

        training_loss = trainer.batch_outputs(training_batch, for_training=True)["loss"].item()
        validation_loss = trainer.batch_outputs(validation_batch, for_training=False)["loss"].item()

        # Since the model has no regularization penalty, training loss and validation loss should match.
        numpy.testing.assert_almost_equal(training_loss, validation_loss)
Example #7
    def test_evaluation(self) -> Dict[str, Any]:
        """
        Evaluates the model against the test dataset (if defined)

        Returns
        -------
        Test metrics information

        """
        test_data = self._test
        if not test_data:
            return {}

        self.__LOGGER.info(
            "The model will be evaluated using the best epoch weights.")
        return evaluate(
            self._pipeline._model,
            data_loader=PyTorchDataLoader(
                test_data, batch_size=self._trainer_config.batch_size),
            cuda_device=self._trainer.cuda_device,
            batch_weight_key=self._batch_weight_key,
        )
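A standalone sketch of the same evaluate() call outside the pipeline class; model and test_data are assumed to already exist and to be indexed with the model's vocabulary:

from allennlp.training.util import evaluate

metrics = evaluate(
    model,
    data_loader=PyTorchDataLoader(test_data, batch_size=16),
    cuda_device=-1,
)
print(metrics)  # a dict of the model's metrics plus the averaged loss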
Example #8
    torch.backends.cudnn.deterministic = True

    # Load the training data

    dataset_reader = IntentSlotDatasetReader()

    train_data = dataset_reader.read('data/training')
    valid_data = dataset_reader.read('data/validation')

    vocab = Vocabulary.from_instances(train_data+valid_data)
    vocab.save_to_files('vocab')

    train_data.index_with(vocab)
    valid_data.index_with(vocab)
    
    train_loader = PyTorchDataLoader(train_data, batch_size=8, shuffle=True)
    valid_loader = PyTorchDataLoader(valid_data, batch_size=8, shuffle=False)

    # Build the model

    embedder = BasicTextFieldEmbedder(
        {'tokens': Embedding(
            embedding_dim=10,
            num_embeddings=vocab.get_vocab_size('tokens'))})

    encoder = LstmSeq2VecEncoder(10, 32, bidirectional=True)
    # encoder = BagOfEmbeddingsEncoder(embedding_dim=10)

    model = IntentEstimator(vocab, embedder, encoder)
    model.cuda()
Example #9
train_dataset.index_with(vocab)
validation_dataset.index_with(vocab)

# Create the word embeddings
embedding = Embedding(num_embeddings=vocab.get_vocab_size(), embedding_dim=100)
# Create the text feature vectors
text_embedder = BasicTextFieldEmbedder({"tokens": embedding})
encoder = BagOfEmbeddingsEncoder(embedding_dim=100)

# Build the document classifier
model = BasicClassifier(vocab=vocab,
                        text_field_embedder=text_embedder,
                        seq2vec_encoder=encoder)

# Data loaders
train_loader = PyTorchDataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = PyTorchDataLoader(validation_dataset,
                                      batch_size=32,
                                      shuffle=False)

# Copy the model to the GPU
# model = model.cuda()

# Create the optimizer
optimizer = AdamOptimizer(model.named_parameters())

# Create the trainer
trainer = GradientDescentTrainer(model=model,
                                 optimizer=optimizer,
                                 data_loader=train_loader,
                                 validation_data_loader=validation_loader)
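Training would then be started on the configured trainer; a minimal sketch:

metrics = trainer.train()
print(metrics)  # typically includes keys such as training_loss and best_validation_loss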
Example #10
    serialization_dir = args.serialization_dir
    with open(args.config, "r") as config_f:
        params = Params(json.loads(config_f.read()))

    # 1. setting up dataset, vocab and dataloaders
    dataset_reader = DSLSharedTaskDataset()

    train_dataset = dataset_reader.read(params["train_data_path"])
    valid_dataset = dataset_reader.read(params["validation_data_path"])

    vocab = build_vocab(train_dataset + valid_dataset)
    train_dataset.index_with(vocab)
    valid_dataset.index_with(vocab)
    data_loader_params = params.pop('data_loader')
    batch_size = data_loader_params['batch_size']
    train_loader = DataLoader.from_params(dataset=train_dataset,
                                          params=data_loader_params)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=batch_size,
                              shuffle=False)

    # 2. setting up model and training details

    # model = build_model(vocab)
    model = Model.from_params(vocab=vocab, params=params["model"])
    model.cuda()
    trainer = Trainer.from_params(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=valid_loader,
        params=params['trainer'],
    )
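A sketch of the pieces of the config file this script reads; the values shown are placeholders and the "model" section is omitted, so treat it only as an illustration of the expected structure:

from allennlp.common import Params

params = Params({
    "train_data_path": "data/train.tsv",
    "validation_data_path": "data/dev.tsv",
    "data_loader": {"batch_size": 32, "shuffle": True},
    "trainer": {"num_epochs": 10, "optimizer": {"type": "adam", "lr": 1e-3}},
})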
Example #11
How to Fine-Tune BERT for Text Classification?: https://arxiv.org/pdf/1905.05583.pdf
lr:2e-5
batch:32
"""

batch_size = 4
embedding_dim = 256
num_epoch = 100
lr = 0.00002
num_labels = 2
grad_accum = 8

import datetime
now = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

serialization_dir = f"{cur_dir}/checkpoints_clss/lr_" + str(
    lr) + "_" + now + "_seed" + str(seed) + "_" + ("single" if not args.pseudo
                                                   else "pseudo")
vocab_dir = serialization_dir + "/vocab"

model, dataset_reader = run_training_loop()
test_data = dataset_reader.read(TEST_PATH)
test_data.index_with(model.vocab)
data_loader = PyTorchDataLoader(test_data,
                                batch_size=batch_size,
                                shuffle=False)

results = evaluate(model, data_loader, cuda_device=0)
print(results)
print("batch_size:{}, num_epoch:{}, lr:{}, grad_accum:{}".format(
    batch_size, num_epoch, lr, grad_accum))
Example #12
def main():

    opts = options()

    # Select a BERT-specific indexer
    if opts.with_bert:
        from allennlp.data.token_indexers.pretrained_transformer_mismatched_indexer import PretrainedTransformerMismatchedIndexer
        indexer = PretrainedTransformerMismatchedIndexer(
            model_name=opts.bert_name, max_length=opts.bert_max_len)
    # Otherwise, tokens are simply separated by spaces
    else:
        from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
        indexer = SingleIdTokenIndexer()

    reader = TaggerDatasetReader(token_indexers={"tokens": indexer})
    train_dataset = reader.read(opts.train_file)
    valid_dataset = reader.read(opts.valid_file)
    params = Tagger.opts2params(opts)

    with open(opts.model_dir + "/params.pkl", mode='wb') as f:
        pickle.dump(params, f)

    vocab = Vocabulary.from_instances(train_dataset + valid_dataset,
                                      min_count={'tokens': opts.min_freq})
    train_dataset.index_with(vocab)
    valid_dataset.index_with(vocab)
    train_data_loader = PyTorchDataLoader(train_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              train_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))
    valid_data_loader = PyTorchDataLoader(valid_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              valid_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))

    model = Tagger.build(params, vocab)
    if torch.cuda.is_available():
        cuda_device = opts.gpuid
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    # select an optimizer for fine-tuning
    if opts.with_bert:
        from allennlp.training.optimizers import HuggingfaceAdamWOptimizer
        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = HuggingfaceAdamWOptimizer(model_parameters=parameters,
                                              lr=0.0003,
                                              parameter_groups=[
                                                  ([".*transformer.*"], {
                                                      "lr": 1e-05
                                                  })
                                              ])
    # optimizer for random initialization
    else:
        import torch.optim as optim
        optimizer = optim.Adam(model.parameters(), lr=0.001)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=valid_data_loader,
        num_epochs=1,
        use_amp=opts.use_amp,
        num_gradient_accumulation_steps=opts.num_gradient_accumulation_steps,
        cuda_device=cuda_device)

    vocab.save_to_files(opts.model_dir + "/vocab")

    best_f1 = 0.0
    for i in range(opts.epochs):
        epoch = i + 1
        print('Epoch: {}'.format(epoch))
        info = trainer.train()
        print(info)
        if info["validation_accuracy"] > best_f1:
            best_f1 = info["validation_accuracy"]
            with open(opts.model_dir + "/save_" + str(epoch) + ".save",
                      'wb') as f_model:
                torch.save(model.state_dict(), f_model)
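A sketch of how a checkpoint saved by this script could be reloaded for prediction, reusing the names from the example above; the epoch number in the file name is a placeholder:

with open(opts.model_dir + "/params.pkl", "rb") as f:
    params = pickle.load(f)
vocab = Vocabulary.from_files(opts.model_dir + "/vocab")
model = Tagger.build(params, vocab)
model.load_state_dict(torch.load(opts.model_dir + "/save_1.save", map_location="cpu"))
model.eval()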
Example #13
batch_size = 2
embedding_dim = 200
num_epoch = 75
lr = 0.0001
num_labels = 2
grad_accum = 16
weight_decay = 0.0001
validation_metric = "+f1-measure-overall"
num_serialized_models_to_keep = 3
grad_norm = 5.0
patience = 25

import datetime
now = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

serialization_dir = f"{cur_dir}/checkpoints_ner/lr_" + str(
    lr) + "_" + now + "_seed" + str(seed) + "_" + ("single" if not args.pseudo
                                                   else "pseudo")
vocab_dir = serialization_dir + "/vocab"

model, dataset_reader = run_training_loop()
test_data = dataset_reader.read(TEST_PATH)
test_data.index_with(model.vocab)
data_loader = PyTorchDataLoader(test_data, batch_size=batch_size)

results = evaluate(model, data_loader, cuda_device=0)
print(results)
print("batch_size:{}, num_epoch:{}, lr:{}, grad_accum:{}".format(
    batch_size, num_epoch, lr, grad_accum))
Example #14
def train(train, validation, optimizer_name):
    batch_size = 32
    learning_rate = 0.01
    max_iterations = 100

    token_indexer = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }

    reader = Conll2003DatasetReader(token_indexer)

    train_dataset = reader.read(train)

    validation_dataset = reader.read(validation)

    # Once we've read in the datasets, we use them to create our Vocabulary
    # (that is, the mapping[s] from tokens / labels to ids).
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    # Build the model

    model = get_model(vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    if optimizer_name == 'adahessian':
        optimizer = Adahessian(model.parameters(),
                               lr=learning_rate,
                               block_length=2)
    elif optimizer_name == 'ranger':
        optimizer = Ranger(model.parameters(), lr=learning_rate)
    else:
        raise AttributeError()

    train_dataset.index_with(vocab)
    validation_dataset.index_with(vocab)

    scheduler = ReduceOnPlateauLearningRateScheduler(optimizer,
                                                     factor=0.5,
                                                     patience=4,
                                                     mode="min",
                                                     verbose=True)

    dl = PyTorchDataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )

    dl_validation = PyTorchDataLoader(validation_dataset,
                                      batch_size=batch_size,
                                      shuffle=False)

    trainer_model = AdaTrainer

    trainer = trainer_model(
        model=model,
        optimizer=optimizer,
        # iterator=iterator,
        grad_norm=10.0,
        data_loader=dl,
        validation_data_loader=dl_validation,
        learning_rate_scheduler=scheduler,
        patience=8,
        num_epochs=max_iterations,
        cuda_device=cuda_device,
    )
    train_metrics = trainer.train()
    print(train_metrics)
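A hypothetical invocation of this training function; the CoNLL-2003 file paths and the optimizer name below are placeholders, not values from the original script:

if __name__ == "__main__":
    train("data/conll2003/train.txt", "data/conll2003/valid.txt", "ranger")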