def create_dataloader(
    dataset: InstancesDataset,
    batch_size: int,
    data_bucketing: bool = False,
    batches_per_epoch: Optional[int] = None,
) -> PyTorchDataLoader:
    """Returns a PyTorch DataLoader for AllenNLP

    Parameters
    ----------
    dataset
        The dataset for the DataLoader
    batch_size
        Size of the batch.
    data_bucketing
        If enabled, try to apply data bucketing over training batches.
    batches_per_epoch
        Determines the number of batches after which an epoch ends.
        If the number is smaller than the total amount of batches in your data,
        the second "epoch" will pick up where the first "epoch" ended.
        If this is `None`, then an epoch is set to be one full pass through your data.

    Returns
    -------
    data_loader
    """
    return (
        PyTorchDataLoader(
            dataset,
            batch_sampler=BucketBatchSampler(data_source=dataset, batch_size=batch_size),
            batches_per_epoch=batches_per_epoch,
        )
        if data_bucketing and not isinstance(dataset, IterableDataset)
        else PyTorchDataLoader(
            dataset, batch_size=batch_size, batches_per_epoch=batches_per_epoch
        )
    )
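# --- Usage sketch (not part of the original snippet). It shows how the helper above
# might be called; `train_dataset` and the batch sizes are illustrative assumptions,
# standing in for an already-read and indexed AllenNLP dataset.

# Plain fixed-size batches; one epoch is one full pass over the data.
train_loader = create_dataloader(train_dataset, batch_size=16)

# Length-bucketed batches, with an "epoch" cut off after 100 batches.
bucketed_loader = create_dataloader(
    train_dataset,
    batch_size=16,
    data_bucketing=True,
    batches_per_epoch=100,
)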
def test_from_params_in_trainer(self):
    # This is more of an integration test, making sure that a bunch of pieces fit together
    # correctly, but it matters most for this learning rate scheduler, so we're testing it here.
    params = Params(
        {
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        }
    )
    # The method called in the logic below only checks the length of this list, not its
    # contents, so this should be safe.
    instances = AllennlpDataset([1] * 40)
    optim = self._get_optimizer()
    trainer = Trainer.from_params(
        model=self.model,
        optimizer=Lazy(lambda **kwargs: optim),
        serialization_dir=self.TEST_DIR,
        params=params,
        data_loader=PyTorchDataLoader(instances, batch_size=10),
    )
    assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular)

    # This is what we wrote this test for: to be sure that num_epochs is passed correctly, and
    # that num_steps_per_epoch is computed and passed correctly. This logic happens inside of
    # `Trainer.from_partial_objects`.
    assert trainer._learning_rate_scheduler.num_epochs == 5
    assert trainer._learning_rate_scheduler.num_steps_per_epoch == 4

    # And we'll do one more to make sure that we can override num_epochs in the scheduler if we
    # really want to. Not sure why you would ever want to in this case; this is just testing
    # the functionality.
    params = Params(
        {
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "num_epochs": 3,
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        }
    )
    trainer = Trainer.from_params(
        model=self.model,
        optimizer=Lazy(lambda **kwargs: optim),
        serialization_dir=self.TEST_DIR,
        params=params,
        data_loader=PyTorchDataLoader(instances, batch_size=10),
    )
    assert trainer._learning_rate_scheduler.num_epochs == 3
def build_data_loaders(
    train_data: torch.utils.data.Dataset, dev_data: torch.utils.data.Dataset
) -> Tuple[allennlp.data.PyTorchDataLoader, allennlp.data.PyTorchDataLoader]:
    # `batch_size` and `num_virtual_models` are not defined here; they come from the
    # enclosing scope of the original script.
    train_loader = PyTorchDataLoader(train_data, batch_size=batch_size, shuffle=True)
    dev_loader = PyTorchDataLoader(dev_data, batch_size=num_virtual_models, shuffle=False)
    return train_loader, dev_loader
def build_data_loaders(
    train_data: torch.utils.data.Dataset, dev_data: torch.utils.data.Dataset
) -> Tuple[allennlp.data.DataLoader, allennlp.data.DataLoader]:
    # Note that DataLoader is imported from allennlp above, *not* torch.
    # We need to get the allennlp-specific collate function, which is
    # what actually does indexing and batching.
    batch_size = 8
    train_loader = PyTorchDataLoader(train_data, batch_size=batch_size, shuffle=True)
    dev_loader = PyTorchDataLoader(dev_data, batch_size=batch_size, shuffle=False)
    return train_loader, dev_loader
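# --- Import sketch (an assumption, not copied from the original file): the snippet above
# relies on AllenNLP's own data loader rather than torch.utils.data.DataLoader. As of
# AllenNLP 1.x the class lives in allennlp.data.dataloader; exact paths may vary by version.
from typing import Tuple

import torch
import allennlp.data
from allennlp.data.dataloader import PyTorchDataLoader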
def test_can_optimise_model_with_dense_and_sparse_params(self):
    optimizer_params = Params({"type": "dense_sparse_adam"})
    parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(model_parameters=parameters, params=optimizer_params)
    self.instances.index_with(self.vocab)
    GradientDescentTrainer(self.model, optimizer, PyTorchDataLoader(self.instances, 2)).train()
def test_regularization(self):
    penalty = self.model.get_regularization_penalty()
    assert penalty is None

    data_loader = PyTorchDataLoader(self.instances, batch_size=32)
    trainer = GradientDescentTrainer(self.model, None, data_loader)  # optimizer,

    # You get a RuntimeError if you call `model.forward` twice on the same inputs.
    # The data and config are such that the whole dataset is one batch.
    training_batch = next(iter(data_loader))
    validation_batch = next(iter(data_loader))

    training_loss = trainer.batch_outputs(training_batch, for_training=True)["loss"].item()
    validation_loss = trainer.batch_outputs(validation_batch, for_training=False)["loss"].item()

    # Training loss would include the regularization penalty while validation loss never does;
    # since the penalty is None here, the two losses should be identical.
    numpy.testing.assert_almost_equal(training_loss, validation_loss)
def test_evaluation(self) -> Dict[str, Any]:
    """
    Evaluates the model against the test dataset (if defined)

    Returns
    -------
    Test metrics information
    """
    test_data = self._test
    if not test_data:
        return {}

    self.__LOGGER.info("The model will be evaluated using the best epoch weights.")
    return evaluate(
        self._pipeline._model,
        data_loader=PyTorchDataLoader(test_data, batch_size=self._trainer_config.batch_size),
        cuda_device=self._trainer.cuda_device,
        batch_weight_key=self._batch_weight_key,
    )
torch.backends.cudnn.deterministic = True

# Load the training data
dataset_reader = IntentSlotDatasetReader()
train_data = dataset_reader.read('data/training')
valid_data = dataset_reader.read('data/validation')

vocab = Vocabulary.from_instances(train_data + valid_data)
vocab.save_to_files('vocab')
train_data.index_with(vocab)
valid_data.index_with(vocab)
train_loader = PyTorchDataLoader(train_data, batch_size=8, shuffle=True)
valid_loader = PyTorchDataLoader(valid_data, batch_size=8, shuffle=False)

# Build the model
embedder = BasicTextFieldEmbedder(
    {'tokens': Embedding(
        embedding_dim=10,
        num_embeddings=vocab.get_vocab_size('tokens'))})
encoder = LstmSeq2VecEncoder(10, 32, bidirectional=True)
# encoder = BagOfEmbeddingsEncoder(embedding_dim=10)
model = IntentEstimator(vocab, embedder, encoder)
model.cuda()
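# --- Hypothetical continuation (not part of the original script): one way the pieces above
# are typically wired into a trainer, mirroring the pattern used in the other snippets in
# this collection. The optimizer choice, epoch count, and import paths (AllenNLP 1.x) are
# assumptions.
from allennlp.training import GradientDescentTrainer
from allennlp.training.optimizers import AdamOptimizer

optimizer = AdamOptimizer(model.named_parameters())
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_loader,
    validation_data_loader=valid_loader,
    num_epochs=10,
    cuda_device=0,
)
trainer.train()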
train_dataset.index_with(vocab)
validation_dataset.index_with(vocab)

# Build the word embeddings
embedding = Embedding(num_embeddings=vocab.get_vocab_size(), embedding_dim=100)

# Build the text feature vectors (text field embedder)
text_embedder = BasicTextFieldEmbedder({"tokens": embedding})
encoder = BagOfEmbeddingsEncoder(embedding_dim=100)

# Build the document classifier
model = BasicClassifier(vocab=vocab,
                        text_field_embedder=text_embedder,
                        seq2vec_encoder=encoder)

# Data loaders
train_loader = PyTorchDataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = PyTorchDataLoader(validation_dataset, batch_size=32, shuffle=False)

# Copy the model to the GPU
# model = model.cuda()

# Build the optimizer
optimizer = AdamOptimizer(model.named_parameters())

# Build the trainer
trainer = GradientDescentTrainer(model=model,
                                 optimizer=optimizer,
                                 data_loader=train_loader,
                                 validation_data_loader=validation_loader,
serialization_dir = args.serialization_dir
with open(args.config, "r") as config_f:
    params = Params(json.loads(config_f.read()))

# 1. setting up dataset, vocab and dataloaders
dataset_reader = DSLSharedTaskDataset()
train_dataset = dataset_reader.read(params["train_data_path"])
valid_dataset = dataset_reader.read(params["validation_data_path"])
vocab = build_vocab(train_dataset + valid_dataset)
train_dataset.index_with(vocab)
valid_dataset.index_with(vocab)
data_loader_params = params.pop('data_loader')
batch_size = data_loader_params['batch_size']
train_loader = DataLoader.from_params(dataset=train_dataset, params=data_loader_params)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# 2. setting up model and training details
# model = build_model(vocab)
model = Model.from_params(vocab=vocab, params=params["model"])
model.cuda()
trainer = Trainer.from_params(
    model=model,
    serialization_dir=serialization_dir,
    data_loader=train_loader,
    validation_data_loader=valid_loader,
    params=params['trainer'],
How to Fine-Tune BERT for Text Classification?: https://arxiv.org/pdf/1905.05583.pdf
lr: 2e-5
batch: 32
"""
batch_size = 4
embedding_dim = 256
num_epoch = 100
lr = 0.00002
num_labels = 2
grad_accum = 8

import datetime
now = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
serialization_dir = (
    f"{cur_dir}/checkpoints_clss/lr_" + str(lr) + "_" + now + "_seed" + str(seed)
    + "_" + ("single" if not args.pseudo else "pseudo")
)
vocab_dir = serialization_dir + "/vocab"

model, dataset_reader = run_training_loop()
test_data = dataset_reader.read(TEST_PATH)
test_data.index_with(model.vocab)
data_loader = PyTorchDataLoader(test_data, batch_size=batch_size, shuffle=False)

results = evaluate(model, data_loader, cuda_device=0)
print(results)
print("batch_size:{}, num_epoch:{}, lr:{}, grad_accum:{}".format(
    batch_size, num_epoch, lr, grad_accum))
def main():
    opts = options()

    # select a bert specific indexer
    if opts.with_bert:
        from allennlp.data.token_indexers.pretrained_transformer_mismatched_indexer import PretrainedTransformerMismatchedIndexer
        indexer = PretrainedTransformerMismatchedIndexer(
            model_name=opts.bert_name, max_length=opts.bert_max_len)
    # separate by spaces
    else:
        from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
        indexer = SingleIdTokenIndexer()

    reader = TaggerDatasetReader(token_indexers={"tokens": indexer})
    train_dataset = reader.read(opts.train_file)
    valid_dataset = reader.read(opts.valid_file)

    params = Tagger.opts2params(opts)
    with open(opts.model_dir + "/params.pkl", mode='wb') as f:
        pickle.dump(params, f)

    vocab = Vocabulary.from_instances(train_dataset + valid_dataset,
                                      min_count={'tokens': opts.min_freq})
    train_dataset.index_with(vocab)
    valid_dataset.index_with(vocab)
    train_data_loader = PyTorchDataLoader(train_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              train_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))
    valid_data_loader = PyTorchDataLoader(valid_dataset,
                                          batch_sampler=BucketBatchSampler(
                                              valid_dataset,
                                              batch_size=opts.batch_size,
                                              sorting_keys=["tokens"]))

    model = Tagger.build(params, vocab)
    if torch.cuda.is_available():
        cuda_device = opts.gpuid
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    # select an optimizer for fine-tuning
    if opts.with_bert:
        from allennlp.training.optimizers import HuggingfaceAdamWOptimizer
        parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
        optimizer = HuggingfaceAdamWOptimizer(model_parameters=parameters,
                                              lr=0.0003,
                                              parameter_groups=[
                                                  ([".*transformer.*"], {"lr": 1e-05})
                                              ])
    # optimizer for random initialization
    else:
        import torch.optim as optim
        optimizer = optim.Adam(model.parameters(), lr=0.001)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=valid_data_loader,
        num_epochs=1,
        use_amp=opts.use_amp,
        num_gradient_accumulation_steps=opts.num_gradient_accumulation_steps,
        cuda_device=cuda_device)

    vocab.save_to_files(opts.model_dir + "/vocab")

    best_f1 = 0.0
    for i in range(opts.epochs):
        epoch = i + 1
        print('Epoch: {}'.format(epoch))
        info = trainer.train()
        print(info)
        if info["validation_accuracy"] > best_f1:
            best_f1 = info["validation_accuracy"]
            with open(opts.model_dir + "/save_" + str(epoch) + ".save", 'wb') as f_model:
                torch.save(model.state_dict(), f_model)
batch_size = 2
embedding_dim = 200
num_epoch = 75
lr = 0.0001
num_labels = 2
grad_accum = 16
weight_decay = 0.0001
validation_metric = "+f1-measure-overall"
num_serialized_models_to_keep = 3
grad_norm = 5.0
patience = 25

import datetime
now = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
serialization_dir = (
    f"{cur_dir}/checkpoints_ner/lr_" + str(lr) + "_" + now + "_seed" + str(seed)
    + "_" + ("single" if not args.pseudo else "pseudo")
)
vocab_dir = serialization_dir + "/vocab"

model, dataset_reader = run_training_loop()
test_data = dataset_reader.read(TEST_PATH)
test_data.index_with(model.vocab)
data_loader = PyTorchDataLoader(test_data, batch_size=batch_size)

results = evaluate(model, data_loader, cuda_device=0)
print(results)
print("batch_size:{}, num_epoch:{}, lr:{}, grad_accum:{}".format(
    batch_size, num_epoch, lr, grad_accum))
def train(train, validation, optimizer_name):
    batch_size = 32
    learning_rate = 0.01
    max_iterations = 100

    token_indexer = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }
    reader = Conll2003DatasetReader(token_indexer)
    train_dataset = reader.read(train)
    validation_dataset = reader.read(validation)

    # Once we've read in the datasets, we use them to create our Vocabulary
    # (that is, the mapping[s] from tokens / labels to ids).
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    # Set variables
    model = get_model(vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    if optimizer_name == 'adahessian':
        optimizer = Adahessian(model.parameters(), lr=learning_rate, block_length=2)
    elif optimizer_name == 'ranger':
        optimizer = Ranger(model.parameters(), lr=learning_rate)
    else:
        raise AttributeError()

    train_dataset.index_with(vocab)
    validation_dataset.index_with(vocab)

    scheduler = ReduceOnPlateauLearningRateScheduler(
        optimizer, factor=0.5, patience=4, mode="min", verbose=True)

    dl = PyTorchDataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    dl_validation = PyTorchDataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

    trainer_model = AdaTrainer
    trainer = trainer_model(
        model=model,
        optimizer=optimizer,
        # iterator=iterator,
        grad_norm=10.0,
        data_loader=dl,
        validation_data_loader=dl_validation,
        learning_rate_scheduler=scheduler,
        patience=8,
        num_epochs=max_iterations,
        cuda_device=cuda_device,
    )

    train_metrics = trainer.train()
    print(train_metrics)