Example #1
def main():
    reader = LinzenDatasetReader(append_null=False)
    train_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.train")
    validation_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.val")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    model = StackRNNAgreementPredictor(vocab,
                                       rnn_dim=100,
                                       rnn_cell_type=torch.nn.GRUCell)
    # model = SimpleRNNAgreementPredictor(
    #     vocab, rnn_dim=18, rnn_type=torch.nn.GRU)

    optimizer = torch.optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=16,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=5)
    trainer.train()

    with open("/tmp/model.th", "wb") as fh:
        torch.save(model.state_dict(), fh)
    vocab.save_to_files("/tmp/vocabulary")
Example #2
 def test_trainer_can_run(self):
     trainer = Trainer(self.model,
                       self.optimizer,
                       self.iterator,
                       self.dataset,
                       num_epochs=2)
     trainer.train()
Example #3
    def test_trainer_can_run_multiple_gpu(self):

        class MetaDataCheckWrapper(Model):
            """
            Checks that the metadata field has been correctly split across the batch dimension
            when running on multiple gpus.
            """
            def __init__(self, model):
                super().__init__(model.vocab)
                self.model = model

            def forward(self, **kwargs) -> Dict[str, torch.Tensor]:  # type: ignore # pylint: disable=arguments-differ
                assert 'metadata' in kwargs and 'tags' in kwargs, \
                    f'tags and metadata must be provided. Got {kwargs.keys()} instead.'
                batch_size = kwargs['tokens']['tokens'].size()[0]
                assert len(kwargs['metadata']) == batch_size, \
                    f'metadata must be split appropriately. Expected {batch_size} elements, ' \
                    f"got {len(kwargs['metadata'])} elements."
                return self.model.forward(**kwargs)

        multigpu_iterator = BasicIterator(batch_size=4)
        multigpu_iterator.index_with(self.vocab)
        trainer = Trainer(MetaDataCheckWrapper(self.model), self.optimizer,
                          multigpu_iterator, self.instances, num_epochs=2,
                          cuda_device=[0, 1])
        trainer.train()
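
The wrapper above relies on the trainer splitting per-instance lists (metadata) the same way it scatters tensors across devices. A hedged illustration of that invariant (not AllenNLP's actual scatter code):

import torch

batch_tokens = torch.arange(12).view(4, 3)      # (batch=4, seq_len=3) tensor input
batch_metadata = [{"id": i} for i in range(4)]  # one metadata dict per instance

num_devices = 2
token_chunks = torch.chunk(batch_tokens, num_devices, dim=0)
size = (len(batch_metadata) + num_devices - 1) // num_devices
meta_chunks = [batch_metadata[i:i + size]
               for i in range(0, len(batch_metadata), size)]

for toks, meta in zip(token_chunks, meta_chunks):
    assert toks.size(0) == len(meta)  # the invariant MetaDataCheckWrapper asserts per device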
Example #4
 def train_and_save():
     model = LstmTagger(word_embeddings, lstm, vocab)
     if cuda_device >= 0:
         model = model.cuda(cuda_device)
     optimizer = optim.SGD(model.parameters(), lr=0.1)
     iterator = BucketIterator(batch_size=2,
                               sorting_keys=[("sentence", "num_tokens")])
     iterator.index_with(vocab)
     trainer = Trainer(model=model,
                       optimizer=optimizer,
                       iterator=iterator,
                       train_dataset=train_dataset,
                       validation_dataset=validation_dataset,
                       patience=10,
                       num_epochs=500,
                       cuda_device=cuda_device)
     trainer.train()
     predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
     tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
     tag_ids = np.argmax(tag_logits, axis=-1)
     print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])
     # Here's how to save the model.
     with open("./tmp/model.th", 'wb') as f:
         torch.save(model.state_dict(), f)
     vocab.save_to_files("./tmp/vocabulary")
     return tag_logits
Example #5
def train_detector(args, detector, vocab, trapdoor_train, trapdoor_dev=None):
    iterator = BucketIterator(batch_size=args.detector_batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  detector.parameters()),
                           lr=args.lr_detector)

    train_counts = np.unique([a['label'].label for a in trapdoor_train],
                             return_counts=True)[1]
    print("Distribution of detector train", train_counts)
    if trapdoor_dev is not None:
        dev_counts = np.unique([a['label'].label for a in trapdoor_dev],
                               return_counts=True)[1]
        print("Distribution of detector dev", dev_counts)

    class_weight = torch.from_numpy(np.max(train_counts) /
                                    train_counts).cuda().float()
    detector.set_class_weight(class_weight)

    trainer = Trainer(model=detector,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=trapdoor_train,
                      validation_dataset=trapdoor_dev,
                      num_epochs=args.detector_epochs,
                      patience=args.detector_patience,
                      cuda_device=0)
    trainer.train()
Example #6
def train_model(args,
                model,
                vocab,
                train_data,
                dev_data=None,
                epochs=None,
                weight_balance=True):
    if weight_balance:
        train_counts = np.unique([a['label'].label for a in train_data],
                                 return_counts=True)[1]
        class_weight = torch.from_numpy(np.max(train_counts) /
                                        train_counts).cuda().float()
        model.set_class_weight(class_weight)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      num_epochs=epochs if epochs else args.epochs,
                      patience=args.patience,
                      cuda_device=0)
    trainer.train()
Example #7
def main():
    reader = MarvinLinzenLMDatasetReader(append_null=False)
    train_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.train")
    validation_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.val")
    vocab = Vocabulary.from_files("saved_models/vocabulary_brown")

    model = StackRNNLanguageModel(vocab,
                                  rnn_dim=100,
                                  rnn_cell_type=torch.nn.GRUCell)
    model.load_state_dict(torch.load("saved_models/stack-brown.th"))

    optimizer = torch.optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=16,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=5)
    trainer.train()

    with open("/tmp/model.th", "wb") as fh:
        torch.save(model.state_dict(), fh)
    vocab.save_to_files("/tmp/vocabulary")
Example #8
def running_NER():
    reader = PosDatasetReader()
    train_dataset = reader.read('../data/700_multi_data/600_ner_train.txt')
    validation_dataset = reader.read('../data/700_multi_data/66_ner_test.txt')

    vocab = Vocabulary.from_files("../model_store/vocabulary")

    # '''vocab part'''
    # train_1 = reader.read('../data/train/train.json')
    # train_2 = reader.read('../data/train/dev.json')

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=1000)
    trainer.train()
Example #9
def running_whole_model():
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    iterator = BucketIterator(batch_size=BATCH_SIZE, sorting_keys=[("sentence", "num_tokens"),
                                                                   ("structures1", "num_tokens"),
                                                                   ("structures2", "num_tokens"),
                                                                   ("structures3", "num_tokens")])
    iterator.index_with(vocab)


    model = All_generating(embed_size=EMBEDDING_DIM,
                           word_embeddings=word_embeddings,
                           vocab=vocab,
                           num_of_candidates=7,
                           )

    # optimizer = adabound.AdaBound(model.parameters(), lr=lr, final_lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr=lr)


    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=whole_train_dataset,
                      validation_dataset=whole_validation_dataset,
                      patience=5,
                      num_epochs=30)
    trainer.train()
Example #10
    def test_trainer_can_run(self):
        trainer = Trainer(model=self.model,
                          optimizer=self.optimizer,
                          iterator=self.iterator,
                          train_dataset=self.instances,
                          validation_dataset=self.instances,
                          num_epochs=2)
        metrics = trainer.train()
        assert 'best_validation_loss' in metrics
        assert isinstance(metrics['best_validation_loss'], float)
        assert 'best_validation_accuracy' in metrics
        assert isinstance(metrics['best_validation_accuracy'], float)
        assert 'best_validation_accuracy3' in metrics
        assert isinstance(metrics['best_validation_accuracy3'], float)
        assert 'best_epoch' in metrics
        assert isinstance(metrics['best_epoch'], int)

        # Making sure that both increasing and decreasing validation metrics work.
        trainer = Trainer(model=self.model,
                          optimizer=self.optimizer,
                          iterator=self.iterator,
                          train_dataset=self.instances,
                          validation_dataset=self.instances,
                          validation_metric='+loss',
                          num_epochs=2)
        metrics = trainer.train()
        assert 'best_validation_loss' in metrics
        assert isinstance(metrics['best_validation_loss'], float)
        assert 'best_validation_accuracy' in metrics
        assert isinstance(metrics['best_validation_accuracy'], float)
        assert 'best_validation_accuracy3' in metrics
        assert isinstance(metrics['best_validation_accuracy3'], float)
        assert 'best_epoch' in metrics
        assert isinstance(metrics['best_epoch'], int)
Example #11
 def test_configuration_error_when_passed_as_conflicting_argument_to_trainer(self):
     """
     Users should initialize Trainer either with an instance of Checkpointer or by specifying
     parameter values for num_serialized_models_to_keep and keep_serialized_model_every_num_seconds.
     Check that Trainer raises a ConfigurationError if both methods are used at the same time.
     """
     with self.assertRaises(ConfigurationError):
         Trainer(None, None, None, None,
                 num_serialized_models_to_keep=30,
                 keep_serialized_model_every_num_seconds=None,
                 checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                           num_serialized_models_to_keep=40,
                                           keep_serialized_model_every_num_seconds=2))
     with self.assertRaises(ConfigurationError):
         Trainer(None, None, None, None,
                 num_serialized_models_to_keep=20,
                 keep_serialized_model_every_num_seconds=2,
                 checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                           num_serialized_models_to_keep=40,
                                           keep_serialized_model_every_num_seconds=2))
     try:
         Trainer(None, None, None, None,
                 checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                           num_serialized_models_to_keep=40,
                                           keep_serialized_model_every_num_seconds=2))
     except ConfigurationError:
         self.fail("Configuration Error raised for passed checkpointer")
Example #12
def trainModel(train_dataset, validation_dataset, vocab):
    EMBEDDING_DIM = 6
    HIDDEN_DIM = 6
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, bidirectional=False, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    # optimizer = optim.AdamW(model.parameters(), lr=1e-4, eps=1e-8)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=100,
                      cuda_device=cuda_device)
    trainer.train()
    return model
Example #13
def main():
    reader = TatoebaSentenceReader()
    train_set = reader.read('data/mt/sentences.top10langs.train.tsv')
    dev_set = reader.read('data/mt/sentences.top10langs.dev.tsv')

    vocab = Vocabulary.from_instances(train_set,
                                      min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    positive_label = vocab.get_token_index('eng', namespace='labels')
    model = LstmClassifier(word_embeddings, encoder, vocab, positive_label=positive_label)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_set,
                      validation_dataset=dev_set,
                      num_epochs=3)

    trainer.train()
Example #14
 def test_trainer_can_run_multiple_gpu(self):
     multigpu_iterator = BasicIterator(batch_size=4)
     multigpu_iterator.index_with(self.vocab)
     trainer = Trainer(self.model, self.optimizer,
                       multigpu_iterator, self.instances, num_epochs=2,
                       cuda_device=[0, 1])
     trainer.train()
Example #15
    def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
        # To test:
        #   Create an iterator that sleeps for 0.5 seconds per epoch, so the total training
        #       time for one epoch is slightly greater than 0.5 seconds.
        #   Run for 6 epochs, keeping the last 2 models, models also kept every 1 second.
        #   Check the resulting checkpoints.  Should then have models at epochs
        #       2, 4, plus the last two at 5 and 6.
        class WaitingIterator(BasicIterator):
            # pylint: disable=arguments-differ
            def _create_batches(self, *args, **kwargs):
                time.sleep(0.5)
                return super(WaitingIterator, self)._create_batches(*args, **kwargs)

        iterator = WaitingIterator(batch_size=2)
        iterator.index_with(self.vocab)

        trainer = Trainer(self.model, self.optimizer,
                          iterator, self.instances, num_epochs=6,
                          serialization_dir=self.TEST_DIR,
                          num_serialized_models_to_keep=2,
                          keep_serialized_model_every_num_seconds=1)
        trainer.train()

        # Now check the serialized files
        for prefix in ['model_state_epoch_*', 'training_state_epoch_*']:
            file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
            epochs = [int(re.search(r"_([0-9])\.th", fname).group(1))
                      for fname in file_names]
            # epoch N has N-1 in file name
            assert sorted(epochs) == [1, 3, 4, 5]
Example #16
def main():
    reader = LanguageModelingReader()
    train_dataset = reader.read('data/mt/sentences.eng.10k.txt')

    # for inst in train_dataset:
    #     print(inst)

    vocab = Vocabulary.from_instances(train_dataset, min_count={'tokens': 5})

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("input_tokens", "num_tokens")])

    iterator.index_with(vocab)

    model = RNNLanguageModel(vocab, cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      patience=10,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)

    trainer.train()

    print(model.generate())
    print(model.generate())
    print(model.generate())
    print(model.generate())
    print(model.generate())
Example #18
    def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
        # To test:
        #   Create an iterator that sleeps for 2.5 seconds per epoch, so the total training
        #       time for one epoch is slightly greater than 2.5 seconds.
        #   Run for 6 epochs, keeping the last 2 models, models also kept every 5 seconds.
        #   Check the resulting checkpoints.  Should then have models at epochs
        #       2, 4, plus the last two at 5 and 6.
        class WaitingIterator(BasicIterator):
            # pylint: disable=arguments-differ
            def _create_batches(self, *args, **kwargs):
                time.sleep(2.5)
                return super(WaitingIterator, self)._create_batches(*args, **kwargs)

        iterator = WaitingIterator(batch_size=2)
        iterator.index_with(self.vocab)

        trainer = Trainer(self.model, self.optimizer,
                          iterator, self.instances, num_epochs=6,
                          serialization_dir=self.TEST_DIR,
                          num_serialized_models_to_keep=2,
                          keep_serialized_model_every_num_seconds=5)
        trainer.train()

        # Now check the serialized files
        for prefix in ['model_state_epoch_*', 'training_state_epoch_*']:
            file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
            epochs = [int(re.search(r"_([0-9])\.th", fname).group(1))
                      for fname in file_names]
            # epoch N has N-1 in file name
            assert sorted(epochs) == [1, 3, 4, 5]
Example #19
def train_model(parameters, name):
    token_indexer = {
        "tokens": ELMoTokenCharactersIndexer()
    } if parameters['use_elmo'] else None
    if parameters["dataset"] == "ssj":
        reader = SSJ500KReader(token_indexer)
    else:
        reader = SentiCorefReader(token_indexer)
    train_dataset = reader.read("train")
    validation_dataset = reader.read("test")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
    # vocab = Vocabulary() if parameters['use_elmo'] else Vocabulary.from_instances(train_dataset + validation_dataset)
    model = get_model(vocab, parameters)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    optimizer = optim.Adam(model.parameters(),
                           lr=parameters['lr'],
                           weight_decay=parameters['weight_decay'])
    iterator = BucketIterator(batch_size=parameters['batch_size'],
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=parameters['patience'],
                      num_epochs=parameters['num_epochs'],
                      cuda_device=cuda_device)
    trainer.train()
    metrics = evaluate(model, validation_dataset, iterator, cuda_device, None)
    save_model_and_vocab(model, vocab, metrics, parameters, fname=name)
Example #20
def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    train_reader = BIODatasetReader(
        ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class),
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )

    train_dataset = train_reader.read('tmp.txt')
    valid_dataset = valid_reader.read('tmp.txt')

    if device == 'cuda':
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )

    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )

    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        validation_metric='+f1-measure-overall',
    )
    metrics = trainer.train()

    return model, metrics
Example #21
 def test_should_stop_early_with_decreasing_metric(self):
     new_trainer = Trainer(self.model, self.optimizer,
                           self.iterator, self.instances,
                           validation_dataset=self.instances,
                           num_epochs=3, serialization_dir=self.TEST_DIR,
                           patience=5, validation_metric="-test")
     assert new_trainer._should_stop_early([.02, .3, .2, .1, .4, .4])  # pylint: disable=protected-access
     assert not new_trainer._should_stop_early([.3, .3, .2, .1, .4, .5])  # pylint: disable=protected-access
Example #22
 def test_should_stop_early_with_increasing_metric(self):
     new_trainer = Trainer(self.model, self.optimizer,
                           self.iterator, self.instances,
                           validation_dataset=self.instances,
                           num_epochs=3, serialization_dir=self.TEST_DIR,
                           patience=5, validation_metric="+test")
     assert new_trainer._should_stop_early([.5, .3, .2, .1, .4, .4])  # pylint: disable=protected-access
     assert not new_trainer._should_stop_early([.3, .3, .3, .2, .5, .1])  # pylint: disable=protected-access
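
The two tests above pin down the patience semantics: training stops once the validation metric has gone `patience` consecutive epochs without improving on the best value seen before them. A standalone sketch of that rule (a paraphrase, not AllenNLP's actual implementation):

from typing import List

def should_stop_early(metric_history: List[float], patience: int = 5,
                      higher_is_better: bool = True) -> bool:
    # Stop only once we have more than `patience` epochs to compare against.
    if len(metric_history) <= patience:
        return False
    best_before = (max if higher_is_better else min)(metric_history[:-patience])
    recent = metric_history[-patience:]
    if higher_is_better:
        return all(m <= best_before for m in recent)  # no recent epoch improved
    return all(m >= best_before for m in recent)

This reproduces the assertions above: should_stop_early([.5, .3, .2, .1, .4, .4]) is True, while should_stop_early([.3, .3, .3, .2, .5, .1]) is False.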
Example #23
 def test_trainer_can_run_cuda(self):
     trainer = Trainer(self.model,
                       self.optimizer,
                       self.iterator,
                       self.instances,
                       num_epochs=2,
                       cuda_device=0)
     trainer.train()
Example #24
 def test_trainer_can_run_multiple_gpu(self):
     trainer = Trainer(self.model,
                       self.optimizer,
                       BasicIterator(batch_size=4),
                       self.instances,
                       num_epochs=2,
                       cuda_device=[0, 1])
     trainer.train()
Example #25
def train(model_args):
    model_name = model_args.serialization_dir
    checkpoint_dir = model_args.store_folder
    learning_rate = model_args.learning_rate
    rl_basic = model_args.rl_basic
    pretrain_folder = ''

    if checkpoint_dir == 'pretrain':
        is_pretrain = True
    else:
        # check if rl_basic is specified
        pretrain_folder = os.path.join('pretrain', rl_basic)
        if not os.path.exists(pretrain_folder):
            raise FileNotFoundError(f'Cannot find the pretrained model {pretrain_folder}!')
        is_pretrain = False

    reader = construct_reader(is_pretrain=is_pretrain)

    train_dataset = reader.read("data_processed\\train.jsonl")
    test_dataset = reader.read("data_processed\\test.jsonl")

    # build vocabulary
    vocab = Vocabulary.from_instances(train_dataset + test_dataset)

    # build model and move it into cuda
    model = construct_model(vocab, model_args)
    model.cuda()

    # allocate
    optimizer = optim.Adam(model.parameters(), weight_decay=1e-5, lr=learning_rate)
    scheduler = construct_learning_scheduler(optimizer)

    iterator = BucketIterator(batch_size=2, sorting_keys=[("prev_tokens", "num_tokens")])
    iterator.index_with(vocab)

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # If not recovering from a previous state, load the pretrained model by default.
    if not is_pretrain and not os.path.exists(os.path.join(checkpoint_dir, model_name, "best.th")):
        model_state = torch.load(os.path.join(pretrain_folder, "best.th"))
        model.load_state_dict(model_state)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=test_dataset,
                      learning_rate_scheduler=scheduler,
                      patience=model_args.patience,
                      validation_metric="+{}".format(model_args.validation_metric),
                      num_epochs=model_args.epoch,
                      serialization_dir=os.path.join(checkpoint_dir, model_name),
                      cuda_device=0,
                      should_log_learning_rate=True)

    trainer.train()
    return model_name
Example #26
 def test_trainer_raises_on_model_with_no_loss_key(self):
     class FakeModel(torch.nn.Module):
         def forward(self, **kwargs):  # pylint: disable=arguments-differ,unused-argument
             return {}
     with pytest.raises(RuntimeError):
         trainer = Trainer(FakeModel(), self.optimizer,
                           self.iterator, self.instances,
                           num_epochs=2, serialization_dir=self.TEST_DIR)
         trainer.train()
Example #27
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Use the 'Small' pre-trained model
    options_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    )
    weight_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    )

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()
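
The comments above note that a Seq2VecEncoder need not be an RNN. As a hedged alternative sketch, the LSTM wrapper could be swapped for AllenNLP's bag-of-embeddings encoder, which simply averages the ELMo vectors, leaving the rest of the pipeline unchanged:

from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder

# Average the embedded token vectors instead of running an LSTM over them.
encoder = BagOfEmbeddingsEncoder(embedding_dim=elmo_embedding_dim, averaged=True)
model = LstmClassifier(word_embeddings, encoder, vocab)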
Example #28
def main():
    reader = UniversalDependenciesDatasetReader()
    train_dataset = reader.read(
        'data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-train.conllu')
    dev_dataset = reader.read(
        'data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-dev.conllu')

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_SIZE)

    lstm = torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True)

    inner_model = LstmTaggerInnerModel(encoder=lstm,
                                       embedding=token_embedding,
                                       encoder_output_size=HIDDEN_SIZE,
                                       label_size=vocab.get_vocab_size('pos'))
    model = LstmTagger(inner_model, vocab)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=16,
                              sorting_keys=[("words", "num_tokens")],
                              padding_noise=0.)

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=10)
    trainer.train()

    # Run predictor for a sample sentence
    predictor = UniversalPOSPredictor(model, reader)
    logits = predictor.predict(['Time', 'flies', 'like', 'an', 'arrow',
                                '.'])['tag_logits']
    tag_ids = np.argmax(logits, axis=-1)

    print([vocab.get_token_from_index(tag_id, 'pos') for tag_id in tag_ids])

    # Export the inner_model as the ONNX format
    out_dir = 'examples/pos'
    dummy_input = torch.zeros(1, MAX_LEN, dtype=torch.long)
    dummy_mask = torch.ones(1, MAX_LEN, dtype=torch.long)
    inner_model.exporting = True
    torch.onnx.export(model=inner_model,
                      args=(dummy_input, dummy_mask),
                      f=f'{out_dir}/model.onnx',
                      verbose=True)

    vocab.save_to_files(f'{out_dir}/vocab')
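
To sanity-check the export, a hedged sketch of running the saved graph with onnxruntime (an assumed extra dependency, not used in the example above):

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("examples/pos/model.onnx")
input_names = [inp.name for inp in session.get_inputs()]

# Feed the same shapes and dtypes used as dummy inputs during export.
token_ids = np.zeros((1, MAX_LEN), dtype=np.int64)
mask = np.ones((1, MAX_LEN), dtype=np.int64)
outputs = session.run(None, dict(zip(input_names, [token_ids, mask])))
print(outputs[0].shape)  # tag logits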
Example #30
    def test_trainer_can_log_histograms(self):
        # enable activation logging
        for module in self.model.modules():
            module.should_log_activations = True

        trainer = Trainer(self.model, self.optimizer,
                          self.iterator, self.instances, num_epochs=3,
                          serialization_dir=self.TEST_DIR,
                          histogram_interval=2)
        trainer.train()
Example #32
    def test_trainer_can_resume_with_lr_scheduler(self):
        # pylint: disable=protected-access
        lr_scheduler = LearningRateScheduler.from_params(
                self.optimizer, Params({"type": "exponential", "gamma": 0.5}))
        trainer = Trainer(model=self.model,
                          optimizer=self.optimizer,
                          iterator=self.iterator,
                          learning_rate_scheduler=lr_scheduler,
                          train_dataset=self.instances,
                          validation_dataset=self.instances,
                          num_epochs=2, serialization_dir=self.TEST_DIR)
        trainer.train()

        new_lr_scheduler = LearningRateScheduler.from_params(
                self.optimizer, Params({"type": "exponential", "gamma": 0.5}))
        new_trainer = Trainer(model=self.model,
                              optimizer=self.optimizer,
                              iterator=self.iterator,
                              learning_rate_scheduler=new_lr_scheduler,
                              train_dataset=self.instances,
                              validation_dataset=self.instances,
                              num_epochs=4, serialization_dir=self.TEST_DIR)
        epoch, _ = new_trainer._restore_checkpoint()
        assert epoch == 2
        assert new_trainer._learning_rate_scheduler.lr_scheduler.last_epoch == 1
        new_trainer.train()
Example #33
def main():
    # "http://mattmahoney.net/dc/text8.zip" download first
    data_dir = 'data/word2vec/text8/text8'

    # 1. build vocab from file
    vocab = build_vocab(data_dir)

    # 2. build reader
    reader = SimpleSkipGramReader(
        window_size=WIN_SIZE)  # or SkipGramReader(vocab=vocab)
    text8 = reader.read(data_dir)

    embedding_in = Embedding(
        num_embeddings=vocab.get_vocab_size('token_target'),
        embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(
        num_embeddings=vocab.get_vocab_size('token_context'),
        embedding_dim=EMBEDDING_DIM)

    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)  # important, transform token to index

    model = SkipGramNegativeSamplingModel(vocab,
                                          embedding_in,
                                          embedding_out,
                                          neg_samples=10,
                                          cuda_device=CUDA_DEVICE)
    #
    # model = SkipGramModel(vocab=vocab,
    #                       embedding_in=embedding_in,
    #                       cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=text8,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)
    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))

    rho = evaluate_embeddings(embedding_in, vocab)
    print('SimLex-999 Spearman correlation: {}'.format(rho))
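
get_synonyms and evaluate_embeddings are project-specific helpers; a hedged sketch of the usual cosine-similarity neighbor lookup over the trained input embeddings (the function name is hypothetical; the namespace comes from the example) might look like:

import torch

def nearest_neighbors(token, embedding_in, vocab, k=5, namespace='token_target'):
    # AllenNLP's Embedding keeps its lookup table in .weight (vocab_size x dim).
    weights = embedding_in.weight.detach()
    idx = vocab.get_token_index(token, namespace)
    sims = torch.nn.functional.cosine_similarity(weights, weights[idx].unsqueeze(0), dim=1)
    best = sims.topk(k + 1).indices.tolist()  # k+1 so the query token can be dropped
    return [vocab.get_token_from_index(i, namespace) for i in best if i != idx][:k]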
Example #34
def train(model_dir):

    # prepare data
    #reader = CoqaDatasetReader()
    #reader = CoqaDatasetReader(tokenizer=lambda x: WordTokenizer().tokenize(text=x))
    reader = CoqaDatasetReader(tokenizer=lambda sent: SpacyWordSplitter(
        language='en_core_web_sm').split_words(sent))
    train_dataset = reader.read(
        cached_path('/mnt/DATA/ML/data/corpora/QA/CoQA/coqa-train-v1.0.json'))
    validation_dataset = reader.read(
        cached_path('/mnt/DATA/ML/data/corpora/QA/CoQA/coqa-dev-v1.0.json'))

    vocab = None
    model_fn = os.path.join(model_dir, 'model.th')
    vocab_fn = os.path.join(model_dir, 'vocab')
    if os.path.exists(model_dir):
        if os.path.exists(vocab_fn):
            logging.info('load vocab from: %s...' % vocab_fn)
            vocab = Vocabulary.from_files(vocab_fn)
    else:
        os.makedirs(model_dir)
    if vocab is None:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        logging.info('save vocab to: %s...' % vocab_fn)
        vocab.save_to_files(vocab_fn)
    logging.info('data prepared')

    model = create_model(vocab)

    if os.path.exists(model_fn):
        logging.info('load model weights from: %s...' % model_fn)
        with open(model_fn, 'rb') as f:
            model.load_state_dict(torch.load(f))
    logging.info('model prepared')

    # prepare training
    # optimizer = optim.SGD(model.parameters(), lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    iterator = BasicIterator(batch_size=2)
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=100)
    logging.info('training prepared')

    trainer.train()

    logging.info('save model to: %s...' % model_fn)
    with open(model_fn, 'wb') as f:
        torch.save(model.state_dict(), f)
Example #35
def train_only_swag():
    # Load the dataset reader and save logging to a local file (multitask setup).
    log.getLogger().addHandler(log.FileHandler(directory+"/log.log"))

    lr = 0.00001
    batch_size = 2
    epochs = 100
    max_seq_len = 512
    max_span_width = 30
    #token_indexer = BertIndexer(pretrained_model="bert-base-uncased", max_pieces=max_seq_len, do_lowercase=True,)
    token_indexer = PretrainedBertIndexer("bert-base-cased", do_lowercase=False)
    swag_reader = SWAGDatasetReader(tokenizer=token_indexer.wordpiece_tokenizer, lazy=True, token_indexers=token_indexer)
    EMBEDDING_DIM = 1024
    HIDDEN_DIM = 200
    swag_datasets = load_swag(swag_reader, directory)
    swag_vocab = Vocabulary()
    swag_iterator = BasicIterator(batch_size=batch_size)
    swag_iterator.index_with(swag_vocab)

    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

    bert_embedder = PretrainedBertEmbedder(pretrained_model="bert-base-cased",top_layer_only=True, requires_grad=True)

    word_embedding = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True)
    BERT_DIM = word_embedding.get_output_dim()
    seq2vec = PytorchSeq2VecWrapper(torch.nn.LSTM(BERT_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))
    mention_feedforward = FeedForward(input_dim=2336, num_layers=2, hidden_dims=150, activations=torch.nn.ReLU())
    antecedent_feedforward = FeedForward(input_dim=7776, num_layers=2, hidden_dims=150, activations=torch.nn.ReLU())

    model = SWAGExampleModel(vocab=swag_vocab, text_field_embedder=word_embedding, phrase_encoder=seq2vec)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    USE_GPU = 1
    val_iterator = swag_iterator(swag_datasets[1], num_epochs=1, shuffle=True)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=swag_iterator,
        validation_iterator=swag_iterator,
        train_dataset=swag_datasets[0],
        validation_dataset=swag_datasets[1],
        validation_metric="+accuracy",
        cuda_device=0 if USE_GPU else -1,
        serialization_dir=directory + "saved_models/current_run_model_state_swag",
        num_epochs=epochs,
    )

    metrics = trainer.train()
    # save the model
    with open(directory + "saved_models/current_run_model_state", 'wb') as f:
        torch.save(model.state_dict(), f)
Example #36
    def test_trainer_can_log_learning_rates_tensorboard(self):
        iterator = BasicIterator(batch_size=4)
        iterator.index_with(self.vocab)

        trainer = Trainer(self.model, self.optimizer,
                          iterator, self.instances, num_epochs=2,
                          serialization_dir=self.TEST_DIR,
                          should_log_learning_rate=True,
                          summary_interval=2)

        trainer.train()
Example #37
 def test_trainer_can_run_with_lr_scheduler(self):
     lr_params = Params({"type": "reduce_on_plateau"})
     lr_scheduler = LearningRateScheduler.from_params(self.optimizer, lr_params)
     trainer = Trainer(model=self.model,
                       optimizer=self.optimizer,
                       iterator=self.iterator,
                       learning_rate_scheduler=lr_scheduler,
                       validation_metric="-loss",
                       train_dataset=self.instances,
                       validation_dataset=self.instances,
                       num_epochs=2)
     trainer.train()
Example #38
    def test_trainer_respects_num_serialized_models_to_keep(self):
        trainer = Trainer(self.model, self.optimizer,
                          self.iterator, self.instances, num_epochs=5,
                          serialization_dir=self.TEST_DIR,
                          num_serialized_models_to_keep=3)
        trainer.train()

        # Now check the serialized files
        for prefix in ['model_state_epoch_*', 'training_state_epoch_*']:
            file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
            epochs = [int(re.search(r"_([0-9])\.th", fname).group(1))
                      for fname in file_names]
            assert sorted(epochs) == [2, 3, 4]
Example #39
 def test_metric_only_considered_best_so_far_when_strictly_better_than_those_before_it_decreasing_metric(self):
     new_trainer = Trainer(self.model, self.optimizer,
                           self.iterator, self.instances,
                           validation_dataset=self.instances,
                           num_epochs=3, serialization_dir=self.TEST_DIR,
                           patience=5, validation_metric="-test")
     # when it is the only metric it should be considered the best
     assert new_trainer._is_best_so_far(1, [])  # pylint: disable=protected-access
     # when it is the same as one before it it is not considered the best
     assert not new_trainer._is_best_so_far(.3, [.3, .3, .3, .2, .5, .1])  # pylint: disable=protected-access
     # when it is the best it is considered the best
     assert new_trainer._is_best_so_far(.013, [.3, .3, .3, .2, .5, .1])  # pylint: disable=protected-access
     # when it is not the best it is not considered the best
     assert not new_trainer._is_best_so_far(13.00, [.3, .3, .3, .2, .5, .1])  # pylint: disable=protected-access
Example #40
    def test_should_stop_early_with_early_stopping_disabled(self):
        # Increasing metric
        trainer = Trainer(self.model, self.optimizer, self.iterator, self.instances,
                          validation_dataset=self.instances, num_epochs=100,
                          patience=None, validation_metric="+test")
        decreasing_history = [float(i) for i in reversed(range(20))]
        assert not trainer._should_stop_early(decreasing_history)  # pylint: disable=protected-access

        # Decreasing metric
        trainer = Trainer(self.model, self.optimizer, self.iterator, self.instances,
                          validation_dataset=self.instances, num_epochs=100,
                          patience=None, validation_metric="-test")
        increasing_history = [float(i) for i in range(20)]
        assert not trainer._should_stop_early(increasing_history)  # pylint: disable=protected-access
Example #41
    def test_trainer_saves_metrics_every_epoch(self):
        trainer = Trainer(model=self.model,
                          optimizer=self.optimizer,
                          iterator=self.iterator,
                          train_dataset=self.instances,
                          validation_dataset=self.instances,
                          num_epochs=5,
                          serialization_dir=self.TEST_DIR,
                          num_serialized_models_to_keep=3)
        trainer.train()

        for epoch in range(5):
            epoch_file = self.TEST_DIR / f'metrics_epoch_{epoch}.json'
            assert epoch_file.exists()
            metrics = json.load(open(epoch_file))
            assert "validation_loss" in metrics
            assert "best_validation_loss" in metrics
            assert metrics.get("epoch") == epoch
Example #42
    def test_trainer_saves_models_at_specified_interval(self):
        iterator = BasicIterator(batch_size=4)
        iterator.index_with(self.vocab)

        trainer = Trainer(self.model, self.optimizer,
                          iterator, self.instances, num_epochs=2,
                          serialization_dir=self.TEST_DIR,
                          model_save_interval=0.0001)

        trainer.train()

        # Now check the serialized files for models saved during the epoch.
        prefix = 'model_state_epoch_*'
        file_names = sorted(glob.glob(os.path.join(self.TEST_DIR, prefix)))
        epochs = [re.search(r"_([0-9\.\-]+)\.th", fname).group(1)
                  for fname in file_names]
        # We should have checkpoints at the end of each epoch and during each, e.g.
        # [0.timestamp, 0, 1.timestamp, 1]
        assert len(epochs) == 4
        assert epochs[3] == '1'
        assert '.' in epochs[0]

        # Now make certain we can restore from timestamped checkpoint.
        # To do so, remove the checkpoint from the end of epoch 1&2, so
        # that we are forced to restore from the timestamped checkpoints.
        for k in range(2):
            os.remove(os.path.join(self.TEST_DIR, 'model_state_epoch_{}.th'.format(k)))
            os.remove(os.path.join(self.TEST_DIR, 'training_state_epoch_{}.th'.format(k)))
        os.remove(os.path.join(self.TEST_DIR, 'best.th'))

        restore_trainer = Trainer(self.model, self.optimizer,
                                  self.iterator, self.instances, num_epochs=2,
                                  serialization_dir=self.TEST_DIR,
                                  model_save_interval=0.0001)
        epoch, _ = restore_trainer._restore_checkpoint()  # pylint: disable=protected-access
        assert epoch == 2
        # One batch per epoch.
        assert restore_trainer._batch_num_total == 2  # pylint: disable=protected-access
Example #43
    def test_trainer_can_resume_training(self):
        trainer = Trainer(self.model, self.optimizer,
                          self.iterator, self.instances,
                          validation_dataset=self.instances,
                          num_epochs=1, serialization_dir=self.TEST_DIR)
        trainer.train()
        new_trainer = Trainer(self.model, self.optimizer,
                              self.iterator, self.instances,
                              validation_dataset=self.instances,
                              num_epochs=3, serialization_dir=self.TEST_DIR)

        epoch, val_metrics_per_epoch = new_trainer._restore_checkpoint()  # pylint: disable=protected-access
        assert epoch == 1
        assert len(val_metrics_per_epoch) == 1
        assert isinstance(val_metrics_per_epoch[0], float)
        assert val_metrics_per_epoch[0] != 0.
        new_trainer.train()
Example #44
 def test_trainer_can_run_cuda(self):
     trainer = Trainer(self.model, self.optimizer,
                       self.iterator, self.instances, num_epochs=2,
                       cuda_device=0)
     trainer.train()
Example #45
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine-tune, typically the result of loading a saved model archive
        produced by the ``train`` command.
    params : ``Params``
        A parameter object specifying the fine-tuning experiment, including the
        training and validation data paths.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. "
                       "Vocabulary from the saved model will be extended with current data.")

    all_datasets = datasets_from_params(params)

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = model.vocab
    vocab.extend_from_instances(vocabulary_params,
                                (instance for key, dataset in all_datasets.items()
                                 for instance in dataset
                                 if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Fine-tuning interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
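
A hedged usage sketch for the function above, assuming an archive produced by a previous allennlp train run and a fine-tuning config file (both paths are illustrative):

from allennlp.common.params import Params
from allennlp.models.archival import load_archive

archive = load_archive("serialization/base_run/model.tar.gz")  # illustrative path
params = Params.from_file("fine_tune.jsonnet")                 # illustrative path

# Any 'model' section in the config is ignored, as the docstring above explains.
fine_tuned = fine_tune_model(model=archive.model,
                             params=params,
                             serialization_dir="serialization/fine_tuned")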
Example #46
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   (instance for key, dataset in all_datasets.items()
                                    for instance in dataset
                                    if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
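# A hedged sketch of how ``train_model`` might be invoked. The keys mirror the
# ``params.pop`` calls above (``dataset_reader`` and the data paths are consumed
# inside ``datasets_from_params``), but every concrete value below is illustrative,
# not taken from this snippet:
from allennlp.common import Params

config = Params({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/train.tsv",
    "validation_data_path": "data/validation.tsv",
    "model": {
        "type": "simple_tagger",
        "text_field_embedder": {"tokens": {"type": "embedding", "embedding_dim": 50}},
        "encoder": {"type": "lstm", "input_size": 50, "hidden_size": 25}
    },
    "iterator": {"type": "bucket", "batch_size": 32,
                 "sorting_keys": [["tokens", "num_tokens"]]},
    "trainer": {"optimizer": "adam", "num_epochs": 10, "patience": 2}
})
model = train_model(config, "/tmp/tagger_experiment")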
Exemple #47
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Fail fast if a configured CUDA device is not actually available.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("The following parameters are frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("The following parameters are tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    # Choose among the registered Trainer subclasses; with no explicit
    # "type" key, the first registered choice is used as the default.
    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                best_model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
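# Relative to Exemple #46, this variant also supports a separate validation
# iterator and a multi-GPU ``cuda_device`` list. A sketch of the corresponding
# extra configuration keys (all values illustrative):
extra_config = {
    "validation_iterator": {"type": "basic", "batch_size": 128},
    "trainer": {
        "cuda_device": [0, 1],                 # a list triggers the per-device check above
        "no_grad": ["text_field_embedder.*"],  # illustrative freeze pattern
    },
}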
Exemple #48
#### Finally, we can instantiate the model.
model = LstmTagger(word_embeddings, lstm, vocab)

#### Now we're ready to train the model. The first thing we'll need is an optimizer. We can just use PyTorch's stochastic gradient descent.
optimizer = optim.SGD(model.parameters(), lr=0.1)

#### And we need a <code>DataIterator</code> that handles batching for our datasets. The <code>BucketIterator</code> sorts instances by the specified fields in order to create batches with similar sequence lengths. Here we indicate that we want to sort the instances by the number of tokens in the sentence field.
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])
#### We also specify that the iterator should make sure its instances are indexed using our vocabulary; that is, that their strings have been converted to integers using the mapping we previously created.
iterator.index_with(vocab)

#### Now we instantiate our <code>Trainer</code> and run it. Here we tell it to run for 1000 epochs and to stop training early if it ever spends 10 epochs without the validation metric improving. The default validation metric is loss (which improves by getting smaller), but it's also possible to specify a different metric and direction (e.g. accuracy should get bigger).
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)
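#### By default the trainer stops early based on validation loss. To stop on accuracy instead, AllenNLP accepts a signed metric name, where a "+" prefix means "bigger is better"; a sketch with the same arguments as above:
# trainer = Trainer(model=model,
#                   optimizer=optimizer,
#                   iterator=iterator,
#                   train_dataset=train_dataset,
#                   validation_dataset=validation_dataset,
#                   patience=10,
#                   num_epochs=1000,
#                   validation_metric="+accuracy")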

#### When we launch it, it will print a progress bar for each epoch that includes both the "loss" and the "accuracy" metric. If our model is good, the loss should go down and the accuracy up as we train.
trainer.train()

#### As in the original PyTorch tutorial, we'd like to look at the predictions our model generates. AllenNLP contains a <code>Predictor</code> abstraction that takes inputs, converts them to instances, feeds them through your model, and returns JSON-serializable results. Often you'd need to implement your own Predictor, but AllenNLP already has a <code>SentenceTaggerPredictor</code> that works perfectly here, so we can use it. It requires our model (for making predictions) and a dataset reader (for creating instances).
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
#### It has a <code>predict</code> method that just needs a sentence and returns (a JSON-serializable version of) the output dict from forward.  Here <code>tag_logits</code> will be a (5, 3) array of logits, corresponding to the 3 possible tags for each of the 5 words.
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
#### To get the actual "predictions" we can just take the <code>argmax</code>.
tag_ids = np.argmax(tag_logits, axis=-1)
#### And then use our vocabulary to find the predicted tags.
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])
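#### With the tutorial's toy data, the printed tags should look something like ['DET', 'NN', 'V', 'DET', 'NN'], though the exact output depends on how training went.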