Example #1
    if opts.gpu > -1:
        model.cuda(opts.gpu)

    train_dataset, dev_dataset = split_to_train_and_dev(
        dataset, opts.train_ratio)

    optimizer = BertAdam(model.parameters(), lr=opts.lr)
    iterator = BucketIterator(batch_size=opts.batch_size,
                              sorting_keys=[("text", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      patience=opts.patience,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      validation_metric='+accuracy',
                      cuda_device=opts.gpu,
                      serialization_dir=opts.save_dir,
                      num_epochs=opts.epoch)
    trainer.train()

if opts.eval:
    vocab = Vocabulary.from_files(os.path.join(opts.save_dir, VOCAB_DIR))

    model = BertForMultiTaskSLU(vocab, opts.bert)
    model.load_state_dict(
        torch.load(os.path.join(opts.save_dir, BEST_MODEL_FILENAME),
                   map_location=device_mapping(opts.gpu)))

    predictor = SLUPredict(model, reader, vocab)
def main():
    # In order to use ELMo, each word in a sentence needs to be indexed with
    # an array of character IDs.
    elmo_token_indexer = ELMoTokenCharactersIndexer()
    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # Initialize the ELMo-based token embedder using a pre-trained file.
    # This takes a while if you run this script for the first time

    # Original
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

    # Medium
    # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
    # weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"

    # Use the 'Small' pre-trained model
    options_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    )
    weight_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    )

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Pass in the ElmoTokenEmbedder instance instead
    embedder = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # The dimension of the ELMo embedding will be 2 x [size of LSTM hidden states]
    elmo_embedding_dim = 256
    lstm = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(embedder, lstm, vocab)
    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()

    tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict(tokens)['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
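The LstmClassifier used in this example (and in the Transformer homework snippet below) is not defined anywhere in the listing. The following is a minimal sketch of what such an AllenNLP 0.x model could look like; the constructor signature matches the calls above, but the forward pass and metric bookkeeping are assumptions rather than the original implementation.

# Hypothetical minimal LstmClassifier compatible with the calls above (assumed interface).
import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder
from allennlp.modules.text_field_embedders import TextFieldEmbedder
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy


class LstmClassifier(Model):
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        # Project the encoded sentence onto the label space.
        self.linear = torch.nn.Linear(encoder.get_output_dim(),
                                      vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, tokens, label=None):
        mask = get_text_field_mask(tokens)
        embeddings = self.embedder(tokens)
        encoded = self.encoder(embeddings, mask)
        logits = self.linear(encoded)
        output = {'logits': logits}
        if label is not None:
            self.accuracy(logits, label)
            output['loss'] = self.loss_fn(logits, label)
        return output

    def get_metrics(self, reset: bool = False):
        return {'accuracy': self.accuracy.get_metric(reset)}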
# HW
encoder = TransformerSeq2VecEncoder(EMBEDDING_DIM,
                                    HIDDEN_DIM,
                                    projection_dim=256,
                                    feedforward_hidden_dim=128,
                                    num_layers=2,
                                    num_attention_heads=4)

model = LstmClassifier(word_embeddings, encoder, vocab)
model.cuda()

optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

iterator = BucketIterator(batch_size=64,
                          sorting_keys=[("tokens", "num_tokens")])

iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=dev_dataset,
                  cuda_device=0,
                  patience=5,
                  num_epochs=15)

metrics = trainer.train()

print(metrics)
Example #4
def main():
    target_namespace = "target_tokens"
    if not USE_COPY:
        reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace=target_namespace)
            })
    else:
        reader = CopyNetDatasetReader(
            source_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_tokenizer=WordTokenizer(
                word_splitter=JustSpacesWordSplitter()),
            target_namespace=target_namespace)
    train_dataset = reader.read('./data/data_train.tsv')
    validation_dataset = reader.read('./data/data_val.tsv')

    vocab = Vocabulary.from_instances(train_dataset,
                                      min_count={
                                          'tokens': 3,
                                          'target_tokens': 3
                                      })

    en_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=SRC_EMBEDDING_DIM,
        pretrained_file="../opennmt/glove_dir/glove.840B.300d.txt")
    assert en_embedding.weight.requires_grad
    datas = _read_pretrained_embeddings_file(en_embedding._pretrained_file,
                                             SRC_EMBEDDING_DIM, vocab)
    datas.requires_grad = True
    en_embedding.weight.data = datas
    print(en_embedding.weight.data)
    assert en_embedding.weight.requires_grad
    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(SRC_EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      bidirectional=True,
                      dropout=0.3,
                      num_layers=1))
    #encoder = StackedSelfAttentionEncoder(input_dim=SRC_EMBEDDING_DIM,
    #                                      hidden_dim=HIDDEN_DIM,
    #                                      projection_dim=128, feedforward_hidden_dim=128,
    #                                      num_layers=1, num_attention_heads=8)

    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
    attention = DotProductAttention()

    if not USE_COPY:
        model = SimpleSeq2Seq(vocab,
                              source_embedder,
                              encoder,
                              MAX_DECODING_STEPS,
                              target_embedding_dim=TGT_EMBEDDING_DIM,
                              target_namespace='target_tokens',
                              attention=attention,
                              beam_size=8,
                              use_bleu=True)
    else:
        model = MyCopyNet(vocab,
                          source_embedder,
                          encoder,
                          max_decoding_steps=MAX_DECODING_STEPS,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=8,
                          tgt_embedder_pretrain_file=
                          "../opennmt/glove_dir/glove.840B.300d.txt")
    model.to(torch.device('cuda'))
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("source_tokens", "num_tokens")],
                              padding_noise=0.2)

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=22,
                      patience=4,
                      serialization_dir="./checkpoints",
                      cuda_device=CUDA_DEVICE,
                      summary_interval=100)
    trainer.train()
    print(en_embedding.weight.data)
    predictor = Seq2SeqPredictor(model, reader)

    # Dump all predictions to a file
    # TODO (DNGros): Is there an automatic way in allennlp to do this??
    pred_toks = []
    with open("pred.txt", "w") as outfile:
        for instance in tqdm(validation_dataset):
            pred = predictor.predict_instance(instance)
            toks = pred['predicted_tokens']
            if toks:
                outfile.write(" ".join(toks[0]) + "\n")
            else:
                outfile.write("" + "\n")
#### Here we indicate that we want to sort the instances by the number of tokens in the sentence field.
iterator = BucketIterator(batch_size=2,
                          sorting_keys=[("sentence", "num_tokens")])
#### We also specify that the iterator should make sure its instances are indexed using our vocabulary;
#### that is, that their strings have been converted to integers using the mapping we previously created.
iterator.index_with(vocab)

#### Now we instantiate our <code>Trainer</code> and run it.
#### Here we tell it to run for 1000 epochs and to stop training early
#### if it ever spends 10 epochs without the validation metric improving.
#### The default validation metric is loss (which improves by getting smaller),
#### but it's also possible to specify a different metric and direction (e.g. accuracy should get bigger).
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)

#### When we launch it it will print a progress bar for each epoch
#### that includes both the "loss" and the "accuracy" metric.
#### If our model is good, the loss should go down and the accuracy up as we train.
trainer.train()

#### As in the original PyTorch tutorial, we'd like to look at the predictions our model generates.
#### AllenNLP contains a <code>Predictor</code> abstraction that takes inputs,
#### converts them to instances, feeds them through your model,
#### and returns JSON-serializable results. Often you'd need to implement your own Predictor,
#### but AllenNLP already has a <code>SentenceTaggerPredictor</code> that works perfectly here, so we can use it.
#### It requires our model (for making predictions) and a dataset reader (for creating instances).
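#### A minimal usage sketch of that predictor, mirroring the tagger example later in this section
#### (it assumes the tutorial's LstmTagger model and PosDatasetReader reader are already built):
from allennlp.predictors import SentenceTaggerPredictor
import numpy as np

predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])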
def run(args):
    ALL_DATASET_PATHS = get_all_dataset_paths(args.dataset_paths_file,
                                              args.dataset_path_prefix)
    SELECTED_TASK_NAMES = args.task
    PROJECTION_DIM = args.proj_dim
    HIDDEN_DIM = args.hidden_dim
    # BIDIRECTIONAL=True
    # INTERMEDIATE_INPUT=2*HIDDEN_DIM if BIDIRECTIONAL else HIDDEN_DIM
    DROPOUT = args.dropout
    LR = args.lr
    WEIGHT_DECAY = args.weight_decay
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.epochs
    PATIENCE = args.patience
    SERIALIZATION_DIR = args.model_dir
    CLEAN_MODEL_DIR = args.clean_model_dir
    CUDA_DEVICE = cuda_device(args.cuda)
    TEST_MODE = args.test_mode
    # device = torch.device(f"cuda:{CUDA_DEVICE}" if torch.cuda.is_available() and args.cuda else "cpu")

    TASKS = [TASK_CONFIGS[task_name] for task_name in SELECTED_TASK_NAMES]
    dataset_paths = {
        task_name: ALL_DATASET_PATHS[task_name]
        for task_name in SELECTED_TASK_NAMES
    }

    tag_namespace_hashing_fn = {
        tag_namespace: i
        for i, tag_namespace in enumerate(TASK_CONFIGS.keys())
    }.get

    elmo_token_indexer = ELMoTokenCharactersIndexer()
    token_indexers = {"tokens": elmo_token_indexer}
    readers = {}
    for task in TASKS:
        if task.task_type in TAGGING_TASKS:
            readers[task.tag_namespace] = ConLLDatasetReader(
                task.tag_namespace,
                token_indexers=token_indexers,
                tag_namespace_hashing_fn=tag_namespace_hashing_fn,
                lazy=True)
        elif task.task_type in CLASSIFICATION_TASKS:
            readers[task.tag_namespace] = JSONDatasetReader(
                task.tag_namespace,
                token_indexers=token_indexers,
                tag_namespace_hashing_fn=tag_namespace_hashing_fn,
                lazy=True)
        else:
            raise NotImplementedError(
                f"task_type={task.task_type} not yet supported.")

    elmo_embedder = ElmoTokenEmbedder(
        options_file,
        weight_file,
        requires_grad=False,
        dropout=DROPOUT,
        projection_dim=PROJECTION_DIM,
    )
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=3)

    # Pass in the ElmoTokenEmbedder instance instead
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
    ELMO_EMBEDDING_DIM = elmo_embedder.get_output_dim()

    # POS -> CHUNK -> NER
    task_suffixes = set(
        [task_name.rsplit("_", 1)[-1] for task_name in SELECTED_TASK_NAMES])
    encoders = get_task_encoder_dict(args, task_suffixes, ELMO_EMBEDDING_DIM)

    if not TEST_MODE:
        train_dataset = read_datasets(dataset_paths,
                                      readers,
                                      data_split="train")
        validation_dataset = read_datasets(dataset_paths,
                                           readers,
                                           data_split="dev")
        vocab = create_classification_tagging_vocab(
            [train_dataset, validation_dataset])

        # Special case for CCG
        if "ccg" in task_suffixes or "pos" in task_suffixes:
            for task in TASKS:
                if task.task_type == "ccg":
                    for tag in ["B-NOUN.SHAPE", "I-NOUN.PROCESS"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
                if task.tag_namespace == "ud_pos":
                    for tag in ["CONJ"]:
                        vocab.add_token_to_namespace(tag, task.tag_namespace)
    else:
        vocab = Vocabulary.from_files(
            os.path.join(SERIALIZATION_DIR, "vocabulary"))

    # encoder = PassThroughEncoder(ELMO_EMBEDDING_DIM)
    model = MultiTaskCRFTaggerAndClassifier(word_embeddings, encoders, vocab,
                                            TASKS)
    model = model.cuda(device=CUDA_DEVICE)

    if not TEST_MODE:
        iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                  batch_size=BATCH_SIZE,
                                                  cache_instances=True)
        iterator.index_with(vocab)

        if CLEAN_MODEL_DIR:
            if os.path.exists(SERIALIZATION_DIR):
                logger.info(f"Deleting {SERIALIZATION_DIR}")
                shutil.rmtree(SERIALIZATION_DIR)
            logger.info(f"Creating {SERIALIZATION_DIR}")
            os.makedirs(SERIALIZATION_DIR)

        logger.info(
            f"Writing arguments to arguments.json in {SERIALIZATION_DIR}")
        with open(os.path.join(SERIALIZATION_DIR, "arguments.json"),
                  "w+") as fp:
            json.dump(vars(args), fp, indent=2)

        logger.info(f"Writing vocabulary in {SERIALIZATION_DIR}")
        vocab.save_to_files(os.path.join(SERIALIZATION_DIR, "vocabulary"))
        # Use list to ensure each epoch is a full pass through the data
        combined_training_dataset = list(
            roundrobin_iterator(*train_dataset.values()))
        combined_validation_dataset = list(
            roundrobin_iterator(*validation_dataset.values()))

        # optimizer = optim.ASGD(model.parameters(), lr=0.01, t0=100, weight_decay=0.1)
        optimizer = optim.Adam(model.parameters(),
                               lr=LR,
                               weight_decay=WEIGHT_DECAY)

        training_stats = []
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=combined_training_dataset,
            validation_dataset=combined_validation_dataset,
            patience=PATIENCE,
            num_epochs=NUM_EPOCHS,
            cuda_device=CUDA_DEVICE,
            serialization_dir=SERIALIZATION_DIR,
            # model_save_interval=600
        )
        stats = trainer.train()
        training_stats.append(stats)

        with open(os.path.join(SERIALIZATION_DIR, "training_stats.json"),
                  "w+") as fp:
            json.dump(training_stats, fp, indent=2)
    else:
        model.load_state_dict(
            torch.load(os.path.join(SERIALIZATION_DIR, "best.th")))
        model = model.cuda(device=CUDA_DEVICE)

    # Empty cache to ensure larger batch can be loaded for testing
    torch.cuda.empty_cache()
    # Also garbage collect
    gc.collect()

    test_filepaths = {
        task.tag_namespace: dataset_paths[task.tag_namespace]["test"]
        for task in TASKS
    }

    logger.info("Evaluating on test data")

    test_iterator = CustomHomogeneousBatchIterator(partition_key="dataset",
                                                   batch_size=BATCH_SIZE * 2)
    test_iterator.index_with(vocab)
    model = model.eval()
    test_stats = evaluate_multiple_data(model,
                                        readers,
                                        test_iterator,
                                        test_filepaths,
                                        cuda_device=CUDA_DEVICE)
    with open(os.path.join(SERIALIZATION_DIR, "test_stats.json"), "w+") as fp:
        json.dump(test_stats, fp, indent=2)
def train(train_data_path,
          validation_data_path,
          embedding_dim,
          hidden_dim,
          learning_rate=0.1,
          batch_size=2,
          num_epochs=100,
          save_dir="/tmp"):
    _train_data_path = cached_path(train_data_path)
    _validation_data_path = cached_path(validation_data_path)

    reader = PosDatasetReader()
    train_dataset = reader.read(_train_data_path)
    validation_dataset = reader.read(_validation_data_path)
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=embedding_dim)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=num_epochs,
                      cuda_device=cuda_device)
    metrics = trainer.train()
    for m in metrics:
        if m.startswith("validation"):
            print("{}={}".format(m, metrics[m]))

    predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
    tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
    tag_ids = np.argmax(tag_logits, axis=-1)
    print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

    # Here's how to save the model.
    model_path = os.path.join(save_dir, "model.th")
    vocab_path = os.path.join(save_dir, "vocabulary")
    with open(model_path, 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files(vocab_path)

    # And here's how to reload the model.
    vocab2 = Vocabulary.from_files(vocab_path)
    model2 = LstmTagger(word_embeddings, lstm, vocab2)
    with open(model_path, 'rb') as f:
        model2.load_state_dict(torch.load(f))
    if cuda_device > -1:
        model2.cuda(cuda_device)

    predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
    tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']
    np.testing.assert_array_almost_equal(tag_logits2, tag_logits)
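The LstmTagger model used by train() above comes from the AllenNLP part-of-speech tagging tutorial and is not shown in this listing. Below is a minimal sketch under that assumption (a later example also passes a loss_params argument, which this sketch omits):

# Sketch of the tutorial-style LstmTagger assumed by the train() function above.
import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder
from allennlp.modules.text_field_embedders import TextFieldEmbedder
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy


class LstmTagger(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        # One logit per token per tag label.
        self.hidden2tag = torch.nn.Linear(encoder.get_output_dim(),
                                          vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

    def forward(self, sentence, labels=None):
        mask = get_text_field_mask(sentence)
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {'tag_logits': tag_logits}
        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output['loss'] = sequence_cross_entropy_with_logits(
                tag_logits, labels, mask)
        return output

    def get_metrics(self, reset: bool = False):
        return {'accuracy': self.accuracy.get_metric(reset)}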
Example #8
                           eps=1e-9)
    scheduler = NoamLR(optimizer=optimizer,
                       model_size=HIDDEN_DIM,
                       warmup_steps=WARMUP_STEPS,
                       factor=1)

    iterator = BucketIterator(batch_size=BATCH_SIZE,
                              sorting_keys=[("sentence", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      learning_rate_scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=PATIENCE,
                      num_epochs=EPOCH,
                      cuda_device=cuda_device)

    trainer.train()

    # Here's how to save the model.
    with open("model.th", 'wb') as f:
        torch.save(model.state_dict(), f)
    vocab.save_to_files("vocabulary")

    # # And here's how to reload the model.
    # vocab2 = Vocabulary.from_files("vocabulary")
    # model2 = BiLSTMClassifier(word_embeddings, lstm, DROPOUT_RATE, vocab)
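    # Presumably the reload would finish the same way as the earlier tagger
    # example (an assumption; the original snippet stops at this point):
    # with open("model.th", 'rb') as f:
    #     model2.load_state_dict(torch.load(f))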
# Just using ToyDatasetReader to build the vocab
reader = ToyDatasetReader()
dataset = reader.read("")
vocab = Vocabulary.from_instances(dataset)

print(vocab.get_vocab_size())

reader = TestReader(vocab)
reader.set_compute_nnrank_features(False)
dataset = reader.read("")

opts = ModelOptions()
text_embedder = Text_Embedding(opts, vocab)
paper_embedder = Paper_Embedding()
embedder = EmbeddingModel(vocab, text_embedder, paper_embedder)

iterator = BasicIterator()
iterator.index_with(vocab)

optimizer = torch.optim.SGD(embedder.parameters(), lr=0.1)

trainer = Trainer(model=embedder,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=dataset,
                  validation_dataset=dataset,
                  patience=1000,
                  num_epochs=10,
                  summary_interval=2)

trainer.train()
Example #10
def EnhancedRCNN_train():

    print("enter train")
    with open(model_config.glove_file_path) as fp:
        text = fp.readlines()

    # How can this initial counter be built more elegantly?
    glove_lines = len(text)
    token_counts = {
        "tokens": {line.split(' ')[0]: glove_lines - idx + 2
                   for idx, line in enumerate(text)}
    }
    # print(list(token_counts.items())[:10])
    vocab = Vocabulary(counter=token_counts,
                       min_count={"tokens": 1},
                       # non_padded_namespaces=['tokens'],
                       pretrained_files={'tokens': model_config.glove_file_path},
                       only_include_pretrained_words=True)

    EMBEDDING_DIM = 300
    token_embedding = Embedding.from_params(
        vocab=vocab,
        params=Params({ 'trainable': False,
                        'pretrained_file': model_config.glove_file_path,
                        'embedding_dim': EMBEDDING_DIM,
                        'vocab_namespace': "tokens"})
    )

    print("GloVe loaded")
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    model = EnhancedRCNNModel(word_embeddings, model_config.num_class, vocab=vocab)

    if torch.cuda.is_available():
        cuda_device = list(range(torch.cuda.device_count()))

        model = model.cuda(cuda_device[0])
    else:
        cuda_device = -1
    print("cuda device : {}".format(cuda_device))

    reader = ListWiseDatasetReader(vocab=vocab)
    train_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_train.jsonl"))
    dev_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_dev.jsonl"))
    test_dataset = reader.read(os.path.join(model_config.snli_base_path, "snli_1.0_test.jsonl"))

    #fc_lr = 1e-3
    optimizer = torch.optim.SGD(model.parameters(), lr=model_config.learning_rate, momentum=0.9)
    '''
    optimizer = torch.optim.SGD([{'params': model.embedder.parameters()},
                                 {'params': model.fc1.parameters(), 'lr': fc_lr},
                                 {'params': model.fc2.parameters(), 'lr': fc_lr},
                                 {'params': model.proj_1.parameters(), 'lr': fc_lr},
                                 {'params': model.proj_2.parameters(), 'lr': fc_lr},
                                 {'params': model.bert_prediction.parameters(), 'lr': fc_lr},
                                 ], lr=model_config.learning_rate, momentum=0.9)
    '''
    #optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    iterator_train = BucketIterator(batch_size=model_config.batch_size,
                                    sorting_keys=[("left_input_tokens_field", "num_tokens"),
                                                  ("right_input_tokens_field", "num_tokens")])
    iterator_train.index_with(vocab)

    model.train()
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator_train,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=model_config.patience,
                      num_epochs=model_config.epochs,
                      cuda_device=cuda_device,
                      shuffle=True)
    train_start_time = time.time()
    trainer.train()
    train_end_time = time.time()

    # test
    model.eval()

    preds = []
    gd = []
    gd_pos = []

    with torch.no_grad():
        iterator_test = BucketIterator(batch_size = model_config.batch_size,
                                       sorting_keys=[("left_input_tokens_field", "num_tokens"),
                                                  ("right_input_tokens_field", "num_tokens")])
        iterator_test.index_with(vocab)
        generator_test = iterator_test(test_dataset, 1, False)
        test_start_time = time.time()
        for batch in generator_test:
            batch = move_to_device(batch, cuda_device[0])
            gd.extend(batch['label'].squeeze(-1).long().cpu().numpy().tolist())
            out_dict = model(batch['left_input_tokens_field'], batch['right_input_tokens_field'],
                             batch['label'])
            batch_pred = torch.argmax(out_dict['logits'], -1).cpu().numpy()
            preds.extend(batch_pred.tolist())

            sorted_batch, sorted_idx = torch.sort(out_dict['logits'], dim=-1, descending=True)
            label_mat = batch['label'].repeat(1, out_dict['logits'].shape[-1]).long().cuda()
            pos_mat = label_mat.eq(sorted_idx.cuda())
            pos_tensor = pos_mat.nonzero()[:, 1].cpu().numpy().tolist()

            gd_pos.extend(pos_tensor)
        test_end_time = time.time()

    print("p@1 : ", (np.sum(np.equal(gd, preds))) / len(gd))
    print("[train time] : {}".format(train_end_time - train_start_time))
    print("[test time] : {}".format(test_end_time - test_start_time))
    # First check whether the file exists; write it if not, otherwise skip
    save_path = os.path.join(root_path, model_config.save_path)
    if os.path.exists(save_path):
        print("save path already exists")
    else:
        pd = pandas.DataFrame({'gd': gd, 'preds': preds})
        pd.to_csv(save_path, index=False)
        print("save to path : {}".format(save_path))
Example #11
tokens = batch["tokens"]
labels = batch

mask = get_text_field_mask(tokens)
mask
embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)
class_logits

model(**batch)

loss = model(**batch)["loss"]

loss.backward()
optimizer = optim.Adam(model.parameters(), lr=config.lr)

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    cuda_device=0 if USE_GPU else -1,
    num_epochs=config.epochs,
)

metrics = trainer.train()

tagger = SentenceTaggerPredictor(model, reader)
tagger.predict("this tutorial was great!")
Example #12
    embed_training_reader.set_compute_nnrank_features(False)
    embed_training_data = embed_training_reader.read("")

    rank_training_reader = CiteomaticReader(df,
                                            idx_to_id_dict,
                                            ann,
                                            train_frac=train_frac,
                                            validation=False)
    rank_training_reader.set_compute_nnrank_features(True)
    rank_training_data = rank_training_reader.read("")

    embed_trainer = Trainer(
        model=embedder,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=embed_training_data,
        #validation_dataset=val_data,
        patience=10,
        num_epochs=1,
        shuffle=False,
        cuda_device=cuda_device)

    rank_trainer = Trainer(
        model=ranker,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=rank_training_data,
        #validation_dataset=val_data,
        patience=10,
        num_epochs=1,
        shuffle=False,
        cuda_device=cuda_device)
Example #13
def _build_trainer(config, model, vocab, train_data, valid_data):
    optimizer = optim.AdamW(model.parameters(), lr=config.trainer.lr)
    scheduler = None

    is_bert_based = any(
        model.name.endswith('bert') for model in config.embedder.models)
    is_trainable_elmo_based = any(
        model.name == 'elmo' and model.params['requires_grad']
        for model in config.embedder.models)

    if is_bert_based or is_trainable_elmo_based:

        def _is_pretrained_param(name):
            return 'transformer_model' in name or '_elmo_lstm' in name

        pretrained_params, non_pretrained_params = [], []
        for name, param in model.named_parameters():
            if _is_pretrained_param(name):
                logger.info('Pretrained param: %s', name)
                pretrained_params.append(param)
            else:
                logger.info('Non-pretrained param: %s', name)
                non_pretrained_params.append(param)

        optimizer = optim.AdamW([{
            'params': pretrained_params,
            'lr': config.trainer.bert_lr
        }, {
            'params': non_pretrained_params,
            'lr': config.trainer.lr
        }, {
            'params': []
        }])

        scheduler = SlantedTriangular(
            optimizer=optimizer,
            num_epochs=config.trainer.num_epochs,
            num_steps_per_epoch=len(train_data) / config.trainer.batch_size,
            cut_frac=config.trainer.cut_frac,
            gradual_unfreezing=config.trainer.gradual_unfreezing,
            discriminative_fine_tuning=config.trainer.
            discriminative_fine_tuning)

    logger.info('Trainable params:')
    for name, param in model.named_parameters():
        if param.requires_grad:
            logger.info('\t' + name)

    iterator = BucketIterator(batch_size=config.trainer.batch_size)
    iterator.index_with(vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
        logger.info('Using cuda')
    else:
        cuda_device = -1
        logger.info('Using cpu')

    logger.info('Example batch:')
    _log_batch(next(iterator(train_data)))

    if is_bert_based:
        train_data = _filter_data(train_data, vocab)
        valid_data = _filter_data(valid_data, vocab)

    return Trainer(model=model,
                   optimizer=optimizer,
                   iterator=iterator,
                   train_dataset=train_data,
                   validation_dataset=valid_data,
                   validation_metric='+MeanAcc',
                   patience=config.trainer.patience,
                   num_epochs=config.trainer.num_epochs,
                   cuda_device=cuda_device,
                   grad_clipping=5.,
                   learning_rate_scheduler=scheduler,
                   serialization_dir=os.path.join(config.data.models_dir,
                                                  config.model_name),
                   should_log_parameter_statistics=False,
                   should_log_learning_rate=False,
                   num_gradient_accumulation_steps=config.trainer.
                   num_gradient_accumulation_steps)
reader = ClassifierDatasetReader()
train_dataset = reader.read(cached_path(config.train_data_dir))
validation_dataset = reader.read(cached_path(config.dev_data_dir))
# The tokens namespace is indexed by the pretrained language model; the labels namespace uses this vocab
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

model = BertClassifier(
    vocab=vocab,
    bert_model=config.bert_model_dir,
)

if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
optimizer = optim.SGD(model.parameters(), lr=config.learning_rate)

iterator = BucketIterator(batch_size=config.train_batch_size)
iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=config.early_stop,
                  num_epochs=config.num_train_epochs,
                  cuda_device=cuda_device)
trainer.train()
                               out_features=vocab.get_vocab_size('labels'))

model = TargetLSTMClassifier(vocab, word_embeddings, text_lstm, target_lstm, feed_forward)

# Data iterator
sort_fields = [("text", "num_tokens"), ("target", "num_tokens")]
iterator = BucketIterator(batch_size=32, sorting_keys=sort_fields)
iterator.index_with(vocab)

# Model training
optimizer = optim.Adam(model.parameters())

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=40,
                  histogram_interval=100, should_log_learning_rate=True)

serialization_dir = '/tmp/anything100'
another_log = SummaryWriter(os.path.join(serialization_dir, "log", "embeddings"))
train_log = SummaryWriter(os.path.join(serialization_dir, "log", "train"))
validation_log = SummaryWriter(os.path.join(serialization_dir, "log", "validation"))


trainer._tensorboard = TensorboardWriter(train_log=train_log, validation_log=validation_log)

trainer.train()
# Project the learnt word embeddings
another_log.add_embedding(token_embedding.weight, metadata=token_names, 
Example #16
    # get a new model for each iteration.
    model, optimizer, cuda_device = get_model(pretrained_file, WORD_EMB_DIM,
                                              vocab, len(reader.alltags))

    iterator = BasicIterator(batch_size=batch_size)
    iterator.index_with(vocab)

    ser_dir_iter = serialization_dir + "/iter-{}".format(iteration)
    prepare_global_logging(ser_dir_iter, False)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        patience=10,
        num_epochs=25,  # FIXME: consider more iterations.
        validation_metric="+f1-measure-overall",
        cuda_device=cuda_device,
        num_serialized_models_to_keep=3,
        serialization_dir=ser_dir_iter)

    metrics = trainer.train()

    print("tagging training data...")
    for inst in tqdm(train_dataset):
        model.eval()
        output = model.forward_on_instance(inst)
        seq_len, num_tags = output["logits"].shape

        orig_tags = inst["metadata"]["orig_tags"]
def train(train_dataset, val_dataset, cfg):
    # Build the Vocabulary
    VOCAB_SIZE = cfg.w2v.vocab_size
    vocab = Vocabulary.from_instances(train_dataset + val_dataset,
                                      max_vocab_size=VOCAB_SIZE)

    BATCH_SIZE = cfg.training.batch_size

    # Iterator that produces padded mini-batches
    iterator = BucketIterator(batch_size=BATCH_SIZE,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # Use the pretrained Japanese Wikipedia entity vectors provided by Tohoku University
    # http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
    model_name = cfg.w2v.model_name
    norm = cfg.w2v.norm
    cwd = hydra.utils.get_original_cwd()
    params = Params({
        'embedding_dim': 200,
        'padding_index': 0,
        'pretrained_file': os.path.join(
            cwd, f'embs/jawiki.{model_name}_vectors.200d.txt'),
        'norm_type': norm
    })

    token_embedding = Embedding.from_params(vocab=vocab, params=params)
    HIDDEN_SIZE = cfg.model.hidden_size
    dropout = cfg.model.dropout

    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": token_embedding})
    encoder: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
        nn.LSTM(word_embeddings.get_output_dim(),
                HIDDEN_SIZE,
                bidirectional=True,
                batch_first=True))
    model = ClassifierWithAttn(word_embeddings, encoder, vocab, dropout)
    model.train()

    USE_GPU = True

    if USE_GPU and torch.cuda.is_available():
        model = model.cuda(0)

    LR = cfg.training.learning_rate
    EPOCHS = cfg.training.epoch
    patience = cfg.training.patience if cfg.training.patience > 0 else None

    optimizer = optim.Adam(model.parameters(), lr=LR)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=val_dataset,
                      patience=patience,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=EPOCHS)
    metrics = trainer.train()
    logger.info(metrics)

    return model, metrics
Example #18
#                         # {'params': model.text_field_embedder.token_embedder_tokens.bert_model.encoder.layer[8].parameters(), 'lr': 0.000855}
#                         ], lr=1e-4)
# Default
# optimizer = optim.SGD(model.parameters(), lr=0.001)

model = model.cuda()

print('Start training')

# Old trainer
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    validation_iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=5,
    validation_metric='-loss',
    num_epochs=cfg.num_epochs,
    cuda_device=[0, 1]
)

# New trainer
# trainer = Trainer(
#     model=model,
#     optimizer=optimizer,
#     iterator=iterator,
#     validation_iterator=iterator,
#     train_dataset=train_dataset,
#     validation_dataset=validation_dataset,
#     validation_metric='-loss',
dataset = reader.read("")
vocab = Vocabulary.from_instances(dataset)

print(vocab.get_vocab_size())

opts = ModelOptions()

reader = TestReader(vocab)
reader.set_compute_nnrank_features(True)
dataset = reader.read("")
text_embedder = Text_Embedding(opts, vocab)

nnrank = CitationRanker(vocab, opts, text_embedder)

iterator = BasicIterator()
iterator.index_with(vocab)

optimizer = torch.optim.SGD(nnrank.parameters(), lr=0.1)
move_optimizer_to_cuda(optimizer)

trainer = Trainer(model=nnrank,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=dataset,
                  validation_dataset=dataset,
                  patience=1000,
                  num_epochs=10,
                  summary_interval=2,
                  cuda_device=0)

trainer.train()
Example #20
def objective_kw(
    num_epochs=10,
    lr=0.1,
    lr_gamma=0.25,
    EMBEDDING_DIM=16,
    HIDDEN_DIM=6,
    DROPOUT=0.5,
    AUGMENT=True,
    weight_exponent=1.0,
):

    weights = label_counts.map(
        lambda x: x ** (-1 / (1 + weight_exponent))).loc[labels]
    weights = weights / ((label_counts * weights).mean() / label_counts.mean())

    loss_params = dict(alpha=weights.values, gamma=None)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM,
                      HIDDEN_DIM,
                      batch_first=True,
                      bidirectional=True,
                      dropout=DROPOUT))

    model = LstmTagger(word_embeddings, lstm, vocab, loss_params=loss_params)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    if AUGMENT:
        iterator = AdvancedBucketIterator(
            batch_size=2,
            sorting_keys=[("sentence", "num_tokens")],
            preprocess=partial(permute_token, frequency=0.2),
        )
        iterator.index_with(vocab)

        val_iterator = AdvancedBucketIterator(
            batch_size=2,
            sorting_keys=[("sentence", "num_tokens")],
        )
        val_iterator.index_with(vocab)
    else:
        val_iterator = iterator

    for _ in range(1):
        optimizer = optim.SGD(model.parameters(), lr=lr)

        learning_rate_scheduler = _PyTorchLearningRateSchedulerWrapper(
            MultiStepLR(optimizer, [10, 20, 40], gamma=lr_gamma,
                        last_epoch=-1))

        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            validation_iterator=val_iterator,
            train_dataset=datasets['train_no_punctuation'] + datasets['train'],
            validation_dataset=datasets['val'],
            patience=10,
            num_epochs=num_epochs,
            learning_rate_scheduler=learning_rate_scheduler,
            #                           model_save_interval=10,
            #                       serialization_dir=serialization_dir,
            #                       num_serialized_models_to_keep=10,
        )
        res = trainer.train()
        return 1 - res['validation_accuracy']  # res['validation_loss']
Example #21
def run_experiment(
    use_soft_targets, soft_target_path, embedding_type, rnn_type, hparams
):
    log = {}
    log["name"] = "{} {} {}".format(
        rnn_type, embedding_type, "soft_target" if use_soft_targets else "hard_target"
    )
    log["soft_target"] = soft_target_path if use_soft_targets else None

    vocab = Vocabulary().from_files(hparams["vocab_path"])
    if embedding_type == "Chord":
        # data reader
        reader = CpmDatasetReader()

        # chord embedder
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"),
            embedding_dim=hparams["chord_token_embedding_dim"],
        )
        chord_embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    elif embedding_type == "Note":
        # data reader
        note_tokenizer = NoteTokenizer()
        note_indexer = TokenCharactersIndexer(
            namespace="notes", min_padding_length=4, character_tokenizer=note_tokenizer
        )
        reader = CpmDatasetReader(
            token_indexers={"tokens": SingleIdTokenIndexer(), "notes": note_indexer}
        )

        # chord embedder
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"),
            embedding_dim=hparams["chord_token_embedding_dim"],
        )
        note_token_embedding = Embedding(
            vocab.get_vocab_size("notes"), hparams["note_embedding_dim"]
        )
        note_encoder = CnnEncoder(
            num_filters=hparams["cnn_encoder_num_filters"],
            ngram_filter_sizes=hparams["cnn_encoder_n_gram_filter_sizes"],
            embedding_dim=hparams["note_embedding_dim"],
            output_dim=hparams["note_level_embedding_dim"],
        )
        note_embedding = TokenCharactersEncoder(note_token_embedding, note_encoder)
        chord_embedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding, "notes": note_embedding}
        )
    else:
        raise ValueError("Unknown embedding type:", embedding_type)

    # read data
    train_dataset = reader.read(os.path.join(hparams["data_path"], "train.txt"))
    val_dataset = reader.read(os.path.join(hparams["data_path"], "val.txt"))
    test_dataset = reader.read(os.path.join(hparams["data_path"], "test.txt"))

    # contextualizer
    contextual_input_dim = chord_embedder.get_output_dim()
    if rnn_type == "RNN":
        contextualizer = PytorchSeq2SeqWrapper(
            torch.nn.RNN(
                contextual_input_dim,
                hparams["rnn_hidden_dim"],
                batch_first=True,
                bidirectional=False,
            )
        )
    elif rnn_type == "LSTM":
        contextualizer = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(
                contextual_input_dim,
                hparams["lstm_hidden_dim"],
                batch_first=True,
                bidirectional=False,
            )
        )
    elif rnn_type == "GRU":
        contextualizer = PytorchSeq2SeqWrapper(
            torch.nn.GRU(
                contextual_input_dim,
                hparams["gru_hidden_dim"],
                batch_first=True,
                bidirectional=False,
            )
        )
    else:
        raise ValueError("Unknown rnn type:", rnn_type)

    if use_soft_targets:
        vocab_size = vocab.get_vocab_size("tokens")
        soft_targets = Embedding(
            num_embeddings=vocab_size,
            embedding_dim=vocab_size,
            weight=torch.load(soft_target_path),
            trainable=False,
        )
    else:
        soft_targets = None

    iterator = BucketIterator(
        batch_size=hparams["batch_size"], sorting_keys=[("input_tokens", "num_tokens")]
    )
    iterator.index_with(vocab)

    batches_per_epoch = math.ceil(len(train_dataset) / hparams["batch_size"])

    model_hparams = {
        "dropout": None,
        "soft_targets": soft_targets,
        "T_initial": hparams["T_initial"],
        "decay_rate": hparams["decay_rate"],
        "batches_per_epoch": batches_per_epoch,
    }
    # chord progression model
    model = Cpm(vocab, chord_embedder, contextualizer, model_hparams)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
        print("GPU available.")
    else:
        cuda_device = -1

    optimizer = optim.Adam(model.parameters(), lr=hparams["lr"])

    ts = time.gmtime()
    saved_model_path = os.path.join(
        hparams["saved_model_path"], time.strftime("%Y-%m-%d %H-%M-%S", ts)
    )
    serialization_dir = os.path.join(saved_model_path, "checkpoints")

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=val_dataset,
        serialization_dir=serialization_dir,
        patience=hparams["patience"],
        num_epochs=hparams["num_epochs"],
        cuda_device=cuda_device,
    )
    trainer.train()
    saved_model_path = os.path.join(saved_model_path, "{}.th".format(log["name"]))
    torch.save(model.state_dict(), saved_model_path)

    predictor = Predictor(model=model, iterator=iterator, cuda_device=cuda_device)
    pred_metrics = predictor.predict(test_dataset)
    log["metrics"] = pred_metrics
    log["saved_mode_path"] = saved_model_path

    return log
def main():
    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience  (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--emb_size', type=int, default=256,
                        help='elmo embeddings size (default: 256)')
    parser.add_argument('--model_name', type=str, default='baseline',
                        help='model name (default: baseline)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    annotations = pd.read_csv('data/data/annotations_merged.csv')
    prompts = pd.read_csv('data/data/prompts_merged.csv')

    feature_dictionary = {}
    prompts_dictionary = {}

    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'], row['Comparator']]

    for index, row in annotations.iterrows():
        if row['PMCID'] not in feature_dictionary:
            feature_dictionary[row['PMCID']] = []
        feature_dictionary[row['PMCID']].append([row['Annotations'], row['Label']]
                                                + prompts_dictionary[row['PromptID']])

    train = []
    valid = []
    test = []

    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))

    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))

    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    elmo_token_indexer = {'elmo': ELMoTokenCharactersIndexer(), 'tokens': SingleIdTokenIndexer()}

    reader = EIDatasetReader(elmo_token_indexer, feature_dictionary)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    urls = [
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_options.json',
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_weights.hdf5'
    ]

    elmo_token_embedding = ElmoTokenEmbedder(urls[0], urls[1], dropout=args.dropout, requires_grad=args.tunable,
                                             projection_dim=args.emb_size)

    word_embeddings = BasicTextFieldEmbedder({'elmo': elmo_token_embedding}, allow_unmatched_keys=True)

    model = Baseline(word_embeddings, vocab)

    cuda_device = args.cuda_device

    if torch.cuda.is_available():
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('article', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
Example #23
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
optimizer = optim.Adam(model.parameters(), lr=0.001)

iterator = BucketIterator(batch_size=8, sorting_keys=[("target_tokens", "num_tokens")])
iterator.index_with(vocab)

serialization_dir = "./models_saved/SPNet/"

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=valid_dataset,
                  shuffle=True,
                  patience=5,
                  num_epochs=50,
                  summary_interval=100,  # to tensorboard
                  serialization_dir=serialization_dir,
                  num_serialized_models_to_keep=5,
                  grad_norm=2.0,
                  cuda_device=cuda_device)

print("The training starts, results will be serialized to dir", serialization_dir)
trainer.train()




Example #24
def main():

    trainFile = "../srcData/trainData.csv"
    validFile = "../srcData/devData.csv"
    testFile = "../srcData/testData.csv"
    trainSeq2SeqFile = data.dataPreparation(trainFile)
    validSeq2SeqFile = data.dataPreparation(validFile)
    testSeq2SeqFile = data.dataPreparation(testFile)
    print(testSeq2SeqFile)
    #TokenIndexer Determines how string tokens gets represented as arrays of indexes in a model
    #SingleIdTokenIndexer = Tokens are single integers
    #TokenCharactersIndexer = Tokens as a list of integers
    # Read a tsvfile with paired instances (source, target)
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),  # Defaults to source_tokenizer
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer()}  # Defaults to source_token_indexers
    )

    # Each of the dataset is a list of each tokens (source_tokens, target_tokens)
    train_dataset = reader.read(trainSeq2SeqFile)
    validation_dataset = reader.read(validSeq2SeqFile)
    test_dataset = reader.read(testSeq2SeqFile)

    # Finding extra fact2 vocab
    trainExtraVocab = findExtraVocab(train_dataset)
    validExtraVocab = findExtraVocab(validation_dataset)
    testExtraVocab = findExtraVocab(test_dataset)
    finalExtraVocab = list(set(trainExtraVocab+validExtraVocab+testExtraVocab))
    print("length:",len(finalExtraVocab))
    #input()

    #vocab = Vocabulary.from_instances(train_dataset + validation_dataset, min_count={'tokens': 3, 'target_tokens': 3})
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset + test_dataset)
    # Train + Valid = 9703
    # Train + Valid + Test = 10099


    print("Vocab size:", vocab.get_vocab_size('tokens'))

    encEmbedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=ENC_EMBEDDING_DIM)

    # Embedding for tokens since in the dataset creation time it is mentioned tokens
    source_embedder = BasicTextFieldEmbedder({"tokens": encEmbedding})

    encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(ENC_EMBEDDING_DIM,HIDDEN_DIM,batch_first=True,dropout=0.2))


    attention = DotProductAttention()

    max_decoding_steps = 4  # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=TGT_EMBEDDING_DIM,
                          # target_namespace='target_tokens',
                          attention=attention,
                          beam_size=beamSize,
                          use_bleu=True,
                          extra_vocab=finalExtraVocab)
    #Can also specify lr=0.001
    optimizer = optim.Adam(model.parameters())

    # Data iterator that specifies how to batch our dataset:
    # it shuffles the data and creates fixed-size batches.
    # iterator = BasicIterator(batch_size=2)
    # iterator.index_with(vocab)
    # BucketIterator pads each batch to the longest input in that batch and sorts
    # instances by the given field names and padding keys for efficient computation.
    iterator = BucketIterator(batch_size=50, sorting_keys=[("source_tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      # patience=3,
                      num_epochs=numEpochs,
                      cuda_device=CUDA_DEVICE)

    trainer.train()
    predictor = SimpleSeq2SeqPredictor(model, reader)

    '''for i in range(2):
        print ("Epoch: {}".format(i))
        trainer.train()

        predictor = SimpleSeq2SeqPredictor(model, reader)


        for instance in itertools.islice(validation_dataset, 10):
            print('SOURCE:', instance.fields['source_tokens'].tokens)
            print('GOLD:', instance.fields['target_tokens'].tokens)
            print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
            """'{'predictions': [[1, 4, 5, 92, 8, 6, 1, 8, 6, 26, 3]], 
             'loss': 5.9835076332092285,
             'class_log_probabilities': [-20.10894012451172],
             'predicted_tokens': ['@@UNKNOWN@@', 'is', 'a', 'type', 'of', 'the', '@@UNKNOWN@@', 'of', 'the', 'sun']}
             """
            print (predictor.predict_instance(instance))
    '''

    outFile = open("output_"+str(HIDDEN_DIM)+"_"+str(numEpochs)+"_"+str(beamSize)+".csv","w")
    writer = csv.writer(outFile,delimiter="\t")
    for instance in itertools.islice(test_dataset,500):
        src = instance.fields['source_tokens'].tokens
        gold = instance.fields['target_tokens'].tokens
        pred = predictor.predict_instance(instance)['predicted_tokens']
        writer.writerow([src,gold,pred])


    outFile.close()
        "mode": "max",
        "factor": 0.5,
        "patience": 5
    })
    lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_params)
    iterator = BasicIterator(batch_size=64)
    iterator.index_with(vocab)

    for (l, train_dataset, validation_dataset, n_classes, num_epochs,
         patience) in [(discourse_dict, discourse_train_dataset,
                        discourse_validation_dataset, 5, 20, 2),
                       (claim_dict, claim_train_dataset,
                        claim_validation_dataset, 2, 20, 2),
                       (discourse_dict, discourse_train_dataset,
                        claim_validation_dataset, 5, 10, 2),
                       (claim_dict, claim_train_dataset,
                        claim_validation_dataset, 2, 20, 2)]:
        model.vocab._token_to_index['labels'] = l
        model.num_classes = n_classes
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          learning_rate_scheduler=lr_scheduler,
                          iterator=iterator,
                          train_dataset=train_dataset,
                          validation_dataset=validation_dataset,
                          patience=patience,
                          num_epochs=num_epochs,
                          cuda_device=0)
        trainer.train()
    # save the trained weights
    torch.save(model.state_dict(), './model_alternate_training_crf.th')
def main():
    # Initializing the embeddings (BERT)
    bert_token_indexer = PretrainedBertIndexer(
        pretrained_model="./biobert_pubmed/vocab.txt",
        max_pieces=config.max_seq_len,
        do_lowercase=True,
    )
    reader = BertAnalogyDatasetReader(
        tokenizer=bert_tokenizer,
        token_indexers={'tokens': bert_token_indexer})

    train_dataset, test_dataset, dev_dataset = (
        reader.read(DATA_ROOT + "/" + fname)
        for fname in ["train_all.txt", "test_all.txt", "val_all.txt"])

    vocab = Vocabulary.from_instances(train_dataset + test_dataset +
                                      dev_dataset)

    bert_embedder = PretrainedBertEmbedder(
        pretrained_model='biobert_pubmed',
        top_layer_only=True,  # conserve memory
    )
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": bert_embedder},
        # the indexer also produces offset/mask keys that this embedder ignores, so allow unmatched keys
        allow_unmatched_keys=True)

    BERT_DIM = word_embeddings.get_output_dim()

    class BertSentencePooler(Seq2VecEncoder):
        def forward(self,
                    embs: torch.Tensor,
                    mask: torch.Tensor = None) -> torch.Tensor:
            # use the first ([CLS]) token embedding as the sentence representation
            return embs[:, 0]

        @overrides
        def get_output_dim(self) -> int:
            return BERT_DIM

    # Initializing the model
    # BertSentencePooler uses the [CLS] (first) token embedding as the single sentence-level output
    bert_encoder = BertSentencePooler(vocab)

    model = LstmModel(word_embeddings, bert_encoder, vocab)
    if USE_GPU:
        model.cuda()

    # Training the model
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      cuda_device=0 if USE_GPU else -1,
                      num_epochs=20)

    trainer.train()

    #Saving the model
    with open("biobert/model.th", 'wb') as f:
        torch.save(model.state_dict(), f)

    vocab.save_to_files("biobert/vocabulary")
    return vocab
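For completeness, here is a hedged reload sketch (not part of the original snippet): it assumes the same word_embeddings and bert_encoder objects have been rebuilt as above, and simply reverses the save step.

# Hedged reload sketch; assumes word_embeddings and bert_encoder are rebuilt as above.
vocab = Vocabulary.from_files("biobert/vocabulary")
model = LstmModel(word_embeddings, bert_encoder, vocab)
with open("biobert/model.th", 'rb') as f:
    model.load_state_dict(torch.load(f))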
Exemple #27
def train_only_lee():
    # This is WORKING!
    # load the dataset reader
    # Save logging to a local file
    # Multitasking
    log.getLogger().addHandler(log.FileHandler(directory+"/log.log"))

    lr = 0.00001
    batch_size = 2
    epochs = 100
    max_seq_len = 512
    max_span_width = 30
    #token_indexer = BertIndexer(pretrained_model="bert-base-uncased", max_pieces=max_seq_len, do_lowercase=True,)
    token_indexer = PretrainedBertIndexer("bert-base-cased", do_lowercase=False)
    reader = ConllCorefBertReader(max_span_width = max_span_width, token_indexers = {"tokens": token_indexer})

    EMBEDDING_DIM = 1024
    HIDDEN_DIM = 200
    processed_reader_dir = Path(directory+"processed/")
    
    train_ds = None
    if processed_reader_dir.is_dir():
        print("Loading indexed from checkpoints")
        train_path =  Path(directory +"processed/train_d")
        if train_path.exists():
            train_ds = pickle.load(open(directory + "processed/conll/train_d", "rb"))
            val_ds =  pickle.load(open(directory + "processed/conll/val_d", "rb"))
            test_ds = pickle.load(open(directory + "processed/conll/test_d", "rb"))
        else:
            print("checkpoints not found")
            train_ds, val_ds, test_ds = (reader.read(dataset_folder + fname) for fname in ["train.english.v4_gold_conll", "dev.english.v4_gold_conll", "test.english.v4_gold_conll"])
            pickle.dump(train_ds,open(directory + "processed/train_d", "wb"))
            pickle.dump(val_ds,open(directory + "processed/val_d", "wb"))
            pickle.dump(test_ds,open(directory + "processed/test_d", "wb"))
            print("saved checkpoints")
    # restore checkpoint here

    #vocab = Vocabulary.from_instances(train_ds + val_ds)
    vocab = Vocabulary()
    iterator = BasicIterator(batch_size=batch_size)
    iterator.index_with(vocab)

    val_iterator = BasicIterator(batch_size=batch_size)
    val_iterator.index_with(vocab)
    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

    bert_embedder = PretrainedBertEmbedder(
             pretrained_model="bert-base-cased",
             top_layer_only=True, # conserve memory
             requires_grad=True
     )
    # Here, allow_unmatched_keys=True since we don't pass in offsets:
    # we embed the BERT-tokenized wordpieces directly, not necessarily the
    # original tokens.
    # See the documentation for offsets here for more info:
    # https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/bert_token_embedder.py
    word_embedding = BasicTextFieldEmbedder({"tokens": bert_embedder}, allow_unmatched_keys=True)
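    # A hedged alternative sketch (an assumption, not used below): to align the
    # embeddings with the original tokens rather than raw wordpieces, the indexer's
    # offsets can be routed to the embedder via embedder_to_indexer_map, e.g.
    # word_embedding = BasicTextFieldEmbedder(
    #     {"tokens": bert_embedder},
    #     embedder_to_indexer_map={"tokens": ["tokens", "tokens-offsets"]},
    #     allow_unmatched_keys=True)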
    BERT_DIM = word_embedding.get_output_dim()
    # at each batch, sample from the two tasks and feed the shared LSTM
    shared_layer = torch.nn.LSTM(BERT_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True)
    seq2seq = PytorchSeq2SeqWrapper(shared_layer)
    mention_feedforward = FeedForward(input_dim = 2336, num_layers = 2, hidden_dims = 150, activations = torch.nn.ReLU())
    antecedent_feedforward = FeedForward(input_dim = 7776, num_layers = 2, hidden_dims = 150, activations = torch.nn.ReLU())

    model = CoreferenceResolver(vocab=vocab,
                                text_field_embedder=word_embedding,
                                context_layer=seq2seq,
                                mention_feedforward=mention_feedforward,
                                antecedent_feedforward=antecedent_feedforward,
                                feature_size=768,
                                max_span_width=max_span_width,
                                spans_per_word=0.4,
                                max_antecedents=250,
                                lexical_dropout=0.2)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # and then we can do the shared loss
    USE_GPU = 0
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        validation_iterator = val_iterator, 
        train_dataset=train_ds,
        validation_dataset = val_ds, 
        validation_metric = "+coref_f1",
        cuda_device=0 if USE_GPU else -1,
        serialization_dir= directory + "saved_models/only_lee",
        num_epochs=epochs,
    )    

    metrics = trainer.train()
    # save the model
    with open(directory + "saved_models/current_run_model_state", 'wb') as f:
        torch.save(model.state_dict(), f)
Exemple #28
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True) # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load pre-trained word vectors (the URL below points to fastText crawl vectors)
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (its been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # cuDNN RNNs cannot run backward in eval mode, so keep the model in train mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model) # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # cuDNN RNNs cannot run backward in eval mode

    # initialize triggers that are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens
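    # Illustrative addition (not in the original snippet): show the initial trigger
    # as words by mapping the ids back through the vocabulary.
    print([vocab.get_token_from_index(t) for t in trigger_token_ids])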

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True), group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # cuDNN RNNs cannot run backward in eval mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    iterator = BasicIterator(batch_size=64)
    iterator.index_with(vocab)

    model.cuda(0)

    print('Start training 1:')
    # unfreeze top layers and train
    for param in list(model.parameters())[:-4]:
        param.requires_grad = False
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      validation_iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=3,
                      num_epochs=100,
                      cuda_device=0)
    trainer.train()

    print('Start training 2:')
    # unfreeze most layers and continue training
    for param in list(model.parameters())[1:]:
        param.requires_grad = True
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      validation_iterator=iterator,
                      train_dataset=train_dataset,
                      # the original snippet is truncated here; the remaining
                      # arguments are assumed to mirror training stage 1
                      validation_dataset=validation_dataset,
                      patience=3,
                      num_epochs=100,
                      cuda_device=0)
    trainer.train()
Exemple #30
def train_cnn(train_dataset,
              batch_size,
              num_filters,
              filter_sizes,
              epochs=15,
              learning_rate=3e-4,
              num_classes=2,
              use_gpu=False):
    """
    Trains CNN on train_dataset; uses pre-trained ELMo model to dynamically compute embeddings.
    The CNN has one convolution layer for each ngram filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of 'filters' learned by that layer
    filter_sizes: Tuple[int]
        ngram sizes of the convolutional filters (one convolution layer per size)
    epochs: int
        total number of epochs to train on (default=15)
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary
    """
    vocab = Vocabulary()
    word_embeddings: TextFieldEmbedder = load_elmo_embeddings()

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)  # numericalize the data

    # CNN encoder
    encoder: Seq2VecEncoder = CnnEncoder(
        embedding_dim=word_embeddings.get_output_dim(),
        num_filters=num_filters,
        ngram_filter_sizes=filter_sizes)

    # Feedforward:
    classifier_feedforward: nn.Module = nn.Linear(encoder.get_output_dim(),
                                                  num_classes)

    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab
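As a usage note, here is a minimal, hedged sketch of calling train_cnn; my_reader, the data path, and the hyperparameter values are placeholders rather than values taken from the original snippet.

# Hypothetical usage; my_reader and the data path are placeholders.
train_instances = my_reader.read("data/train.txt")
cnn_model, cnn_vocab = train_cnn(train_instances,
                                 batch_size=32,
                                 num_filters=100,
                                 filter_sizes=(2, 3, 4, 5),
                                 epochs=15,
                                 use_gpu=torch.cuda.is_available())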