Example 1
def train_detector(args, detector, vocab, trapdoor_train, trapdoor_dev=None):
    iterator = BucketIterator(batch_size=args.detector_batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  detector.parameters()),
                           lr=args.lr_detector)

    train_counts = np.unique([a['label'].label for a in trapdoor_train],
                             return_counts=True)[1]
    print("Distribution of detector train", train_counts)
    if trapdoor_dev is not None:
        dev_counts = np.unique([a['label'].label for a in trapdoor_dev],
                               return_counts=True)[1]
        print("Distribution of detector dev", dev_counts)

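    # Weight each class by max_count / class_count so rarer labels contribute more to the detector's loss.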
    class_weight = torch.from_numpy(np.max(train_counts) /
                                    train_counts).cuda().float()
    detector.set_class_weight(class_weight)

    trainer = Trainer(model=detector,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=trapdoor_train,
                      validation_dataset=trapdoor_dev,
                      num_epochs=args.detector_epochs,
                      patience=args.detector_patience,
                      cuda_device=0)
    trainer.train()
Example 2
 def train_and_save():
     model = LstmTagger(word_embeddings, lstm, vocab)
     if cuda_device >= 0:
         model = model.cuda(cuda_device)
     optimizer = optim.SGD(model.parameters(), lr=0.1)
     iterator = BucketIterator(batch_size=2,
                               sorting_keys=[("sentence", "num_tokens")])
     iterator.index_with(vocab)
     trainer = Trainer(model=model,
                       optimizer=optimizer,
                       iterator=iterator,
                       train_dataset=train_dataset,
                       validation_dataset=validation_dataset,
                       patience=10,
                       num_epochs=500,
                       cuda_device=cuda_device)
     trainer.train()
     predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
     tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
     tag_ids = np.argmax(tag_logits, axis=-1)
     print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])
     # Here's how to save the model.
     with open("./tmp/model.th", 'wb') as f:
         torch.save(model.state_dict(), f)
     vocab.save_to_files("./tmp/vocabulary")
     return tag_logits
Example 3
    def test_forward(self):
        lr = 0.5
        batch_size = 16
        embedding_dim = 50

        squad_reader = SquadReader()
        # Read SQuAD train set (use the test set, since it's smaller)
        train_dataset = squad_reader.read(self.squad_test)
        vocab = Vocabulary.from_instances(train_dataset)

        # Random embeddings for test
        test_embed_matrix = torch.rand(vocab.get_vocab_size(), embedding_dim)
        test_cbow = CBOW(test_embed_matrix)
        optimizer = optim.Adadelta(filter(lambda p: p.requires_grad,
                                          test_cbow.parameters()),
                                   lr=lr)

        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("passage", "num_tokens"),
                                                ("question", "num_tokens")])
        iterator.index_with(vocab)
        for batch in iterator(train_dataset, num_epochs=1):
            passage = batch["passage"]["tokens"]
            question = batch["question"]["tokens"]
            span_start = batch["span_start"]
            span_end = batch["span_end"]
            output_dict = test_cbow(passage, question)
            softmax_start_logits = output_dict["softmax_start_logits"]
            softmax_end_logits = output_dict["softmax_end_logits"]
            loss = nll_loss(softmax_start_logits, span_start.view(-1))
            loss += nll_loss(softmax_end_logits, span_end.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
Example 4
def main():
    reader = TatoebaSentenceReader()
    train_set = reader.read('data/mt/sentences.top10langs.train.tsv')
    dev_set = reader.read('data/mt/sentences.top10langs.dev.tsv')

    vocab = Vocabulary.from_instances(train_set,
                                      min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    positive_label = vocab.get_token_index('eng', namespace='labels')
    model = LstmClassifier(word_embeddings, encoder, vocab, positive_label=positive_label)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_set,
                      validation_dataset=dev_set,
                      num_epochs=3)

    trainer.train()
Example 5
def train_model(parameters, name):
    token_indexer = {
        "tokens": ELMoTokenCharactersIndexer()
    } if parameters['use_elmo'] else None
    reader = SSJ500KReader(
        token_indexer) if parameters["dataset"] == "ssj" else SentiCorefReader(
            token_indexer)
    train_dataset = reader.read("train")
    validation_dataset = reader.read("test")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
    # vocab = Vocabulary() if parameters['use_elmo'] else Vocabulary.from_instances(train_dataset + validation_dataset)
    model = get_model(vocab, parameters)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    optimizer = optim.Adam(model.parameters(),
                           lr=parameters['lr'],
                           weight_decay=parameters['weight_decay'])
    iterator = BucketIterator(batch_size=parameters['batch_size'],
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=parameters['patience'],
                      num_epochs=parameters['num_epochs'],
                      cuda_device=cuda_device)
    trainer.train()
    metrics = evaluate(model, validation_dataset, iterator, cuda_device, None)
    save_model_and_vocab(model, vocab, metrics, parameters, fname=name)
Example 6
def main():
    reader = LinzenDatasetReader(append_null=False)
    vocab = Vocabulary.from_files("saved_models/vocabulary")

    stack = StackRNNAgreementPredictor(vocab,
                                       rnn_dim=100,
                                       rnn_cell_type=torch.nn.GRUCell)
    stack.load_state_dict(torch.load("saved_models/stack-linzen.th"))

    lstm = SimpleRNNAgreementPredictor(vocab,
                                       rnn_dim=18,
                                       rnn_type=torch.nn.GRU)
    lstm.load_state_dict(torch.load("saved_models/lstm-linzen.th"))

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    dataset = reader.read("StackNN/data/linzen/rnn_agr_simple/numpred.test")
    stack_metrics = evaluate(stack, dataset, iterator, -1, "")
    lstm_metrics = evaluate(lstm, dataset, iterator, -1, "")
    print(stack_metrics)
    print(lstm_metrics)

    for i in range(6):
        dataset = reader.read(
            "StackNN/data/linzen/rnn_agr_simple/numpred.test." + str(i))
        stack_metrics = evaluate(stack, dataset, iterator, -1, "")
        lstm_metrics = evaluate(lstm, dataset, iterator, -1, "")
        print(stack_metrics)
        print(lstm_metrics)
Example 7
def train_model(args,
                model,
                vocab,
                train_data,
                dev_data=None,
                epochs=None,
                weight_balance=True):
    if weight_balance:
        train_counts = np.unique([a['label'].label for a in train_data],
                                 return_counts=True)[1]
        class_weight = torch.from_numpy(np.max(train_counts) /
                                        train_counts).cuda().float()
        model.set_class_weight(class_weight)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      num_epochs=epochs if epochs else args.epochs,
                      patience=args.patience,
                      cuda_device=0)
    trainer.train()
Example 8
def evaluate(model: Model, reader: readers.BaseReader,
             test_data: List[Instance]) -> None:
    visualise_model(model)

    vocab = Vocabulary.from_instances(test_data)
    iterator = BucketIterator(batch_size=ARGS.BATCH_SIZE,
                              sorting_keys=reader.keys)
    # Our data should be indexed using the vocabulary we learned.
    iterator.index_with(vocab)

    data_types = split_list(test_data)
    results: Dict[str, Tuple[int, float]] = {}

    model.eval()

    print()
    print('#' * 5, 'PER TYPE EVALUATION', '#' * 5)
    for qtype, data in data_types.items():
        num_items = len(data)
        print(f'Type: {qtype} ({num_items})')

        metrics = allen_eval(model, data, iterator, ARGS.CUDA_DEVICE, "")
        print()

        accuracy = metrics['accuracy']
        results[qtype] = (num_items, accuracy)
Example 9
class Seq2SeqPredictor:

    def __init__(self, model: Model,
                 data_reader: SummDataReader,
                 batch_size: int,
                 cuda_device: int):
        self.cuda_device = cuda_device
        self.iterator = BucketIterator(batch_size=batch_size,
                                       sorting_keys=[("source_tokens", "num_tokens")])
        self.model = model
        self.data_reader = data_reader

    def _extract_data(self, batch) -> numpy.ndarray:
        out_dict = self.model(**batch)
        return out_dict

    def predict(self, file_path: str, vocab_path: str):
        ds = self.data_reader.read(file_path)
        vocab = Vocabulary.from_files(vocab_path)
        self.iterator.index_with(vocab)
        self.model.eval()
        pred_generator = self.iterator(ds, num_epochs=1, shuffle=False)
        pred_generator_tqdm = tqdm(pred_generator,
                                   total=self.iterator.get_num_batches(ds))
        preds = []
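        # Run each batch through the model without tracking gradients and collect the raw output dicts.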
        with torch.no_grad():
            for batch in pred_generator_tqdm:
                batch = util.move_to_device(batch, self.cuda_device)
                preds.append(self._extract_data(batch))
        return preds
Example 10
def running_whole_model():
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    iterator = BucketIterator(batch_size=BATCH_SIZE, sorting_keys=[("sentence", "num_tokens"),
                                                                   ("structures1", "num_tokens"),
                                                                   ("structures2", "num_tokens"),
                                                                   ("structures3", "num_tokens")])
    iterator.index_with(vocab)


    model = All_generating(embed_size=EMBEDDING_DIM,
                           word_embeddings=word_embeddings,
                           vocab=vocab,
                           num_of_candidates=7,
                           )

    # optimizer = adabound.AdaBound(model.parameters(), lr=lr, final_lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr=lr)


    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=whole_train_dataset,
                      validation_dataset=whole_validation_dataset,
                      patience=5,
                      num_epochs=30)
    trainer.train()
Example 11
 def test_model_forward(self):
     iterator = BucketIterator(sorting_keys=[("question", "num_tokens")],
                               padding_noise=0.0,
                               batch_size=5)
     iterator.index_with(vocab=self.vocab)
     batch = next(iterator(self.sample_instances, shuffle=False))
     self.check_model_computes_gradients_correctly(self.model, batch)
Example 12
def main():
    reader = LinzenDatasetReader(append_null=False)
    train_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.train")
    validation_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.val")
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    model = StackRNNAgreementPredictor(vocab,
                                       rnn_dim=100,
                                       rnn_cell_type=torch.nn.GRUCell)
    # model = SimpleRNNAgreementPredictor(
    #     vocab, rnn_dim=18, rnn_type=torch.nn.GRU)

    optimizer = torch.optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=16,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=5)
    trainer.train()

    with open("/tmp/model.th", "wb") as fh:
        torch.save(model.state_dict(), fh)
    vocab.save_to_files("/tmp/vocabulary")
Example 13
def pre_processing(train_data):
    vocab = Vocabulary()
    iterator = BucketIterator(batch_size=config.batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    batch = next(iter(iterator(train_data)))

    bert_embedder = PretrainedBertEmbedder(
        pretrained_model="bert-base-uncased", top_layer_only=True)
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder(
        {"tokens": bert_embedder}, allow_unmatched_keys=True)
    bert_dim = word_embeddings.get_output_dim()

    class BertSentencePooler(Seq2VecEncoder):
        def forward(self,
                    embs: torch.Tensor,
                    mask: torch.Tensor = None) -> torch.Tensor:
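            # The embedding of the first token ([CLS] in BERT) is used as the whole-sentence representation.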
            return embs[:, 0]

        @overrides
        def get_output_dim(self) -> int:
            return bert_dim

    encoder = BertSentencePooler(vocab)
    model = BaselineModel(word_embeddings, encoder, vocab)

    return model, batch, vocab, iterator
Example 14
def prepare2(model, vocab, train_dataset, validation_dataset, cuda_device, reader):
    """
    Second part of preparing data for training
    :param model: biLSTM model object
    :param vocab: biLSTM vocabulary
    :param train_dataset: data for training
    :param validation_dataset: data for validation
    :param cuda_device: cuda device on which to run the biLSTM model
    :param reader: biLSTM reader object
    :return: trainer biLSTM object, biLSTM model object, biLSTM reader object and biLSTM vocabulary
    """
    optimizer = optim.SGD(model.parameters(), lr=0.3)
    iterator = BucketIterator(batch_size=1, sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      #patience=1,
                      patience=10,
                      #num_epochs=2,
                      num_epochs=1000,
                      cuda_device=cuda_device)

    return trainer, model, reader, vocab
Example 15
    def test_on_test_set(self):
        # model_weight_file = os.path.join(os.path.dirname(__file__),  '..', "output", "201905290138", "weights_best.th")
        # vocab_dir_path = os.path.join(os.path.dirname(__file__),  '..', "output", "201905290138", "vocabulary")
        model_weight_file = "C:\\Data\\rumourDNN_models\\output\\bostonbombings-201906241245\\weights_best.th"
        vocab_dir_path = "C:\\Data\\rumourDNN_models\\output\\bostonbombings-201906241245\\vocabulary"

        model, rumor_dnn_predictor = load_classifier_from_archive(
            vocab_dir_path=vocab_dir_path, model_weight_file=model_weight_file)

        evaluation_data_path = os.path.join(os.path.dirname(__file__), '..',
                                            "data", "test", "charliehebdo.csv")

        elmo_token_indexer = ELMoTokenCharactersIndexer()
        rumor_train_set_reader = RumorTweetsDataReader(
            token_indexers={'elmo': elmo_token_indexer})
        test_instances = rumor_train_set_reader.read(evaluation_data_path)

        from training_util import evaluate
        data_iterator = BucketIterator(batch_size=200,
                                       sorting_keys=[("sentence", "num_tokens")])
        data_iterator.index_with(model.vocab)
        metrics = evaluate(model, test_instances, data_iterator, -1, "")

        timestamped_print("Finished evaluating.")
        timestamped_print("Metrics:")
        for key, metric in metrics.items():
            timestamped_print("%s: %s" % (key, metric))
Example 16
 def test_guess_sorting_key_picks_the_longest_key(self):
     iterator = BucketIterator(batch_size=2, padding_noise=0)
     iterator.index_with(self.vocab)
     instances = []
     short_tokens = [Token(t) for t in ["what", "is", "this", "?"]]
     long_tokens = [Token(t) for t in ["this", "is", "a", "not", "very", "long", "passage"]]
     instances.append(
         Instance(
             {
                 "question": TextField(short_tokens, self.token_indexers),
                 "passage": TextField(long_tokens, self.token_indexers),
             }
         )
     )
     instances.append(
         Instance(
             {
                 "question": TextField(short_tokens, self.token_indexers),
                 "passage": TextField(long_tokens, self.token_indexers),
             }
         )
     )
     instances.append(
         Instance(
             {
                 "question": TextField(short_tokens, self.token_indexers),
                 "passage": TextField(long_tokens, self.token_indexers),
             }
         )
     )
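     # With no sorting_keys supplied, the iterator should fall back to the field with the longest sequences ("passage").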
     assert iterator._sorting_keys is None
     iterator._guess_sorting_keys(instances)
     assert iterator._sorting_keys == [("passage", "tokens___tokens")]
Example 17
    def test(self, data_path, log_path=None, batch=1, log_violation=False):
        # prepare GPU
        if torch.cuda.is_available() and not DEBUG_TRAINING:
            device = 0
            self.model.cuda()
        else:
            device = -1

        reader = self.reader
        dataset = reader.read(data_path, metas={'dataset_type':'test'})

        # prepare iterator
        sentence_sensors = self.get_sensors(SentenceEmbedderLearner)
        sorting_keys = [(sensor.fullname, 'num_tokens') for name, sensor in sentence_sensors]
        iterator = BucketIterator(batch_size=batch,
                                  sorting_keys=sorting_keys,
                                  track_epoch=True)
        iterator.index_with(self.model.vocab)

        # prepare model
        training_state = self.model.training
        self.model.eval()

        self.solver_log_to(log_path)
        metrics = evaluate(model=self.model,
                           instances=dataset,
                           data_iterator=iterator,
                           cuda_device=device,
                           batch_weight_key=None,
                           log_violation=log_violation)

        # restore model
        self.model.train(training_state)

        return metrics
Example 18
def main():
    reader = LanguageModelingReader()
    train_dataset = reader.read('data/mt/sentences.eng.10k.txt')

    # for inst in train_dataset:
    #     print(inst)

    vocab = Vocabulary.from_instances(train_dataset, min_count={'tokens': 5})

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("input_tokens", "num_tokens")])

    iterator.index_with(vocab)

    model = RNNLanguageModel(vocab, cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      patience=10,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)

    trainer.train()

    print(model.generate())
    print(model.generate())
    print(model.generate())
    print(model.generate())
    print(model.generate())
Example 19
def main():
    reader = MarvinLinzenLMDatasetReader(append_null=False)
    train_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.train")
    validation_dataset = reader.read(
        "StackNN/data/linzen/rnn_agr_simple/numpred.val")
    vocab = Vocabulary.from_files("saved_models/vocabulary_brown")

    model = StackRNNLanguageModel(vocab,
                                  rnn_dim=100,
                                  rnn_cell_type=torch.nn.GRUCell)
    model.load_state_dict(torch.load("saved_models/stack-brown.th"))

    optimizer = torch.optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=16,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=5)
    trainer.train()

    with open("/tmp/model.th", "wb") as fh:
        torch.save(model.state_dict(), fh)
    vocab.save_to_files("/tmp/vocabulary")
Example 20
def evaluate(model, vocab, test_dataset):
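    # Evaluate the whole test set in a single batch and return the model's accumulated metrics.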
    iterator = BucketIterator(batch_size=len(test_dataset),
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    val_generator = iterator(test_dataset)
    model(**next(val_generator))
    return model.get_metrics(reset=True)
Example 21
def running_NER():
    reader = PosDatasetReader()
    train_dataset = reader.read('../data/700_multi_data/600_ner_train.txt')
    validation_dataset = reader.read('../data/700_multi_data/66_ner_test.txt')

    vocab = Vocabulary.from_files("../model_store/vocabulary")

    # '''vocab part'''
    # train_1 = reader.read('../data/train/train.json')
    # train_2 = reader.read('../data/train/dev.json')

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=1000)
    trainer.train()
Example 22
def trainModel(train_dataset, validation_dataset, vocab):
    EMBEDDING_DIM = 6
    HIDDEN_DIM = 6
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, bidirectional=False, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    # optimizer = optim.AdamW(model.parameters(), lr=1e-4, eps=1e-8)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=100,
                      cuda_device=cuda_device)
    trainer.train()
    return model
Example 23
def load_SQUAD1_dataset(cf_a, vocab):
    """
    Loads the dataset and creates iterators and so on
    """
    ## Create the Data Reader with the Tokenization and indexing
    if (cf_a.datareader_lazy):
        # If we do lazy loading, training will be slower, but we don't have enough RAM, so...
        # We can also specify:
        instances_per_epoch_train = cf_a.instances_per_epoch_train
        instances_per_epoch_validation = cf_a.instances_per_epoch_validation
        max_instances_in_memory = cf_a.max_instances_in_memory
    else:
        instances_per_epoch_train = None
        instances_per_epoch_validation = None
        max_instances_in_memory = None

    ## Instantiate the datareader
    squad_reader = Squad1Reader(
        lazy=cf_a.datareader_lazy,
        tokenizer_indexer_type=cf_a.tokenizer_indexer_type)

    ## Load the datasets
    train_dataset = squad_reader.read(file_path=cf_a.train_squad1_file)
    validation_dataset = squad_reader.read(
        file_path=cf_a.validation_squad1_file)
    """
    ########################## ITERATORS  ############################
    Iterator that will get the samples for the problem
    """

    if (cf_a.datareader_lazy == False):
        instances_per_epoch_train = len(train_dataset)
        instances_per_epoch_validation = len(validation_dataset)

    train_iterator = BucketIterator(
        batch_size=cf_a.batch_size_train,
        instances_per_epoch=instances_per_epoch_train,
        max_instances_in_memory=max_instances_in_memory,
        sorting_keys=[["passage", "num_tokens"], ["question", "num_tokens"]])
    train_iterator.index_with(vocab)

    validation_iterator = BucketIterator(
        batch_size=cf_a.batch_size_validation,
        instances_per_epoch=instances_per_epoch_validation,
        max_instances_in_memory=max_instances_in_memory,
        sorting_keys=[["passage", "num_tokens"], ["question", "num_tokens"]])

    validation_iterator.index_with(vocab)

    num_batches = int(
        np.ceil(instances_per_epoch_train / cf_a.batch_size_train))
    num_batches_validation = int(
        np.ceil(instances_per_epoch_validation / cf_a.batch_size_validation))

    # Create the iterator over the data:
    train_iterable = train_iterator(train_dataset)
    validation_iterable = validation_iterator(validation_dataset)

    return squad_reader, num_batches, train_iterable, num_batches_validation, validation_iterable
Example 24
def evaluate_all_tasks(task, evaluate_tasks, dev_data, vocabulary, model, args,
                       save_weight, temps):
    devicea = -1
    if torch.cuda.is_available():
        devicea = 0
    majority = {
        'subjectivity': 0.5,
        'sst': 0.2534059946,
        'trec': 0.188,
        'cola': 0,
        'ag': 0.25,
        'sst_2c': 0.51
    }

    sota = {
        'subjectivity': 0.955,
        'sst': 0.547,
        'trec': 0.9807,
        'cola': 0.341,
        'ag': 0.955,
        'sst_2c': 0.968
    }

    overall_metric = {}
    standard_metric = {}
    for j in evaluate_tasks:
        model.set_task(j, tmp=temps[j])
        print("\nEvaluating ", j)
        sys.stdout.flush()
        iterator1 = BucketIterator(batch_size=args.bs,
                                   sorting_keys=[("tokens", "num_tokens")])
        iterator1.index_with(vocabulary[j])
        metric = evaluate(model=model,
                          instances=dev_data[j],
                          data_iterator=iterator1,
                          cuda_device=devicea,
                          batch_weight_key=None)

        # Take first 500 instances for evaluating activations.
        if not args.no_save_weight:
            iterator1 = BucketIterator(batch_size=500,
                                       sorting_keys=[("tokens", "num_tokens")])
            iterator1.index_with(vocabulary[j])
            evaluate(model=model,
                     instances=dev_data[j][:500],
                     data_iterator=iterator1,
                     cuda_device=devicea,
                     batch_weight_key=None)
            save_weight.add_activations(model, task, j)

        if j == 'cola':
            metric['metric'] = metric['average']
        else:
            metric['metric'] = metric['accuracy']
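        # Normalise the raw metric to a 0-1 scale between the majority-class baseline and the reported SOTA.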
        smetric = (float(metric['metric']) - majority[j]) / (sota[j] -
                                                             majority[j])
        overall_metric[j] = metric
        standard_metric[j] = smetric
    return overall_metric, standard_metric
Example 25
def train(
    model: Model,
    binary_class: str,
    train_data: DatasetType,
    valid_reader: DatasetReader,
    vocab: Vocabulary,
    optimizer_type: str,
    optimizer_learning_rate: float,
    optimizer_weight_decay: float,
    batch_size: int,
    patience: int,
    num_epochs: int,
    device: str,
) -> Tuple[Model, MetricsType]:
    train_reader = BIODatasetReader(
        ActiveBIODataset(train_data, dataset_id=0, binary_class=binary_class),
        token_indexers={
            'tokens': ELMoTokenCharactersIndexer(),
        },
    )

    train_dataset = train_reader.read('tmp.txt')
    valid_dataset = valid_reader.read('tmp.txt')

    if device == 'cuda':
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = optim.SGD(
        model.parameters(),
        lr=optimizer_learning_rate,
        weight_decay=optimizer_weight_decay,
    )

    iterator = BucketIterator(
        batch_size=batch_size,
        sorting_keys=[("sentence", "num_tokens")],
    )

    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        patience=patience,
        num_epochs=num_epochs,
        cuda_device=cuda_device,
        validation_metric='f1-measure-overall',
    )
    metrics = trainer.train()

    return model, metrics
Example 26
 def test_create_batches_groups_correctly(self):
     iterator = BucketIterator(batch_size=2, padding_noise=0, sorting_keys=[('text', 'num_tokens')])
     iterator.index_with(self.vocab)
     batches = list(iterator._create_batches(self.instances, shuffle=False))
     grouped_instances = [batch.instances for batch in batches]
     assert grouped_instances == [[self.instances[4], self.instances[2]],
                                  [self.instances[0], self.instances[1]],
                                  [self.instances[3]]]
Example 27
def train(model_args):
    model_name = model_args.serialization_dir
    checkpoint_dir = model_args.store_folder
    learning_rate = model_args.learning_rate
    rl_basic = model_args.rl_basic
    pretrain_folder = ''

    if checkpoint_dir == 'pretrain':
        is_pretrain = True
    else:
        # check if rl_basic is specified
        pretrain_folder = os.path.join('pretrain', rl_basic)
        if not os.path.exists(pretrain_folder):
            raise FileNotFoundError(f'Can not find the pretrained model {pretrain_folder}!')
        is_pretrain = False

    reader = construct_reader(is_pretrain=is_pretrain)

    train_dataset = reader.read("data_processed\\train.jsonl")
    test_dataset = reader.read("data_processed\\test.jsonl")

    # build vocabulary
    vocab = Vocabulary.from_instances(train_dataset + test_dataset)

    # build model and move it into cuda
    model = construct_model(vocab, model_args)
    model.cuda()

    # allocate
    optimizer = optim.Adam(model.parameters(), weight_decay=1e-5, lr=learning_rate)
    scheduler = construct_learning_scheduler(optimizer)

    iterator = BucketIterator(batch_size=2, sorting_keys=[("prev_tokens", "num_tokens")])
    iterator.index_with(vocab)

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # not recover from previous state, we should load the pretrain model as default.
    if not is_pretrain and not os.path.exists(os.path.join(checkpoint_dir, model_name, "best.th")):
        model_state = torch.load(os.path.join(pretrain_folder, "best.th"))
        model.load_state_dict(model_state)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=test_dataset,
                      learning_rate_scheduler=scheduler,
                      patience=model_args.patience,
                      validation_metric="+{}".format(model_args.validation_metric),
                      num_epochs=model_args.epoch,
                      serialization_dir=os.path.join(checkpoint_dir, model_name),
                      cuda_device=0,
                      should_log_learning_rate=True)

    trainer.train()
    return model_name
Example 28
def get_accuracy(model,
                 dev_dataset,
                 vocab,
                 trigger_token_ids=None,
                 snli=False,
                 reset_metric=True):
    """
    When trigger_token_ids is None, gets accuracy on the dev_dataset. Otherwise, gets accuracy with
    triggers prepended for the whole dev_dataset.
    """
    if reset_metric:
        model.get_metrics(reset=True)
    model.eval()  # model should be in eval() already, but just in case
    if snli:
        iterator = BucketIterator(batch_size=128,
                                  sorting_keys=[("premise", "num_tokens")])
    else:
        iterator = BucketIterator(batch_size=128,
                                  sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    logits = []
    labels = []
    print_string = []
    if trigger_token_ids:
        for idx in trigger_token_ids:
            print_string += [vocab.get_token_from_index(idx)]

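    # Iterate over the dev set batch by batch; lazy_groups_of(..., group_size=1) wraps each batch in a one-element list.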
    for batch in lazy_groups_of(iterator(dev_dataset,
                                         num_epochs=1,
                                         shuffle=False),
                                group_size=1):
        output = evaluate_batch(model, batch, trigger_token_ids, snli)
        logits.append(output['logits'].detach().cpu().numpy())
        labels.append(output['labels'].detach().cpu().numpy())

    logits = np.concatenate(logits, 0)
    labels = np.concatenate(labels, 0)
    preds_int = np.argmax(logits, 1)
    success_idx = np.where(labels != preds_int)[0]
    acc = accuracy_score(labels, preds_int)
    if len(np.unique(labels)) > 1:
        f1_weighted = f1_score(labels, preds_int, average="weighted")
        try:
            f1 = f1_score(labels, preds_int)
        except ValueError:
            f1 = f1_weighted
    else:
        f1 = 'N/A'
        f1_weighted = 'N/A'

    try:
        auc = roc_auc_score(labels, preds_int)
        auc = "{:.4f}".format(auc)
    except ValueError:
        auc = "N/A"

    return acc, auc, f1, f1_weighted, success_idx
Example 29
def main():
    elmo_token_indexer = ELMoTokenCharactersIndexer()

    reader = StanfordSentimentTreeBankDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})

    train_dataset = reader.read(
        'data/stanfordSentimentTreebank/trees/train.txt')
    dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens':3}` here means that any tokens that appear less than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})

    # Use the 'Small' pre-trained model
    options_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    )
    weight_file = (
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
        '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    )

    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=20)

    trainer.train()
Example 30
def main():
    reader = UniversalDependenciesDatasetReader()
    train_dataset = reader.read(
        'data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-train.conllu')
    dev_dataset = reader.read(
        'data/ud-treebanks-v2.3/UD_English-EWT/en_ewt-ud-dev.conllu')

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_SIZE)

    lstm = torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True)

    inner_model = LstmTaggerInnerModel(encoder=lstm,
                                       embedding=token_embedding,
                                       encoder_output_size=HIDDEN_SIZE,
                                       label_size=vocab.get_vocab_size('pos'))
    model = LstmTagger(inner_model, vocab)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=16,
                              sorting_keys=[("words", "num_tokens")],
                              padding_noise=0.)

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=10)
    trainer.train()

    # Run predictor for a sample sentence
    predictor = UniversalPOSPredictor(model, reader)
    logits = predictor.predict(['Time', 'flies', 'like', 'an', 'arrow',
                                '.'])['tag_logits']
    tag_ids = np.argmax(logits, axis=-1)

    print([vocab.get_token_from_index(tag_id, 'pos') for tag_id in tag_ids])

    # Export the inner_model as the ONNX format
    out_dir = 'examples/pos'
    dummy_input = torch.zeros(1, MAX_LEN, dtype=torch.long)
    dummy_mask = torch.ones(1, MAX_LEN, dtype=torch.long)
    inner_model.exporting = True
    torch.onnx.export(model=inner_model,
                      args=(dummy_input, dummy_mask),
                      f=f'{out_dir}/model.onnx',
                      verbose=True)

    vocab.save_to_files(f'{out_dir}/vocab')
Example 31
 def test_create_batches_groups_correctly(self):
     iterator = BucketIterator(batch_size=2,
                               padding_noise=0,
                               sorting_keys=[('text', 'num_tokens')])
     iterator.index_with(self.vocab)
     batches = list(iterator._create_batches(self.instances, shuffle=False))
     grouped_instances = [batch.instances for batch in batches]
     assert grouped_instances == [[self.instances[4], self.instances[2]],
                                  [self.instances[0], self.instances[1]],
                                  [self.instances[3]]]
Example 32
def get_training_values(model, vocab, train_dataset, validation_dataset,
                        tr_data_loss, val_data_loss, KL_loss, final_loss_tr, final_loss_val, batch_size=100):
    model.eval()
    model.set_posterior_mean(True)

    data_loss_validation = 0
    data_loss_train = 0
    loss_validation = 0
    loss_train = 0
    
    # Create own iterators for this:
    iterator = BucketIterator(batch_size=batch_size, sorting_keys=[("text_field", "num_tokens")])
    iterator.index_with(vocab)
    
    iterator_validation = BucketIterator(batch_size = batch_size, sorting_keys=[("text_field", "num_tokens")])
    iterator_validation.index_with(vocab)
    
    num_batches = int(np.floor(len(train_dataset)/batch_size))
    num_batches_validation = int(np.floor(len(validation_dataset)/batch_size))
    # Create the iterator over the data:
    batches_iterable = iterator(train_dataset)
    batches_iterable_validation = iterator_validation(validation_dataset)

    # Compute the validation accuracy by using all the Validation dataset but in batches.
    for j in range(num_batches_validation):
        batch = next(batches_iterable_validation)
        tensor_dict = batch # Already converted
        data_loss_validation += model.get_data_loss(tensor_dict["text_field"],tensor_dict["tags_field"])
        loss_validation += model.get_loss(tensor_dict["text_field"],tensor_dict["tags_field"])
 
    data_loss_validation = data_loss_validation/num_batches_validation
    loss_validation = loss_validation/num_batches_validation
    
    ## Same for training
    for j in range(num_batches):
        batch = next(batches_iterable)
        tensor_dict = batch # Already converted
        data_loss_train += model.get_data_loss(tensor_dict["text_field"],tensor_dict["tags_field"])
        loss_train += model.get_loss(tensor_dict["text_field"],tensor_dict["tags_field"])
    
    data_loss_train = data_loss_train/num_batches
    loss_train = loss_train/num_batches
    
    tr_data_loss.append(data_loss_train)
    val_data_loss.append(data_loss_validation)
    KL_loss.append(-model.get_KL_loss())
    final_loss_tr.append(loss_train)
    final_loss_val.append(loss_validation)

    model.train()
    model.set_posterior_mean(False)
Example 33
def load_SQUAD1_dataset(cf_a,vocab):
    """
    Loads the dataset and creates iterators and so on
    """
    ## Create the Data Reader with the Tokenization and indexing
    if (cf_a.datareader_lazy):
        # If we do lazy loading, training will be slower, but we don't have enough RAM, so...
        # We can also specify:
        instances_per_epoch_train = cf_a.instances_per_epoch_train
        instances_per_epoch_validation = cf_a.instances_per_epoch_validation
        max_instances_in_memory = cf_a.max_instances_in_memory 
    else:
        instances_per_epoch_train = None
        instances_per_epoch_validation = None
        max_instances_in_memory = None
    
    ## Instantiate the datareader
    squad_reader = Squad1Reader(lazy = cf_a.datareader_lazy, 
                                tokenizer_indexer_type = cf_a.tokenizer_indexer_type)
    
    ## Load the datasets
    train_dataset = squad_reader.read(file_path = cf_a.train_squad1_file)
    validation_dataset =  squad_reader.read(file_path = cf_a.validation_squad1_file)
    """
    ########################## ITERATORS  ############################
    Iterator that will get the samples for the problem
    """

    if(cf_a.datareader_lazy == False):
        instances_per_epoch_train = len(train_dataset)
        instances_per_epoch_validation = len(validation_dataset)
    
    train_iterator = BucketIterator(batch_size= cf_a.batch_size_train, instances_per_epoch = instances_per_epoch_train,
                              max_instances_in_memory = max_instances_in_memory,
                              sorting_keys=[["passage", "num_tokens"], ["question", "num_tokens"]])
    train_iterator.index_with(vocab)
    
    validation_iterator = BucketIterator(batch_size= cf_a.batch_size_validation, instances_per_epoch = instances_per_epoch_validation,
                              max_instances_in_memory = max_instances_in_memory,
                              sorting_keys=[["passage", "num_tokens"], ["question", "num_tokens"]])
    
    validation_iterator.index_with(vocab)
    
    num_batches = int(np.ceil(instances_per_epoch_train/cf_a.batch_size_train))
    num_batches_validation = int(np.ceil(instances_per_epoch_validation/cf_a.batch_size_validation))
    
    # Create the iterator over the data:
    train_iterable = train_iterator(train_dataset)
    validation_iterable = validation_iterator(validation_dataset)
    
    return squad_reader, num_batches, train_iterable, num_batches_validation, validation_iterable
Example 34
 def test_create_batches_groups_correctly_with_max_instances(self):
     # If we knew all the instances, the correct order is 4 -> 2 -> 0 -> 1 -> 3.
     # Here max_instances_in_memory is 3, so we load instances [0, 1, 2]
     # and then bucket them by size into batches of size 2 to get [2, 0] -> [1].
     # Then we load the remaining instances and bucket them by size to get [4, 3].
     iterator = BucketIterator(batch_size=2,
                               padding_noise=0,
                               sorting_keys=[('text', 'num_tokens')],
                               max_instances_in_memory=3)
     iterator.index_with(self.vocab)
     for test_instances in (self.instances, self.lazy_instances):
         batches = list(iterator._create_batches(test_instances, shuffle=False))
         grouped_instances = [batch.instances for batch in batches]
         assert grouped_instances == [[self.instances[2], self.instances[0]],
                                      [self.instances[1]],
                                      [self.instances[4], self.instances[3]]]
Example 35
    def test_bucket_iterator_maximum_samples_per_batch(self):
        iterator = BucketIterator(
                batch_size=3,
                padding_noise=0,
                sorting_keys=[('text', 'num_tokens')],
                maximum_samples_per_batch=['num_tokens', 9]
        )
        iterator.index_with(self.vocab)
        batches = list(iterator._create_batches(self.instances, shuffle=False))
        stats = self.get_batches_stats(batches)

        # ensure all instances are in a batch
        assert stats['total_instances'] == len(self.instances)

        # ensure correct batch sizes
        assert stats['batch_lengths'] == [2, 2, 1]

        # ensure correct sample sizes (<= 9)
        assert stats['sample_sizes'] == [6, 8, 9]
Example 36
    def test_bucket_iterator_maximum_samples_per_batch(self):
        iterator = BucketIterator(
                batch_size=3, padding_noise=0,
                sorting_keys=[('text', 'num_tokens')],
                maximum_samples_per_batch=['num_tokens', 9]
        )
        iterator.index_with(self.vocab)
        batches = list(iterator._create_batches(self.instances, shuffle=False))

        # ensure all instances are in a batch
        grouped_instances = [batch.instances for batch in batches]
        num_instances = sum(len(group) for group in grouped_instances)
        assert num_instances == len(self.instances)

        # ensure all batches are sufficiently small
        for batch in batches:
            batch_sequence_length = max(
                    [instance.get_padding_lengths()['text']['num_tokens']
                     for instance in batch.instances]
            )
            assert batch_sequence_length * len(batch.instances) <= 9
Example 37
    def test_maximum_samples_per_batch_packs_tightly(self):
        token_counts = [10, 4, 3]
        test_instances = self.create_instances_from_token_counts(token_counts)

        iterator = BucketIterator(
                batch_size=3,
                padding_noise=0,
                sorting_keys=[('text', 'num_tokens')],
                maximum_samples_per_batch=['num_tokens', 11]
        )
        iterator.index_with(self.vocab)
        batches = list(iterator._create_batches(test_instances, shuffle=False))
        stats = self.get_batches_stats(batches)

        # ensure all instances are in a batch
        assert stats['total_instances'] == len(test_instances)

        # ensure correct batch sizes
        assert stats['batch_lengths'] == [2, 1]

        # ensure correct sample sizes (<= 11)
        assert stats['sample_sizes'] == [8, 10]
Example 38
"""
############### Instantiate the model and optimizer ##################
"""

model = Ncut.NameCountryModel(cf_a, vocab)
optimizer = optim.SGD(model.parameters(), lr=0.01)
cf_a.optimizer = optimizer

model.to(device = device, dtype = dtype)
"""
############ Iterator that will get the samples for the problem #############
"""
batch_size=10
batch_size_validation = 100

iterator = BucketIterator(batch_size=batch_size, sorting_keys=[("text_field", "num_tokens")])
iterator.index_with(vocab)

iterator_validation = BucketIterator(batch_size = batch_size_validation, sorting_keys=[("text_field", "num_tokens")])
iterator_validation.index_with(vocab)

num_batches = int(np.floor(len(train_dataset)/batch_size))
num_batches_validation = int(np.floor(len(validation_dataset)/batch_size_validation))
# Create the iterator over the data:
batches_iterable = iterator(train_dataset)
batches_iterable_validation = iterator_validation(validation_dataset)

"""
##############################################################################
######################### TRAINING #######################################
Probably we should not use this one, because we want more features for the Bayesian elements.
This trainer should also save the model?
Example 39
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
#### We next need to specify the sequence encoder. The need for PytorchSeq2SeqWrapper here is slightly unfortunate (and if you use configuration files, https://github.com/allenai/allennlp/blob/master/tutorials/tagger/README.md#using-config-files, you won't need to worry about it), but here it's required to add some extra functionality (and a cleaner interface) to the built-in PyTorch module. In AllenNLP we do everything batch first, so we specify that as well.
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

#### Finally, we can instantiate the model.
model = LstmTagger(word_embeddings, lstm, vocab)

#### Now we're ready to train the model. The first thing we'll need is an optimizer. We can just use PyTorch's stochastic gradient descent.
optimizer = optim.SGD(model.parameters(), lr=0.1)

#### And we need a DataIterator that handles batching for our datasets. The BucketIterator sorts instances by the specified fields in order to create batches with similar sequence lengths. Here we indicate that we want to sort the instances by the number of tokens in the sentence field.
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])
#### We also specify that the iterator should make sure its instances are indexed using our vocabulary; that is, that their strings have been converted to integers using the mapping we previously created.
iterator.index_with(vocab)

#### Now we instantiate our Trainer and run it. Here we tell it to run for 1000 epochs and to stop training early if it ever spends 10 epochs without the validation metric improving. The default validation metric is loss (which improves by getting smaller), but it's also possible to specify a different metric and direction (e.g. accuracy should get bigger).
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)

#### When we launch it it will print a progress bar for each epoch that includes both the "loss" and the "accuracy" metric. If our model is good, the loss should go down and the accuracy up as we train.
trainer.train()

#### As in the original PyTorch tutorial, we'd like to look at the predictions our model generates. AllenNLP contains a Predictor abstraction that takes inputs, converts them to instances, feeds them through your model, and returns JSON-serializable results. Often you'd need to implement your own Predictor, but AllenNLP already has a SentenceTaggerPredictor that works perfectly here, so we can use it. It requires our model (for making predictions) and a dataset reader (for creating instances).
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
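
#### For reference, a minimal sketch (mirroring Example 2 above) of how this predictor could be used; the 'labels' vocabulary namespace is an assumption carried over from that example.
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])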