    def test_create_batches_groups_correctly(self):
        iterator = BucketIterator(batch_size=2, padding_noise=0, sorting_keys=[('text', 'num_tokens')])
        batches = list(iterator._create_batches(self.instances, shuffle=False))
        grouped_instances = [batch.instances for batch in batches]
        assert grouped_instances == [[self.instances[4], self.instances[2]],
                                     [self.instances[0], self.instances[1]],
                                     [self.instances[3]]]

    def test_from_params(self):
        # pylint: disable=protected-access
        params = Params({})

        with pytest.raises(ConfigurationError):
            iterator = BucketIterator.from_params(params)

        sorting_keys = [("s1", "nt"), ("s2", "nt2")]
        params['sorting_keys'] = sorting_keys
        iterator = BucketIterator.from_params(params)

        assert iterator._sorting_keys == sorting_keys
        assert iterator._padding_noise == 0.1
        assert not iterator._biggest_batch_first
        assert iterator._batch_size == 32

        params = Params({
                "sorting_keys": sorting_keys,
                "padding_noise": 0.5,
                "biggest_batch_first": True,
                "batch_size": 100
        })

        iterator = BucketIterator.from_params(params)
        assert iterator._sorting_keys == sorting_keys
        assert iterator._padding_noise == 0.5
        assert iterator._biggest_batch_first
        assert iterator._batch_size == 100
    def test_biggest_batch_first_works(self):
        iterator = BucketIterator(batch_size=2,
                                  padding_noise=0,
                                  sorting_keys=[('text', 'num_tokens')],
                                  biggest_batch_first=True)
        iterator.index_with(self.vocab)
        batches = list(iterator._create_batches(self.instances, shuffle=False))
        grouped_instances = [batch.instances for batch in batches]
        assert grouped_instances == [[self.instances[3]],
                                     [self.instances[0], self.instances[1]],
                                     [self.instances[4], self.instances[2]]]

    def test_create_batches_groups_correctly_with_max_instances(self):
        # If we knew all the instances, the correct order is 4 -> 2 -> 0 -> 1 -> 3.
        # Here max_instances_in_memory is 3, so we load instances [0, 1, 2]
        # and then bucket them by size into batches of size 2 to get [2, 0] -> [1].
        # Then we load the remaining instances and bucket them by size to get [4, 3].
        iterator = BucketIterator(batch_size=2,
                                  padding_noise=0,
                                  sorting_keys=[('text', 'num_tokens')],
                                  max_instances_in_memory=3)
        for test_instances in (self.instances, self.lazy_instances):
            batches = list(iterator._create_batches(test_instances, shuffle=False))
            grouped_instances = [batch.instances for batch in batches]
            assert grouped_instances == [[self.instances[2], self.instances[0]],
                                         [self.instances[1]],
                                         [self.instances[4], self.instances[3]]]

    def test_bucket_iterator_maximum_samples_per_batch(self):
        iterator = BucketIterator(
                batch_size=3,
                padding_noise=0,
                sorting_keys=[('text', 'num_tokens')],
                maximum_samples_per_batch=['num_tokens', 9]
        )
        iterator.index_with(self.vocab)
        batches = list(iterator._create_batches(self.instances, shuffle=False))
        stats = self.get_batches_stats(batches)

        # ensure all instances are in a batch
        assert stats['total_instances'] == len(self.instances)

        # ensure correct batch sizes
        assert stats['batch_lengths'] == [2, 2, 1]

        # ensure correct sample sizes (<= 9)
        assert stats['sample_sizes'] == [6, 8, 9]
    def test_bucket_iterator_maximum_samples_per_batch(self):
        iterator = BucketIterator(
                batch_size=3, padding_noise=0,
                sorting_keys=[('text', 'num_tokens')],
                maximum_samples_per_batch=['num_tokens', 9]
        )
        batches = list(iterator._create_batches(self.instances, shuffle=False))

        # ensure all instances are in a batch
        grouped_instances = [batch.instances for batch in batches]
        num_instances = sum(len(group) for group in grouped_instances)
        assert num_instances == len(self.instances)

        # ensure all batches are sufficiently small
        for batch in batches:
            batch_sequence_length = max(
                    [instance.get_padding_lengths()['text']['num_tokens']
                     for instance in batch.instances]
            )
            assert batch_sequence_length * len(batch.instances) <= 9
    def test_maximum_samples_per_batch_packs_tightly(self):
        token_counts = [10, 4, 3]
        test_instances = self.create_instances_from_token_counts(token_counts)

        iterator = BucketIterator(
                batch_size=3,
                padding_noise=0,
                sorting_keys=[('text', 'num_tokens')],
                maximum_samples_per_batch=['num_tokens', 11]
        )
        iterator.index_with(self.vocab)
        batches = list(iterator._create_batches(test_instances, shuffle=False))
        stats = self.get_batches_stats(batches)

        # ensure all instances are in a batch
        assert stats['total_instances'] == len(test_instances)

        # ensure correct batch sizes
        assert stats['batch_lengths'] == [2, 1]

        # ensure correct sample sizes (<= 11)
        assert stats['sample_sizes'] == [8, 10]
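The expected `sample_sizes` above follow from the packing rule that (longest padded length in the batch) × (number of instances in the batch) must stay within the limit. Below is a small standalone sketch of that arithmetic, not the library's actual implementation; the helper name `expected_sample_sizes` is made up here, and the separate `batch_size` cap is ignored because it is not binding in this example.

def expected_sample_sizes(token_counts, limit):
    """Greedy packing sketch: sort by length, then keep adding instances to a
    batch while (longest length in batch) * (batch size) stays within the limit."""
    sizes, batch = [], []
    for n in sorted(token_counts):
        if batch and max(batch + [n]) * (len(batch) + 1) > limit:
            sizes.append(max(batch) * len(batch))
            batch = []
        batch.append(n)
    if batch:
        sizes.append(max(batch) * len(batch))
    return sizes

assert expected_sample_sizes([10, 4, 3], limit=11) == [8, 10]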
Example #8
def load_SQUAD1_dataset(cf_a,vocab):
    """
    Loads the dataset and creates iterators and so on
    """
    ## Create the Data Reader with the Tokenization and indexing
    if cf_a.datareader_lazy:
        # With lazy loading, training is slower, but the full dataset does not need to fit in RAM.
        # We can also specify:
        instances_per_epoch_train = cf_a.instances_per_epoch_train
        instances_per_epoch_validation = cf_a.instances_per_epoch_validation
        max_instances_in_memory = cf_a.max_instances_in_memory 
    else:
        instances_per_epoch_train = None
        instances_per_epoch_validation = None
        max_instances_in_memory = None
    
    ## Instantiate the datareader
    squad_reader = Squad1Reader(lazy = cf_a.datareader_lazy, 
                                tokenizer_indexer_type = cf_a.tokenizer_indexer_type)
    
    ## Load the datasets
    train_dataset = squad_reader.read(file_path = cf_a.train_squad1_file)
    validation_dataset = squad_reader.read(file_path = cf_a.validation_squad1_file)
    """
    ########################## ITERATORS  ############################
    Iterator that will get the samples for the problem
    """

    if not cf_a.datareader_lazy:
        instances_per_epoch_train = len(train_dataset)
        instances_per_epoch_validation = len(validation_dataset)
    
    train_iterator = BucketIterator(batch_size= cf_a.batch_size_train, instances_per_epoch = instances_per_epoch_train,
                              max_instances_in_memory = max_instances_in_memory,
                              sorting_keys=[["passage", "num_tokens"], ["question", "num_tokens"]])
    train_iterator.index_with(vocab)
    
    validation_iterator = BucketIterator(batch_size= cf_a.batch_size_validation, instances_per_epoch = instances_per_epoch_validation,
                              max_instances_in_memory = max_instances_in_memory,
                              sorting_keys=[["passage", "num_tokens"], ["question", "num_tokens"]])
    
    validation_iterator.index_with(vocab)
    
    num_batches = int(np.ceil(instances_per_epoch_train/cf_a.batch_size_train))
    num_batches_validation = int(np.ceil(instances_per_epoch_validation/cf_a.batch_size_validation))
    
    # Create the iterator over the data:
    train_iterable = train_iterator(train_dataset)
    validation_iterable = validation_iterator(validation_dataset)
    
    return squad_reader, num_batches, train_iterable, num_batches_validation, validation_iterable
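A minimal usage sketch for the values returned above, assuming a configured `cf_a` object and an already-built `vocab`; the iterables are AllenNLP batch generators, so each `next()` call yields a dictionary of tensors keyed by the reader's field names (e.g. "passage", "question").

squad_reader, num_batches, train_iterable, num_batches_validation, validation_iterable = \
    load_SQUAD1_dataset(cf_a, vocab)

for _ in range(num_batches):
    tensor_dict = next(train_iterable)  # e.g. tensor_dict["passage"], tensor_dict["question"]
    # forward/backward pass on the batch would go here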
def get_training_values (model, vocab, train_dataset, validation_dataset,
                         tr_data_loss, val_data_loss, KL_loss,final_loss_tr, final_loss_val, batch_size=100):
    model.eval()
    model.set_posterior_mean(True)

    data_loss_validation = 0
    data_loss_train = 0
    loss_validation = 0
    loss_train = 0
    
    # Create own iterators for this:
    iterator = BucketIterator(batch_size=batch_size, sorting_keys=[("text_field", "num_tokens")])
    iterator.index_with(vocab)
    
    iterator_validation = BucketIterator(batch_size = batch_size, sorting_keys=[("text_field", "num_tokens")])
    iterator_validation.index_with(vocab)
    
    num_batches = int(np.floor(len(train_dataset)/batch_size))
    num_batches_validation = int(np.floor(len(validation_dataset)/batch_size))
    # Create the iterators over the data:
    batches_iterable = iterator(train_dataset)
    batches_iterable_validation = iterator_validation(validation_dataset)

    # Compute the validation accuracy by using all the Validation dataset but in batches.
    for j in range(num_batches_validation):
        batch = next(batches_iterable_validation)
        tensor_dict = batch # Already converted
        data_loss_validation += model.get_data_loss(tensor_dict["text_field"],tensor_dict["tags_field"])
        loss_validation += model.get_loss(tensor_dict["text_field"],tensor_dict["tags_field"])
 
    data_loss_validation = data_loss_validation/num_batches_validation
    loss_validation = loss_validation/num_batches_validation
    
    ## Same for training
    for j in range(num_batches):
        batch = next(batches_iterable)
        tensor_dict = batch # Already converted
        data_loss_train += model.get_data_loss(tensor_dict["text_field"],tensor_dict["tags_field"])
        loss_train += model.get_loss(tensor_dict["text_field"],tensor_dict["tags_field"])
    
    data_loss_train = data_loss_train/num_batches
    loss_train = loss_train/num_batches
    
    tr_data_loss.append(data_loss_train)
    val_data_loss.append(data_loss_validation)
    KL_loss.append(-model.get_KL_loss())
    final_loss_tr.append(loss_train)
    final_loss_val.append(loss_validation)

    model.train()
    model.set_posterior_mean(False)
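A short, hedged usage sketch for `get_training_values`; it assumes `model`, `vocab`, `train_dataset` and `validation_dataset` already exist, and shows that the list arguments are running histories the function appends to.

tr_data_loss, val_data_loss, KL_loss = [], [], []
final_loss_tr, final_loss_val = [], []

# Call once per evaluation point; each call appends the current averaged losses.
get_training_values(model, vocab, train_dataset, validation_dataset,
                    tr_data_loss, val_data_loss, KL_loss,
                    final_loss_tr, final_loss_val, batch_size=100)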
                    similarity_function=simfunc,
                    projection_feedforward=projection_feedforward,
                    output_feedforward=output_feedforward,
                    output_logit=output_logit)

    if torch.cuda.is_available():
        cuda_device = 0

        model = model.cuda(cuda_device)
    else:

        cuda_device = -1

    optimizer = optim.SGD(model.parameters(), lr=0.1)

    iterator = BucketIterator(batch_size=2, sorting_keys=[("premise", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                    optimizer=optimizer,
                    iterator=iterator,
                    train_dataset=train_dataset,
                    validation_dataset=validation_dataset,
                    patience=10,
                    num_epochs=1,
                    cuda_device=cuda_device)

    trainer.train()

    # predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
    attention = DotProductAttention()

    max_decoding_steps = 100   # TODO: make this variable
    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True).cuda()
    
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=7,
                      num_epochs=25,
                      cuda_device=cuda_id)
def main():

    parser = argparse.ArgumentParser(description='Input, output and other configurations')

    # Old eval on general RC
    # parser.add_argument('--csv_path', type=str,
    #                     default="/Users/xinq/Desktop/lit-review/de-contextualize/output/reproducibility_sentence_output_annotated_Xin_021319.csv")
    parser.add_argument('--csv_path', type=str,
                        default="output/reproducibility_sentence_output_to_annotate_021919_randomized-Xin.csv")


    # parser.add_argument('--output', type=str, default="../output/reproducibility_sentence_output_to_annotate_new.csv")
    # parser.add_argument('--no_extract_candidates', dest='extract_candidates', action='store_false', default=True)
    parser.add_argument('--csv_test_path', type=str,
                        default="output/reproducibility_sentence.csv")
    parser.add_argument('--csv_out_path', type=str,
                        default="output/reproducibility_sentence_scored.csv")
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--glove', dest='glove', action='store_true', default=False)
    parser.add_argument('--small_test', dest='small_test', action='store_true', default=False)
    parser.add_argument('--model_path',type=str,default="model/model.th")
    parser.add_argument('--vocab_path',type=str,default="model/vocab.th")
    parser.add_argument('--embedding_path',type=str,default="model/embedding.th")
    parser.add_argument('--no_test', dest='no_test', action='store_true',default=False)
    # parser.add_argument('--split', type=int, default=0)

    args = parser.parse_args()

    reader = ReproducibilityClaimDatasetReader()
    train_dataset = reader.read(args.csv_path)
    reader.switch_to_test()
    ## Note: we implemented train/dev split (over the single annotation files that we have)
    ## Note (cont.) such that unlabelled are automatically considered as dev_dataset.
    dev_dataset = reader.read(args.csv_path)  # Using the same path here
    if args.small_test or args.no_test:
        test_dataset = dev_dataset
    else:
        test_dataset = reader.read(args.csv_test_path)  # The test set contains all sentence from 100 CHI 2018 papers

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3})
    # input(vocab._non_padded_namespaces) ## Still confused!!
    # print(vocab.get_index_to_token_vocabulary("tokens")) ##  Output is like {0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: 'the', 3: 'to', 4: 'of', 5: 'and', 6: 'a', 7: 'in', 8: 'that', 9: 'for', 10: 'with'
    # print(vocab.__dict__)
    print("Namespaces of vocab are", vocab._token_to_index.keys())

    # input("Get label_idx from label "+str(vocab.get_token_index("2","labels"))+str(type(vocab.get_token_index("2","labels"))))
    # input("Get label_idx from label "+str(vocab.get_token_index("1","labels")))
    # input("Get label_idx from label "+str(vocab.get_token_index("0","labels")))
    # input()

    print(vocab.get_vocab_size("tokens"), "vocab.get_vocab_size('tokens')")
    print(vocab.__dict__['_token_to_index'].__dict__['_non_padded_namespaces'])
    print(vocab.__dict__['_token_to_index'].__dict__['_padded_function'])
    print(vocab.__dict__['_padding_token'])
    print(vocab.__dict__['_oov_token'])
    # input()

    EMBEDDING_DIM = args.embedding_dim if not args.glove else 100
    HIDDEN_DIM = args.hidden_dim

    # TODO: switch to Glove for now!? (worked on 022119)

    # If you go back to where we defined our DatasetReader, the default parameters included a single index called "tokens", \
    # so our mapping just needs an embedding corresponding to that index.
    # token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
    #                             embedding_dim=EMBEDDING_DIM)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)

    if args.glove:
        params = Params({"pretrained_file": "output/glove.6B." + str(EMBEDDING_DIM) + "d" + ".txt",
                         "embedding_dim": EMBEDDING_DIM})
        token_embedding = Embedding.from_params(vocab=vocab, params=params)


        #                             pretrained_file="/Users/xinq/Downloads/glove/glove.6B." + str(
        #                                 EMBEDDING_DIM) + "d" + ".txt")

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    # torch.save(word_embeddings,open("../model/toy","wb"))
    # word_embeddings=torch.load(open("../model/toy","rb"))

    lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)) # batch_size * seqlen * embedding/hidden
    model = LSTMClassifier(word_embeddings, lstm, vocab)

    # TODO: implement self-attention based on paper: (efficiency is also important!)

    # TODO: Option A: biattention (biattentive classifier)
    #  # Compute biattention. This is a special case since the inputs are the same.
    #         attention_logits = encoded_tokens.bmm(encoded_tokens.permute(0, 2, 1).contiguous()) # https://pytorch.org/docs/stable/torch.html#torch.bmm
    #         attention_weights = util.masked_softmax(attention_logits, text_mask)
    #        TODO: confirm where is text_mask -> text_mask = util.get_text_field_mask(tokens).float()
    #         encoded_text = util.weighted_sum(encoded_tokens, attention_weights) # function https://github.com/allenai/allennlp/blob/6d8da97312bfbde05a41558668ff63d92a9928e9/allennlp/nn/util.py#L530

    # TODO: Option B: Bilinear attention
    # Bilinear matrix attention  (is this right???)  ``X W Y^T + b``. W=weight
    #         intermediate = torch.matmul(matrix_1.unsqueeze(1), weight)
    #         final = torch.matmul(intermediate, matrix_2.unsqueeze(1).transpose(2, 3))
    #         return self._activation(final.squeeze(1) + self._bias)
    #

    # TODO (cont.) a structured self-attentive sentence embedding https://arxiv.org/pdf/1703.03130.pdf

    # optimizer = optim.SGD(model.parameters(), lr=0.1)
    # optimizer=optim.Adam(model.parameters,lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr=1e-3,
                           weight_decay=1e-5)  # current setting that coverges on train: optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])  # sort by num_tokens
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      # validation_dataset=None, #
                      validation_dataset=train_dataset,
                      patience=10,
                      num_epochs=15)  # 10  # seems that w/ Glove 20 will be better...
    trainer.train()

    predictor = SentenceTaggerPredictor(model,
                                        dataset_reader=reader)  # SentenceTagger shares the same logic as sentence classification predictor

    '''
    allennlp/allennlp/commands/predict.py
    
    The ``predict`` subcommand allows you to make bulk JSON-to-JSON
    or dataset to JSON predictions using a trained model and its
    :class:`~allennlp.service.predictors.predictor.Predictor` wrapper.
    '''

    if not args.no_test:
        sents = []
        delimiter = "pdf_"

        # for line in open(args.csv_test_path)
        for instance in tqdm(test_dataset):  # Loop over every single instance on test_dataset
            # print(instance.fields['tokens']['tokens'].__dict__)
            # print((instance.fields['tokens'][0].__dict__)) # NOTE: stop here
            # input()
            prediction = predictor.predict_instance(instance)
            # logits = prediction['logits']
            # print(logits)
            softmax = prediction['softmax']
            # print(softmax)
            # input()
            # label_id = np.argmax(logits)
            pos_label_idx = vocab.get_token_index("2",
                                                  "labels")  # getting the corresponding dimension integer idx for label "2"
            pos_score = softmax[pos_label_idx]
            # print("metadata for this instance",instance.fields['metadata']['sent_id'],type(instance.fields['metadata']['sent_id']))
            # print(str(instance.fields['tokens']))
            # print(instance.fields['tokens'].get_text())
            # input()

            # input(type(instance.fields['tokens']))
            # input(instance.fields['tokens'])

            # sents.append({"paperID": instance.fields['metadata']['sent_id'].split(delimiter)[0], "sent_pos": int(
            #     instance.fields['metadata']['sent_id'].split(delimiter)[1]), "text": instance.fields['tokens'].get_text(),
            #               "pos_score": float(pos_score)})

            sents.append({"paperID": instance.fields['metadata']['sent_id'].split(delimiter)[0], "sent_pos": int(
                instance.fields['metadata']['sent_id'].split(delimiter)[1]), "text": instance.fields['metadata']['text'],
                "pos_score": float(pos_score)})
            

        # write output into a .csv file. Takes about 2 mins
        df = pd.DataFrame(sents)

        # TODO: change the sort_values criteria when we generate the eval plot
        # df = df.sort_values(by=['paperID', 'pos_score'], ascending=False)
        df = df.sort_values(by=['pos_score'], ascending=False)
        df.to_csv(args.csv_out_path)

    # print("label_id=np.argmax(logits)", pos_label_idx, model.vocab.get_token_from_index(label_id, 'labels'))

    # print(instance.__dict__)
    # print(type(instance))

    # logits = predictor.predict("We allow participants to speak out loud.")['logits']
    # label_id=np.argmax(logits)
    # print("label_id=np.argmax(logits)",label_id, model.vocab.get_token_from_index(label_id, 'labels'))

    # tag_ids = np.argmax(tag_logits, axis=-1)
    # print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

    # Here's how to save the model.
    with open(args.model_path, 'wb') as f:
        torch.save(model.state_dict(), f)

    vocab.save_to_files(args.vocab_path)
    torch.save(word_embeddings,open(args.embedding_path,"wb"))
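For completeness, a hedged sketch of loading the saved artifacts back for inference; it assumes the same `EMBEDDING_DIM`, `HIDDEN_DIM` and `LSTMClassifier` setup as above, and the `*_loaded` names are introduced only for this illustration (note that `Vocabulary.save_to_files` / `from_files` operate on a directory path).

vocab_loaded = Vocabulary.from_files(args.vocab_path)
word_embeddings_loaded = torch.load(open(args.embedding_path, "rb"))
lstm_loaded = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model_loaded = LSTMClassifier(word_embeddings_loaded, lstm_loaded, vocab_loaded)
with open(args.model_path, 'rb') as f:
    model_loaded.load_state_dict(torch.load(f))
model_loaded.eval()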
Example #13
def train():
    reader = PWKPReader()
    train_dataset = reader.read(train_path)
    valid_dataset = reader.read(dev_path)
    if os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(instances=train_dataset,
                                          max_vocab_size=opt.vocab_size)
        vocab.save_to_files(vocab_dir)
    iterator = BucketIterator(batch_size=opt.batch_size,
                              sorting_keys=[("src", "num_tokens"),
                                            ("tgt", "num_tokens")])
    iterator.index_with(vocab)

    model = Seq2Seq(emb_size=opt.emb_size,
                    hidden_size=opt.hidden_size,
                    enc_layers=opt.enc_layers,
                    dec_layers=opt.dec_layers,
                    dropout=opt.dropout,
                    bidirectional=opt.bidirectional,
                    beam_size=opt.beam_size,
                    label_smoothing=opt.label_smoothing,
                    vocab=vocab)

    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    #learning_rate_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=opt.lr_decay)

    val_iterator = BasicIterator(batch_size=opt.batch_size)
    val_iterator.index_with(vocab)

    predictor = Predictor(iterator=val_iterator,
                          max_decoding_step=opt.max_step,
                          vocab=vocab,
                          reader=reader,
                          data_path=test_path,
                          log_dir=save_dir,
                          map_path=ner_path,
                          cuda_device=opt.gpu)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        #learning_rate_scheduler=learning_rate_scheduler,
        learning_rate_decay=opt.lr_decay,
        ema_decay=opt.ema_decay,
        predictor=predictor,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=valid_dataset,
        validation_metric='+bleu',
        cuda_device=opt.gpu,
        num_epochs=opt.epoch,
        serialization_dir=save_dir,
        num_serialized_models_to_keep=5,
        #model_save_interval=60,
        #summary_interval=500,
        should_log_parameter_statistics=False,
        grad_norm=10)

    trainer.train()
Example #14
        UccaSpanParserDatasetReader(word_tokenizer, word_indexer).read(folder)
        for folder in [train_dataset_folder, validation_dataset_folder])

    if os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(
            itertools.chain(train_ds, validation_ds))
        vocab.save_to_files(vocab_dir)

    vocab_namespaces = vocab._index_to_token.keys()
    max_vocab_size = max(
        [vocab.get_vocab_size(namespace) for namespace in vocab_namespaces])
    iterator = BucketIterator(
        batch_size=batch_size,
        # This is for testing. To see how big of batch size the GPU can handle.
        biggest_batch_first=True,
        sorting_keys=[("tokens", "num_tokens")],
    )
    iterator.index_with(vocab)

    linguistic_features_embedding = Embedding(
        num_embeddings=max_vocab_size + 2,
        embedding_dim=linguistic_features_embedding_dim,
        # padding_index=0 I do not understand what it does
    )
    bert_embedder = PretrainedBertEmbedder(
        pretrained_model=bert_mode,
        top_layer_only=False,
        requires_grad=bert_finetuning,
    )
    word_embedder = BasicTextFieldEmbedder(
Example #15
def run_model(args):
    st_ds_conf = get_updated_settings(args)
    reader = data_adapter.GeoQueryDatasetReader()
    training_set = reader.read(config.DATASETS[args.dataset].train_path)
    try:
        validation_set = reader.read(config.DATASETS[args.dataset].dev_path)
    except Exception:
        validation_set = None

    vocab = allennlp.data.Vocabulary.from_instances(training_set)
    model = get_model(vocab, st_ds_conf)
    device_tag = "cpu" if config.DEVICE < 0 else f"cuda:{config.DEVICE}"
    if args.models:
        model.load_state_dict(
            torch.load(args.models[0], map_location=device_tag))

    if not args.test or not args.models:
        iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")],
                                  batch_size=st_ds_conf['batch_sz'])
        iterator.index_with(vocab)

        optim = torch.optim.Adam(model.parameters(),
                                 lr=config.ADAM_LR,
                                 betas=config.ADAM_BETAS,
                                 eps=config.ADAM_EPS)

        savepath = os.path.join(
            config.SNAPSHOT_PATH, args.dataset, 'base_s2s',
            datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + "--" +
            args.memo)
        if not os.path.exists(savepath):
            os.makedirs(savepath, mode=0o755)

        trainer = allennlp.training.Trainer(
            model=model,
            optimizer=optim,
            iterator=iterator,
            train_dataset=training_set,
            validation_dataset=validation_set,
            serialization_dir=savepath,
            cuda_device=config.DEVICE,
            num_epochs=config.TRAINING_LIMIT,
            grad_clipping=config.GRAD_CLIPPING,
        )

        trainer.train()

    else:
        testing_set = reader.read(config.DATASETS[args.dataset].test_path)
        model.eval()

        if config.DEVICE > -1:
            model = model.cuda(config.DEVICE)

        predictor = allennlp.predictors.SimpleSeq2SeqPredictor(model, reader)

        for instance in tqdm.tqdm(testing_set, total=len(testing_set)):
            print('SRC: ', instance.fields['source_tokens'].tokens)
            print(
                'GOLD:', ' '.join(
                    str(x)
                    for x in instance.fields['target_tokens'].tokens[1:-1]))
            del instance.fields['target_tokens']
            output = predictor.predict_instance(instance)
            print('PRED:', ' '.join(output['predicted_tokens']))
Example #16
def run(trainp="overnight/calendar_train_delex.tsv",
        testp="overnight/calendar_test_delex.tsv",
        batsize=8,
        embdim=50,
        encdim=50,
        maxtime=100,
        lr=.001,
        gpu=0,
        cuda=False,
        epochs=20):
    device = torch.device("cuda", gpu) if cuda else torch.device("cpu")
    tt = q.ticktock("script")
    tt.tick("loading data")

    def tokenizer(x: str, splitter: WordSplitter = None) -> List[str]:
        return [xe.text for xe in splitter.split_words(x)]

    reader = OvernightReader(
        partial(tokenizer, splitter=JustSpacesWordSplitter()),
        partial(tokenizer, splitter=JustSpacesWordSplitter()),
        SingleIdTokenIndexer(namespace="nl_tokens"),
        SingleIdTokenIndexer(namespace="fl_tokens"))
    trainds = reader.read(trainp)
    testds = reader.read(testp)
    tt.tock("data loaded")

    tt.tick("building vocabulary")
    vocab = Vocabulary.from_instances(trainds)
    tt.tock("vocabulary built")

    tt.tick("making iterator")
    iterator = BucketIterator(sorting_keys=[("nl", "num_tokens"),
                                            ("fl", "num_tokens")],
                              batch_size=batsize,
                              biggest_batch_first=True)
    iterator.index_with(vocab)
    batch = next(iter(iterator(trainds)))
    #print(batch["id"])
    #print(batch["nl"])
    tt.tock("made iterator")

    # region model
    nl_emb = Embedding(vocab.get_vocab_size(namespace="nl_tokens"),
                       embdim,
                       padding_index=0)
    fl_emb = Embedding(vocab.get_vocab_size(namespace="fl_tokens"),
                       embdim,
                       padding_index=0)
    nl_field_emb = BasicTextFieldEmbedder({"tokens": nl_emb})
    fl_field_emb = BasicTextFieldEmbedder({"tokens": fl_emb})

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(embdim, encdim, bidirectional=True, batch_first=True))
    attention = DotProductAttention()

    smodel = Seq2Seq(vocab,
                     nl_field_emb,
                     encoder,
                     maxtime,
                     target_embedding_dim=embdim,
                     attention=attention,
                     target_namespace='fl_tokens',
                     beam_size=1,
                     use_bleu=True)

    smodel_out = smodel(batch["nl"], batch["fl"])

    smodel.to(device)

    optim = torch.optim.Adam(smodel.parameters(), lr=lr)
    trainer = Trainer(model=smodel,
                      optimizer=optim,
                      iterator=iterator,
                      train_dataset=trainds,
                      validation_dataset=testds,
                      num_epochs=epochs,
                      cuda_device=gpu if cuda else -1)

    metrics = trainer.train()

    sys.exit()

    class MModel(Model):
        def __init__(self, nlemb: Embedding, flemb: Embedding,
                     vocab: Vocabulary, **kwargs):
            super(MModel, self).__init__(vocab, **kwargs)
            self.nlemb, self.flemb = nlemb, flemb

        @overrides
        def forward(self, nl: Dict[str, torch.Tensor],
                    fl: Dict[str, torch.Tensor], id: Any):
            nlemb = self.nlemb(nl["tokens"])
            flemb = self.flemb(fl["tokens"])
            print(nlemb.size())
            pass

    m = MModel(nl_emb, fl_emb, vocab)
    batch = next(iter(iterator(trainds)))
    out = m(**batch)
Example #17
def main(param2val):

    # params
    params = Params.from_param2val(param2val)
    print(params, flush=True)

    #  paths
    project_path = Path(param2val['project_path'])
    save_path = Path(param2val['save_path'])
    srl_eval_path = project_path / 'perl' / 'srl-eval.pl'
    data_path_mlm = project_path / 'data' / 'training' / f'{params.corpus_name}_mlm.txt'
    data_path_train_srl = project_path / 'data' / 'training' / f'{params.corpus_name}_no-dev_srl.txt'
    data_path_devel_srl = project_path / 'data' / 'training' / f'human-based-2018_srl.txt'
    data_path_test_srl = project_path / 'data' / 'training' / f'human-based-2008_srl.txt'
    childes_vocab_path = project_path / 'data' / f'{params.corpus_name}_vocab.txt'
    google_vocab_path = project_path / 'data' / 'bert-base-cased.txt'  # to get word pieces

    # word-piece tokenizer - defines input vocabulary
    vocab = load_vocab(childes_vocab_path, google_vocab_path,
                       params.vocab_size)
    # TODO testing google vocab with wordpieces

    assert vocab['[PAD]'] == 0  # AllenNLP expects this
    assert vocab['[UNK]'] == 1  # AllenNLP expects this
    assert vocab['[CLS]'] == 2
    assert vocab['[SEP]'] == 3
    assert vocab['[MASK]'] == 4
    wordpiece_tokenizer = WordpieceTokenizer(vocab)
    print(f'Number of types in vocab={len(vocab):,}')

    # load utterances for MLM task
    utterances = load_utterances_from_file(data_path_mlm)
    train_utterances, devel_utterances, test_utterances = split(utterances)

    # load propositions for SLR task
    propositions = load_propositions_from_file(data_path_train_srl)
    train_propositions, devel_propositions, test_propositions = split(
        propositions)
    if data_path_devel_srl.is_file():  # use human-annotated data as devel split
        print(f'Using {data_path_devel_srl.name} as SRL devel split')
        devel_propositions = load_propositions_from_file(data_path_devel_srl)
    if data_path_test_srl.is_file():  # use human-annotated data as test split
        print(f'Using {data_path_test_srl.name} as SRL test split')
        test_propositions = load_propositions_from_file(data_path_test_srl)

    # converters handle conversion from text to instances
    converter_mlm = ConverterMLM(params, wordpiece_tokenizer)
    converter_srl = ConverterSRL(params, wordpiece_tokenizer)

    # get output_vocab
    # note: Allen NLP vocab holds labels, wordpiece_tokenizer.vocab holds input tokens
    # what from_instances() does:
    # 1. it iterates over all instances, and all fields, and all token indexers
    # 2. the token indexer is used to update vocabulary count, skipping words whose text_id is already set
    # 3. a PADDING and MASK symbol are added to 'tokens' namespace resulting in vocab size of 2
    # input tokens are not indexed, as they are already indexed by bert tokenizer vocab.
    # this ensures that the model is built with inputs for all vocab words,
    # such that words that occur only in LM or SRL task can still be input

    # make instances once - this allows iterating multiple times (required when num_epochs > 1)
    train_instances_mlm = converter_mlm.make_instances(train_utterances)
    devel_instances_mlm = converter_mlm.make_instances(devel_utterances)
    test_instances_mlm = converter_mlm.make_instances(test_utterances)
    train_instances_srl = converter_srl.make_instances(train_propositions)
    devel_instances_srl = converter_srl.make_instances(devel_propositions)
    test_instances_srl = converter_srl.make_instances(test_propositions)
    all_instances_mlm = chain(train_instances_mlm, devel_instances_mlm,
                              test_instances_mlm)
    all_instances_srl = chain(train_instances_srl, devel_instances_srl,
                              test_instances_srl)

    # make vocab from all instances
    output_vocab_mlm = Vocabulary.from_instances(all_instances_mlm)
    output_vocab_srl = Vocabulary.from_instances(all_instances_srl)
    # print(f'mlm vocab size={output_vocab_mlm.get_vocab_size()}')  # contain just 2 tokens
    # print(f'srl vocab size={output_vocab_srl.get_vocab_size()}')  # contain just 2 tokens
    assert output_vocab_mlm.get_vocab_size(
        'tokens') == output_vocab_srl.get_vocab_size('tokens')

    # BERT
    print('Preparing Multi-task BERT...')
    input_vocab_size = len(converter_mlm.wordpiece_tokenizer.vocab)
    bert_config = BertConfig(
        vocab_size_or_config_json_file=input_vocab_size,  # was 32K
        hidden_size=params.hidden_size,  # was 768
        num_hidden_layers=params.num_layers,  # was 12
        num_attention_heads=params.num_attention_heads,  # was 12
        intermediate_size=params.intermediate_size)  # was 3072
    bert_model = BertModel(config=bert_config)
    # Multi-tasking BERT
    mt_bert = MTBert(vocab_mlm=output_vocab_mlm,
                     vocab_srl=output_vocab_srl,
                     bert_model=bert_model,
                     embedding_dropout=params.embedding_dropout)
    mt_bert.cuda()
    num_params = sum(p.numel() for p in mt_bert.parameters()
                     if p.requires_grad)
    print('Number of model parameters: {:,}'.format(num_params), flush=True)

    # optimizers
    optimizer_mlm = BertAdam(params=mt_bert.parameters(), lr=params.lr)
    optimizer_srl = BertAdam(params=mt_bert.parameters(), lr=params.lr)
    move_optimizer_to_cuda(optimizer_mlm)
    move_optimizer_to_cuda(optimizer_srl)

    # batching
    bucket_batcher_mlm = BucketIterator(batch_size=params.batch_size,
                                        sorting_keys=[('tokens', "num_tokens")
                                                      ])
    bucket_batcher_mlm.index_with(output_vocab_mlm)
    bucket_batcher_srl = BucketIterator(batch_size=params.batch_size,
                                        sorting_keys=[('tokens', "num_tokens")
                                                      ])
    bucket_batcher_srl.index_with(output_vocab_srl)

    # big batcher to speed evaluation - 1024 is too big
    bucket_batcher_mlm_large = BucketIterator(batch_size=512,
                                              sorting_keys=[('tokens',
                                                             "num_tokens")])
    bucket_batcher_srl_large = BucketIterator(batch_size=512,
                                              sorting_keys=[('tokens',
                                                             "num_tokens")])
    bucket_batcher_mlm_large.index_with(output_vocab_mlm)
    bucket_batcher_srl_large.index_with(output_vocab_srl)

    # init performance collection
    name2col = {
        'devel_pps': [],
        'devel_f1s': [],
    }

    # init
    eval_steps = []
    train_start = time.time()
    loss_mlm = None
    no_mlm_batches = False
    step = 0

    # generators
    train_generator_mlm = bucket_batcher_mlm(train_instances_mlm,
                                             num_epochs=params.num_mlm_epochs)
    train_generator_srl = bucket_batcher_srl(
        train_instances_srl, num_epochs=None)  # infinite generator
    num_train_mlm_batches = bucket_batcher_mlm.get_num_batches(
        train_instances_mlm)
    if params.srl_interleaved:
        max_step = num_train_mlm_batches
    else:
        max_step = num_train_mlm_batches * 2
    print(f'Will stop training at step={max_step:,}')

    while step < max_step:

        # TRAINING
        if step != 0:  # otherwise evaluation at step 0 is influenced by training on one batch
            mt_bert.train()

            # masked language modeling task
            try:
                batch_mlm = next(train_generator_mlm)
            except StopIteration:
                if params.srl_interleaved:
                    break
                else:
                    no_mlm_batches = True
            else:
                loss_mlm = mt_bert.train_on_batch('mlm', batch_mlm,
                                                  optimizer_mlm)

            # semantic role labeling task
            if params.srl_interleaved:
                if random.random() < params.srl_probability:
                    batch_srl = next(train_generator_srl)
                    mt_bert.train_on_batch('srl', batch_srl, optimizer_srl)
            elif no_mlm_batches:
                batch_srl = next(train_generator_srl)
                mt_bert.train_on_batch('srl', batch_srl, optimizer_srl)

        # EVALUATION
        if step % config.Eval.interval == 0:
            mt_bert.eval()
            eval_steps.append(step)

            # evaluate perplexity
            devel_generator_mlm = bucket_batcher_mlm_large(devel_instances_mlm,
                                                           num_epochs=1)
            devel_pp = evaluate_model_on_pp(mt_bert, devel_generator_mlm)
            name2col['devel_pps'].append(devel_pp)
            print(f'devel-pp={devel_pp}', flush=True)

            # test sentences
            if config.Eval.test_sentences:
                test_generator_mlm = bucket_batcher_mlm_large(
                    test_instances_mlm, num_epochs=1)
                out_path = save_path / f'test_split_mlm_results_{step}.txt'
                predict_masked_sentences(mt_bert, test_generator_mlm, out_path)

            # probing - test sentences for specific syntactic tasks
            for name in config.Eval.probing_names:
                # prepare data
                probing_data_path_mlm = project_path / 'data' / 'probing' / f'{name}.txt'
                if not probing_data_path_mlm.exists():
                    print(f'WARNING: {probing_data_path_mlm} does not exist')
                    continue
                probing_utterances_mlm = load_utterances_from_file(
                    probing_data_path_mlm)
                # check that probing words are in vocab
                for u in probing_utterances_mlm:
                    # print(u)
                    for w in u:
                        if w == '[MASK]':
                            continue  # not in output vocab
                        # print(w)
                        assert output_vocab_mlm.get_token_index(
                            w, namespace='labels'), w
                # probing + save results to text
                probing_instances_mlm = converter_mlm.make_probing_instances(
                    probing_utterances_mlm)
                probing_generator_mlm = bucket_batcher_mlm(
                    probing_instances_mlm, num_epochs=1)
                out_path = save_path / f'probing_{name}_results_{step}.txt'
                predict_masked_sentences(mt_bert,
                                         probing_generator_mlm,
                                         out_path,
                                         print_gold=False,
                                         verbose=True)

            # evaluate devel f1
            devel_generator_srl = bucket_batcher_srl_large(devel_instances_srl,
                                                           num_epochs=1)
            devel_f1 = evaluate_model_on_f1(mt_bert, srl_eval_path,
                                            devel_generator_srl)

            name2col['devel_f1s'].append(devel_f1)
            print(f'devel-f1={devel_f1}', flush=True)

            # console
            min_elapsed = (time.time() - train_start) // 60
            pp = torch.exp(loss_mlm) if loss_mlm is not None else np.nan
            print(
                f'step {step:<6,}: pp={pp :2.4f} total minutes elapsed={min_elapsed:<3}',
                flush=True)

        # only increment step once in each iteration of the loop, otherwise evaluation may never happen
        step += 1

    # evaluate train perplexity
    if config.Eval.train_split:
        generator_mlm = bucket_batcher_mlm_large(train_instances_mlm,
                                                 num_epochs=1)
        train_pp = evaluate_model_on_pp(mt_bert, generator_mlm)
    else:
        train_pp = np.nan
    print(f'train-pp={train_pp}', flush=True)

    # evaluate train f1
    if config.Eval.train_split:
        generator_srl = bucket_batcher_srl_large(train_instances_srl,
                                                 num_epochs=1)
        train_f1 = evaluate_model_on_f1(mt_bert,
                                        srl_eval_path,
                                        generator_srl,
                                        print_tag_metrics=True)
    else:
        train_f1 = np.nan
    print(f'train-f1={train_f1}', flush=True)

    # test sentences
    if config.Eval.test_sentences:
        test_generator_mlm = bucket_batcher_mlm(test_instances_mlm,
                                                num_epochs=1)
        out_path = save_path / f'test_split_mlm_results_{step}.txt'
        predict_masked_sentences(mt_bert, test_generator_mlm, out_path)

    # probing - test sentences for specific syntactic tasks
    for name in config.Eval.probing_names:
        # prepare data
        probing_data_path_mlm = project_path / 'data' / 'probing' / f'{name}.txt'
        if not probing_data_path_mlm.exists():
            print(f'WARNING: {probing_data_path_mlm} does not exist')
            continue
        probing_utterances_mlm = load_utterances_from_file(
            probing_data_path_mlm)
        probing_instances_mlm = converter_mlm.make_probing_instances(
            probing_utterances_mlm)
        # batch and do inference
        probing_generator_mlm = bucket_batcher_mlm(probing_instances_mlm,
                                                   num_epochs=1)
        out_path = save_path / f'probing_{name}_results_{step}.txt'
        predict_masked_sentences(mt_bert,
                                 probing_generator_mlm,
                                 out_path,
                                 print_gold=False,
                                 verbose=True)

    # put train-pp and train-f1 into pandas Series
    s1 = pd.Series([train_pp], index=[eval_steps[-1]])
    s1.name = 'train_pp'
    s2 = pd.Series([train_f1], index=[eval_steps[-1]])
    s2.name = 'train_f1'

    # return performance as pandas Series
    series_list = [s1, s2]
    for name, col in name2col.items():
        print(f'Making pandas series with name={name} and length={len(col)}')
        s = pd.Series(col, index=eval_steps)
        s.name = name
        series_list.append(s)

    return series_list
class SpanBasedModelForAtsa(ModelTrainTemplate.ModelTrainTemplate):
    """
    2019-acl-Open-Domain Targeted Sentiment Analysis via Span-Based Extraction and Classification
    """
    def __init__(self, configuration):
        super().__init__(configuration)
        self.data_reader = None
        self.train_data = None
        self.dev_data = None
        self.test_data = None
        self.hard_test_data = None
        self.distinct_categories = None
        self.distinct_polarities = None
        self._load_data()
        if self.configuration['debug']:
            self.train_data = self.train_data[:128]
            self.dev_data = self.dev_data[:128]
            self.test_data = self.test_data[:128]

        self.vocab = None
        self._build_vocab()

        self.iterator = None
        self.val_iterator = None
        self._build_iterator()

    def _get_data_reader(self):
        token_indexer = SingleIdTokenIndexer(namespace="tokens")
        position_indexer = SingleIdTokenIndexer(namespace='position')
        reader = atsa_data_reader.TextAspectInSentimentOut(
            self.distinct_polarities,
            tokenizer=self._get_word_segmenter(),
            token_indexers={"tokens": token_indexer},
            position_indexers={'position': position_indexer},
            configuration=self.configuration)
        return reader

    def _load_data(self):
        data_filepath = self.base_data_dir + 'data'
        if os.path.exists(data_filepath):
            self.train_data, self.dev_data, self.test_data, self.distinct_polarities, max_aspect_term_num = \
                super()._load_object(data_filepath)
            reader = self._get_data_reader()
            self.data_reader = reader
            self.configuration['max_aspect_term_num'] = max_aspect_term_num
        else:
            train_dev_test_data, distinct_polarities = self.dataset.generate_atsa_data(
            )

            if self.configuration['data_augmentation']:
                augment_data_filepath = self.dataset.conceptnet_augment_data_filepath
                with open(augment_data_filepath, mode='rb') as input_file:
                    augment_data = pickle.load(input_file)

            distinct_polarities_new = []
            for polarity in distinct_polarities:
                if polarity != 'conflict':
                    distinct_polarities_new.append(polarity)
            self.distinct_polarities = distinct_polarities_new

            train_dev_test_data_label_indexed = {}
            max_aspect_term_num = -1
            for data_type, data in train_dev_test_data.items():
                if data is None:
                    continue
                data_new = []
                for sample in data:
                    sample_new = [sample[0]]
                    labels_new = []
                    for label in sample[1]:
                        if label.polarity == 'conflict':
                            continue
                        else:
                            labels_new.append(label)
                    if len(labels_new) != 0:
                        max_aspect_term_num = max(max_aspect_term_num,
                                                  len(labels_new))
                        labels_new.sort(key=lambda x: x.from_index)
                        sample_new.append(labels_new)
                        data_new.append(sample_new)
                train_dev_test_data_label_indexed[data_type] = data_new
            if self.configuration['sample_mode'] == 'single':
                max_aspect_term_num = 1
            self.configuration['max_aspect_term_num'] = max_aspect_term_num
            self.model_meta_data['max_aspect_term_num'] = max_aspect_term_num

            reader = self._get_data_reader()
            self.data_reader = reader
            self.train_data = reader.read(
                train_dev_test_data_label_indexed['train'])
            self.dev_data = reader.read(
                train_dev_test_data_label_indexed['dev'])
            self.test_data = reader.read(
                train_dev_test_data_label_indexed['test'])
            data = [
                self.train_data, self.dev_data, self.test_data,
                self.distinct_polarities, max_aspect_term_num
            ]
            super()._save_object(data_filepath, data)

    def _build_vocab(self):
        if self.configuration['train']:
            vocab_file_path = self.base_data_dir + 'vocab'
            if os.path.exists(vocab_file_path):
                self.vocab = super()._load_object(vocab_file_path)
            else:
                data = self.train_data + self.dev_data + self.test_data
                self.vocab = Vocabulary.from_instances(
                    data, max_vocab_size=sys.maxsize)
                super()._save_object(vocab_file_path, self.vocab)
            self.model_meta_data['vocab'] = self.vocab
        else:
            self.vocab = self.model_meta_data['vocab']

    def _build_iterator(self):
        self.iterator = BucketIterator(
            batch_size=self.configuration['batch_size'],
            sorting_keys=[("tokens", "num_tokens")],
        )
        self.iterator.index_with(self.vocab)
        self.val_iterator = BasicIterator(
            batch_size=self.configuration['batch_size'])
        self.val_iterator.index_with(self.vocab)

    def _print_args(self, model):
        n_trainable_params, n_nontrainable_params = 0, 0
        for p in model.parameters():
            n_params = torch.prod(torch.tensor(p.shape)).item()
            if p.requires_grad:
                n_trainable_params += n_params
            else:
                n_nontrainable_params += n_params
        self.logger.info(
            'n_trainable_params: {0}, n_nontrainable_params: {1}'.format(
                n_trainable_params, n_nontrainable_params))
        self.logger.info('> training arguments:')
        for arg in self.configuration.keys():
            self.logger.info('>>> {0}: {1}'.format(arg,
                                                   self.configuration[arg]))

    def _find_model_function_pure(self):
        return pytorch_models.SpanBasedModel

    def _get_position_embeddings_dim(self):
        return 300

    def _is_train_token_embeddings(self):
        return False

    def _find_model_function(self):
        embedding_dim = self.configuration['embed_size']
        embedding_matrix_filepath = self.base_data_dir + 'embedding_matrix'
        if os.path.exists(embedding_matrix_filepath):
            embedding_matrix = super()._load_object(embedding_matrix_filepath)
        else:
            embedding_filepath = self.configuration['embedding_filepath']
            embedding_matrix = embedding._read_embeddings_from_text_file(
                embedding_filepath,
                embedding_dim,
                self.vocab,
                namespace='tokens')
            super()._save_object(embedding_matrix_filepath, embedding_matrix)
        embedding_matrix = embedding_matrix.to(self.configuration['device'])
        token_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size(namespace='tokens'),
            embedding_dim=embedding_dim,
            padding_index=0,
            vocab_namespace='tokens',
            trainable=self._is_train_token_embeddings(),
            weight=embedding_matrix)
        # the embedder maps the input tokens to the appropriate embedding matrix
        word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        position_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size(namespace='position'),
            embedding_dim=self._get_position_embeddings_dim(),
            padding_index=0)
        position_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"position": position_embedding},
            # we'll be ignoring masks so we'll need to set this to True
            allow_unmatched_keys=True)

        model_function = self._find_model_function_pure()
        model = model_function(
            word_embedder,
            position_embedder,
            self.distinct_polarities,
            self.vocab,
            self.configuration,
        )
        self._print_args(model)
        model = model.to(self.configuration['device'])
        return model

    def _get_optimizer(self, model):
        _params = filter(lambda p: p.requires_grad, model.parameters())
        return optim.Adam(_params, lr=0.001, weight_decay=0.00001)

    def _get_estimator(self, model):
        USE_GPU = torch.cuda.is_available()
        if USE_GPU:
            gpu_id = self.configuration['gpu_id']
        else:
            gpu_id = -1
        estimator = pytorch_models.SpanBasedModelEstimator(
            self.model,
            self.val_iterator,
            self.distinct_polarities,
            cuda_device=gpu_id,
            configuration=self.configuration)
        return estimator

    def _get_estimate_callback(self, model):
        result = []
        data_type_and_data = {
            'train': self.train_data,
            'dev': self.dev_data,
            'test': self.test_data
        }
        estimator = self._get_estimator(model)
        estimate_callback = allennlp_callback.EstimateCallback(
            data_type_and_data, estimator, self.logger)
        result.append(estimate_callback)
        return result

    def _get_loss_weight_callback(self):
        result = []
        set_loss_weight_callback = allennlp_callback.SetLossWeightCallback(
            self.model,
            self.logger,
            acd_warmup_epoch_num=self._get_acd_warmup_epoch_num())
        result.append(set_loss_weight_callback)
        return result

    def _get_fixed_loss_weight_callback(self,
                                        model,
                                        category_loss_weight=1,
                                        sentiment_loss_weight=1):
        result = []
        fixed_loss_weight_callback = allennlp_callback.FixedLossWeightCallback(
            model,
            self.logger,
            category_loss_weight=category_loss_weight,
            sentiment_loss_weight=sentiment_loss_weight)
        result.append(fixed_loss_weight_callback)
        return result

    def _get_bert_word_embedder(self):
        return None

    def _inner_train(self):
        USE_GPU = torch.cuda.is_available()
        if USE_GPU:
            gpu_id = self.configuration['gpu_id']
        else:
            gpu_id = -1

        self.model = self._find_model_function()
        estimator = self._get_estimator(self.model)
        callbacks = self._get_estimate_callback(self.model)
        validation_metric = '+accuracy'
        self.logger.info('validation_metric: %s' % validation_metric)
        optimizer = self._get_optimizer(self.model)
        trainer = Trainer(model=self.model,
                          optimizer=optimizer,
                          iterator=self.iterator,
                          train_dataset=self.train_data,
                          validation_dataset=self.dev_data,
                          cuda_device=gpu_id,
                          num_epochs=self.configuration['epochs'],
                          validation_metric=validation_metric,
                          validation_iterator=self.val_iterator,
                          serialization_dir=self.model_dir,
                          patience=self.configuration['patience'],
                          callbacks=callbacks,
                          num_serialized_models_to_keep=0,
                          early_stopping_by_batch=self.configuration['early_stopping_by_batch'],
                          estimator=estimator,
                          grad_clipping=5)
        metrics = trainer.train()
        self.logger.info('metrics: %s' % str(metrics))

    def _save_model(self):
        torch.save(self.model, self.best_model_filepath)

    def _load_model(self):
        if torch.cuda.is_available():
            self.model = torch.load(self.best_model_filepath)
        else:
            self.model = torch.load(self.best_model_filepath,
                                    map_location=torch.device('cpu'))
        self.model.configuration = self.configuration

    def evaluate(self):
        USE_GPU = torch.cuda.is_available()
        if USE_GPU:
            gpu_id = self.configuration['gpu_id']
        else:
            gpu_id = -1
        estimator = pytorch_models.SpanBasedModelEstimator(
            self.model,
            self.val_iterator,
            self.distinct_polarities,
            configuration=self.configuration,
            cuda_device=gpu_id)

        data_type_and_data = {
            'train': self.train_data,
            'dev': self.dev_data,
            'test': self.test_data
        }
        if self.hard_test_data:
            data_type_and_data['hard_test'] = self.hard_test_data
        for data_type, data in data_type_and_data.items():
            result = estimator.estimate(data)
            self.logger.info('data_type: %s result: %s' % (data_type, result))

    def predict_backup(self):
        USE_GPU = torch.cuda.is_available()
        if USE_GPU:
            gpu_id = self.configuration['gpu_id']
        else:
            gpu_id = -1
        predictor = pytorch_models.SpanBasedModelPredictor(
            self.model,
            self.val_iterator,
            self.distinct_polarities,
            configuration=self.configuration,
            cuda_device=gpu_id)

        data_type_and_data = {
            # 'train': self.train_data,
            # 'dev': self.dev_data,
            'test': self.test_data
        }
        if self.hard_test_data:
            data_type_and_data['hard_test'] = self.hard_test_data
        for data_type, data_temp in data_type_and_data.items():
            # for multi
            data = []
            for instance in data_temp:
                aspect_terms = instance.fields['sample'].metadata['aspect_terms']
                if len(aspect_terms) != 2:
                    continue
                data.append(instance)
                # text = instance.fields['sample'].metadata['text']
                # # i love the keyboard and the screen. ()
                # # The best thing about this laptop is the price along with some of the newer features.
                # if 'that any existing MagSafe' in text:
                #     data.append(instance)
                #     break
            result = predictor.predict(data)
            correct_sentences = []
            for e in result:
                sentiment_outputs_for_aspect_terms = e['sentiment_outputs_for_aspect_terms']
                aspect_terms = e['aspect_terms']
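                # the else branch of this for-loop runs only when no break occurred,
                # i.e. every aspect term in the sentence was predicted correctly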
                for i in range(len(aspect_terms)):
                    if aspect_terms[i].polarity != sentiment_outputs_for_aspect_terms[i][1]:
                        break
                else:
                    correct_sentences.append(e['text'])

            file_utils.write_lines(correct_sentences,
                                   'd:/correct_sentences.txt')

            self.logger.info('data_type: %s result: %s' % (data_type, result))

    def predict(self):
        USE_GPU = torch.cuda.is_available()
        if USE_GPU:
            gpu_id = self.configuration['gpu_id']
        else:
            gpu_id = -1
        predictor = pytorch_models.SpanBasedModelPredictor(
            self.model,
            self.val_iterator,
            self.distinct_polarities,
            configuration=self.configuration,
            cuda_device=gpu_id)

        data_type_and_data = {
            # 'train': self.train_data,
            # 'dev': self.dev_data,
            'test': self.test_data
        }
        if self.hard_test_data:
            data_type_and_data['hard_test'] = self.hard_test_data
        for data_type, data_temp in data_type_and_data.items():
            # for multi
            correct_sentences = file_utils.read_all_lines(
                'd:/correct_sentences.txt')
            for sentence in correct_sentences:
                data = []
                for instance in data_temp:
                    text = instance.fields['sample'].metadata['text']
                    # i love the keyboard and the screen. ()
                    # The best thing about this laptop is the price along with some of the newer features.
                    if sentence in text:
                        data.append(instance)
                result = predictor.predict(data)
                if (result[0]['aspect_terms'][0].polarity == 'neutral'
                        or result[1]['aspect_terms'][0].polarity == 'neutral'):
                    continue
                for e in result:
                    sentiment_outputs_for_aspect_terms = e['sentiment_outputs_for_aspect_terms']
                    aspect_terms = e['aspect_terms']
                    for i in range(len(aspect_terms)):
                        if (aspect_terms[i].polarity != 'neutral' and
                                aspect_terms[i].polarity != sentiment_outputs_for_aspect_terms[i][1]):
                            print()

    def predict_test(self, output_filepath):
        USE_GPU = torch.cuda.is_available()
        if USE_GPU:
            gpu_id = self.configuration['gpu_id']
        else:
            gpu_id = -1
        predictor = pytorch_models.SpanBasedModelPredictor(
            self.model,
            self.val_iterator,
            self.distinct_polarities,
            configuration=self.configuration,
            cuda_device=gpu_id)

        data = self.test_data
        result = predictor.predict(data)
        output_lines = []
        for sample in result:
            text = sample['text']
            words_for_test = text.split(' ')
            aspect_terms = sample['aspect_terms']
            word_indices_of_aspect_terms = []
            for aspect_term in aspect_terms:
                from_index = aspect_term.from_index
                term = aspect_term.term
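                # turn the character-level from_index into a word index by counting
                # the whitespace-separated tokens that precede the aspect term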
                start_index = 0
                if from_index > 0:
                    start_index = len(text[:from_index].strip().split(' '))
                term_length = len(term.split(' '))
                word_indices_of_aspect_terms.append(
                    [start_index, start_index + term_length])
            sentiment_outputs_for_aspect_terms = sample['sentiment_outputs_for_aspect_terms']
            for i in range(len(word_indices_of_aspect_terms)):
                term = aspect_terms[i].term
                word_indices = word_indices_of_aspect_terms[i]
                if term != ' '.join(
                        words_for_test[word_indices[0]:word_indices[1]]):
                    print('error')
                sentiment = sentiment_outputs_for_aspect_terms[i][1]
                output_line = json.dumps({
                    'text': text,
                    'aspect_term': '%s-%d-%d' % (term, word_indices[0], word_indices[1]),
                    'sentiment': sentiment
                })
                output_lines.append(output_line)
        file_utils.write_lines(output_lines, output_filepath)
Example #19
def predict(cuda_device: int,
            char_encoder: str,
            data_dir: Path,
            glove_path: Path,
            temp_dir: Path,
            random_seed: int = 13370,
            numpy_seed: int = 1337,
            torch_seed: int = 133) -> List[Tuple[float, float, str]]:
    '''
    This trains an NER model that uses either a CNN or an LSTM character
    encoder, selected by the `char_encoder` argument. The encoded
    characters are concatenated with 100D GloVe vectors and fed through
    a Bi-Directional LSTM.

    This is based on the following two papers:
    
    1. CNN character encoder version `Ma and Hovy \
       <https://arxiv.org/abs/1603.01354>`_
    2. LSTM character encoder version `Lample et al. \
       <https://arxiv.org/abs/1603.01360>`_

    :param cuda_device: Whether to use GPU or CPU, CPU = -1, GPU = 0
    :param char_encoder: Whether to use an LSTM or CNN. Acceptable values are: 
                         1. lstm, 2. cnn
    :param data_dir: A file path to a directory that contains three files: 
                     1. train.txt, 2. dev.txt, 3. test.txt that are the 
                     train, dev, and test files respectively in CONLL 2003 
                     format where the NER labels are in BIO format.
    :param glove_path: A file path to the `Glove 6 billion word vectors 100D \
                       <https://nlp.stanford.edu/projects/glove/>`_
    :param temp_dir: Directory in which a temporary serialization directory
                     is created for the trainer.
    :param random_seed: Random seed for Python's ``random`` module.
    :param numpy_seed: Random seed for NumPy.
    :param torch_seed: Random seed for PyTorch.
    :returns: The results as a list of tuples which are 
              (dev f1 score, test f1 score, char encoder) where the list 
              represents a different trained model using the same train, dev, 
              and test split but different random seed.
    '''
    # The dataset we are using has already been formatted from IOB1 to BIO.
    # When reading the dataset we state that the coding is the original one, as
    # this does not affect the labels, i.e. the labels and schema are not checked.

    label_encoding = 'BIO'
    constrain_crf_decoding = True
    dropout = 0.5

    char_embedding_dim = 30
    cnn_window_size = (3, )
    cnn_filters = 50
    cnn_output_dim = len(cnn_window_size) * cnn_filters

    lstm_char_dim = 25
    lstm_char_output_dim = lstm_char_dim * 2

    word_embedding_dim = 100
    # LSTM size is that of Ma and Hovy
    lstm_dim = 100

    # Dropout applies dropout after the encoded text and after the word embedding.

    #tensorboard_dir = Path('..', 'tensorboard ner')
    #tensorboard_dir.mkdir(parents=True, exist_ok=True)

    #train_log = SummaryWriter(Path(tensorboard_dir, "log", "train"))
    #validation_log = SummaryWriter(Path(tensorboard_dir, "log", "validation"))

    train_fp = Path(data_dir, 'train.txt')
    dev_fp = Path(data_dir, 'dev.txt')
    test_fp = Path(data_dir, 'test.txt')
    result_fp = Path(data_dir, 'results.json')
    result_data = []
    if result_fp.exists():
        with result_fp.open('r') as json_file:
            result_data = json.load(json_file)

    indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens',
                                       lowercase_tokens=True),
        'chars': TokenCharactersIndexer(namespace='token_characters')
    }

    conll_reader = Conll2003DatasetReader(token_indexers=indexers)
    train_dataset = conll_reader.read(cached_path(train_fp))
    dev_dataset = conll_reader.read(cached_path(dev_fp))
    test_dataset = conll_reader.read(cached_path(test_fp))

    vocab = Vocabulary.from_instances(train_dataset + dev_dataset +
                                      test_dataset)

    char_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size("token_characters"),
        embedding_dim=char_embedding_dim)

    if char_encoder.strip().lower() == 'lstm':
        character_lstm = torch.nn.LSTM(char_embedding_dim,
                                       lstm_char_dim,
                                       batch_first=True,
                                       bidirectional=True)
        character_lstm_wrapper = PytorchSeq2VecWrapper(character_lstm)
        token_character_encoder = TokenCharactersEncoder(
            embedding=char_embedding, encoder=character_lstm_wrapper)
        total_char_embedding_dim = lstm_char_output_dim
    elif char_encoder.strip().lower() == 'cnn':
        character_cnn = CnnEncoder(embedding_dim=char_embedding_dim,
                                   num_filters=cnn_filters,
                                   ngram_filter_sizes=cnn_window_size,
                                   output_dim=cnn_output_dim)
        token_character_encoder = TokenCharactersEncoder(
            embedding=char_embedding, encoder=character_cnn)
        total_char_embedding_dim = cnn_output_dim
    else:
        raise ValueError('The Character encoder can only be `lstm` or `cnn` '
                         f'and not {char_encoder}')

    glove_path = cached_path(glove_path)
    glove_100_weights = _read_pretrained_embeddings_file(
        glove_path, word_embedding_dim, vocab, 'tokens')
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=word_embedding_dim,
                                weight=glove_100_weights)

    word_embeddings = BasicTextFieldEmbedder({
        "tokens": token_embedding,
        "chars": token_character_encoder
    })

    total_embedding_dim = word_embedding_dim + total_char_embedding_dim
    lstm = torch.nn.LSTM(total_embedding_dim,
                         lstm_dim,
                         batch_first=True,
                         bidirectional=True)
    lstm_wrapper = PytorchSeq2SeqWrapper(lstm)

    model = CrfTagger(vocab,
                      word_embeddings,
                      lstm_wrapper,
                      label_encoding=label_encoding,
                      dropout=dropout,
                      constrain_crf_decoding=constrain_crf_decoding)

    optimizer = optim.SGD(model.parameters(), lr=0.015, weight_decay=1e-8)
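    # geometric learning-rate decay: multiply the LR by 0.9524 at each scheduler step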
    schedule = LearningRateWithoutMetricsWrapper(
        torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9524))
    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    temp_dir_fp = str(temp_dir.resolve())
    temp_folder_path = tempfile.mkdtemp(dir=temp_dir_fp)

    set_random_env(cuda_device, random_seed, numpy_seed, torch_seed)
    trainer = Trainer(model=model,
                      grad_clipping=5.0,
                      learning_rate_scheduler=schedule,
                      serialization_dir=temp_folder_path,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      shuffle=True,
                      cuda_device=cuda_device,
                      patience=5,
                      num_epochs=1000)

    #trainer._tensorboard = TensorboardWriter(train_log=train_log,
    #                                        validation_log=validation_log)
    interesting_metrics = trainer.train()
    best_model_weights = Path(temp_folder_path, 'best.th')
    best_model_state = torch.load(best_model_weights)
    model.load_state_dict(best_model_state)
    test_result = evaluate(model, test_dataset, iterator, cuda_device)
    dev_result = evaluate(model, dev_dataset, iterator, cuda_device)
    test_f1 = test_result['f1-measure-overall']
    dev_f1 = dev_result['f1-measure-overall']
    result_data.append((dev_f1, test_f1, char_encoder))

    with result_fp.open('w+') as json_file:
        json.dump(result_data, json_file)
    print(f'{interesting_metrics}')
    return result_data
Example #20
def main():
    parser = argparse.ArgumentParser(description='Evidence oracle QA')
    parser.add_argument('--epochs', type=int, default=5,
                        help='upper epoch limit (default: 5)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience  (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--model_name', type=str, default='sentence_oracle_bert',
                        help='model name (default: sentence_oracle_bert)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    parser.add_argument('--ev_type', type=str, default='sentence',
                        help='how to train the oracle - sentence or full (evidence) (default: sentence)')
    args = parser.parse_args()

    if args.ev_type == 'sentence':
        train = pickle.load(open('data/oracle_train.p', 'rb'))
        valid = pickle.load(open('data/oracle_val.p', 'rb'))
        test = pickle.load(open('data/oracle_test.p', 'rb'))
    elif args.ev_type == 'full':
        train = pickle.load(open('data/oracle_full_train.p', 'rb'))
        valid = pickle.load(open('data/oracle_full_val.p', 'rb'))
        test = pickle.load(open('data/oracle_full_test.p', 'rb'))
    else:
        print('ev_type should be either sentence or full')
        return

    bert_token_indexer = {'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512)}

    pipeline_train = pickle.load(open('data/train_instances.p', 'rb'))
    pipeline_val = pickle.load(open('data/val_instances.p', 'rb'))
    pipeline_test = pickle.load(open('data/test_instances.p', 'rb'))

    pipeline_reader = PipelineDatasetReader(bert_token_indexer)
    p_train = pipeline_reader.read(pipeline_train)
    p_val = pipeline_reader.read(pipeline_val)
    p_test = pipeline_reader.read(pipeline_test)

    p_vocab = Vocabulary.from_instances(p_train + p_val + p_test)

    reader = EIDatasetReader(bert_token_indexer)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    bert_token_embedding = PretrainedBertEmbedder(
        'scibert/weights.tar.gz', requires_grad=args.tunable
    )

    word_embeddings = BasicTextFieldEmbedder(
        {"bert": bert_token_embedding},
        {"bert": ['bert']},
        allow_unmatched_keys=True
    )

    model = Oracle(word_embeddings, p_vocab)

    cuda_device = list(range(torch.cuda.device_count()))

    if torch.cuda.is_available():
        model = model.cuda()
    else:
        cuda_device = -1

    t_total = len(train_data) // args.epochs

    optimizer = BertAdam(model.parameters(), lr=1e-5, warmup=0.05, t_total=t_total)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('comb_prompt_ev', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(p_vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=valid_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    if cuda_device != -1:
        cuda_device = 0
    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
Example #21
def main(args):
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model, args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name, args.max_len, skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={'tokens': 30000,
                                                          'labels': args.target_vocab_size,
                                                          'd_tags': 2},
                                          tokens_to_add=tokens_to_add)
    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name, vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        model.load_state_dict(torch.load(os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    model = model.to(device)

    print("Model is set")

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.1, patience=10)
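    # updates_per_epoch counts optimizer updates; convert it into an instance count
    # using the batch size and the gradient accumulation size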
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("tokens", "num_tokens")],
                              biggest_batch_first=True,
                              max_instances_in_memory=args.batch_size * 20000,
                              instances_per_epoch=instances_per_epoch,
                              )
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None
                      )
    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")
Example #22
# Now names
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=WORD_EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding, "token_characters" : character_embeddings})
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = NamesClassifier(word_embeddings, lstm, vocab)

if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

# Train the model - 30 epochs seem to give a pretty good baseline accuracy - 0.7 val accuracy
optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=2, sorting_keys=[("tokens", "num_tokens"), ("token_characters", "num_token_characters")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_set,
                  validation_dataset=val_set,
                  patience=10,
                  num_epochs=2, 
                  cuda_device=cuda_device)
trainer.train()

# Manually test predictions
from allennlp.predictors import Predictor

class OwnPredictor(Predictor):
Example #23
print('Trainable parameters:',
      sum(p.numel() for p in model.parameters() if p.requires_grad))
print('Network:', model)

#
# train
#
iterBatchSize = 64
_triple_loader = IrTripleDatasetReader(
    lazy=True,
    max_doc_length=180,
    max_query_length=30,
    tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()
                            ))  # already spacy tokenized, so that it is faster

_iterator = BucketIterator(batch_size=iterBatchSize,
                           sorting_keys=[("doc_pos_tokens", "num_tokens"),
                                         ("doc_neg_tokens", "num_tokens")])

_iterator.index_with(vocab)

# Create a folder which will store the model state, and the results: model name + current timestamp without seconds
from datetime import datetime
import os

dt_string = datetime.now().strftime("%d-%m-%Y-%H_%M")
newFolder = str(config["model"]) + "_" + dt_string + '/'
resultFolder = pathPrefix + '/air_results/' + newFolder
os.mkdir(resultFolder)

# %%

# read data
# token_indexer = SingleIdTokenIndexer()
token_indexer = ELMoTokenCharactersIndexer()
reader = JigsawDatasetReader(tokenizer=tokenizer, token_indexers={"tokens": token_indexer})
DATA_ROOT = Path("data") / "jigsaw"
train_ds, test_ds = (reader.read(DATA_ROOT / fname) for fname in ["train.csv", "test_proced.csv"])
val_ds = None

# prepare vocab
# vocab = Vocabulary.from_instances(train_ds, max_vocab_size=config.max_vocab_size)
vocab = Vocabulary()

# prepare iterator
iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)

# test data
# batch = next(iter(iterator(train_ds)))
# print(batch)
# print(batch.keys())
# print(batch["tokens"]["tokens"].shape)


class BaselineModel(Model):
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int=len(label_cols)):
        super(BaselineModel, self).__init__(vocab)
        self.word_embeddings = word_embeddings
Example #25
def main():
    ###############################################################################################
    prepare_global_logging(serialization_dir=args.serialization_dir,
                           file_friendly_logging=False)
    #DATA
    reader = MathDatasetReader(source_tokenizer=CharacterTokenizer(),
                               target_tokenizer=CharacterTokenizer(),
                               source_token_indexers={
                                   'tokens': SingleIdTokenIndexer(namespace='tokens')
                               },
                               target_token_indexers={
                                   'tokens': SingleIdTokenIndexer(namespace='tokens')
                               },
                               target=False,
                               label=True,
                               lazy=True)
    train_data = reader.read("../../datasets/math/label-data/train-all")
    # val_data = reader.read("../../datasets/math/label-data/interpolate")

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace([
        START_SYMBOL, END_SYMBOL, ' ', '!', "'", '(', ')', '*', '+', ',', '-',
        '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<',
        '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
        'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y',
        'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
        '}'
    ],
                                  namespace='tokens')
    vocab.add_tokens_to_namespace([
        'algebra', 'arithmetic', 'calculus', 'comparison', 'measurement',
        'numbers', 'polynomials', 'probability'
    ],
                                  namespace='labels')

    # MODEL
    embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=EMBEDDING_DIM)
    source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

    if args.model == 'lstm':
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(EMBEDDING_DIM,
                          HIDDEN_DIM,
                          num_layers=NUM_LAYERS,
                          batch_first=True))
    elif args.model == 'cnn':
        encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             output_dim=HIDDEN_DIM)
    else:
        raise NotImplementedError("The classifier model should be LSTM or CNN")

    model = TextClassifier(
        vocab=vocab,
        source_text_embedder=source_embedder,
        encoder=encoder,
    )
    model.to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=1e-3,
                           betas=(0.9, 0.995),
                           eps=1e-6)

    train_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                    max_instances_in_memory=1024,
                                    sorting_keys=[("source_tokens",
                                                   "num_tokens")])
    train_iterator = MultiprocessIterator(train_iterator, num_workers=16)
    train_iterator.index_with(vocab)

    val_iterator = BucketIterator(batch_size=BATCH_SIZE,
                                  max_instances_in_memory=1024,
                                  sorting_keys=[("source_tokens", "num_tokens")
                                                ])
    val_iterator = MultiprocessIterator(val_iterator, num_workers=16)
    val_iterator.index_with(vocab)
    #pdb.set_trace()

    LR_SCHEDULER = {"type": "exponential", "gamma": 0.5, "last_epoch": -1}
    lr_scheduler = LearningRateScheduler.from_params(optimizer,
                                                     Params(LR_SCHEDULER))

    # TRAIN
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=None,
                      train_dataset=train_data,
                      validation_dataset=None,
                      patience=None,
                      shuffle=True,
                      num_epochs=1,
                      summary_interval=100,
                      learning_rate_scheduler=lr_scheduler,
                      cuda_device=CUDA_DEVICES,
                      grad_norm=5,
                      grad_clipping=5,
                      model_save_interval=600,
                      serialization_dir=args.serialization_dir,
                      keep_serialized_model_every_num_seconds=3600,
                      should_log_parameter_statistics=True,
                      should_log_learning_rate=True)
    trainer.train()
    ### Check Cuda
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    ### Train Model

    optimizer = optim.SGD(
        model.parameters(), lr=0.01,
        momentum=0.9)  # optim.Adam(same), play with lr and momentum (SGD only)

    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[('text', 'num_tokens')])  # 32 for speed - 64 for precision

    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=100,
                      num_epochs=10000,
                      cuda_device=cuda_device)  # select patience and play with the number of epochs

    results = trainer.train()
def main():
    parser = argparse.ArgumentParser(
        description='Evidence Inference experiments')
    parser.add_argument('--cuda_device',
                        type=int,
                        default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs',
                        type=int,
                        default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience',
                        type=int,
                        default=1,
                        help='trainer patience  (default: 1)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=8,
                        help='batch size (default: 8)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--emb_size',
                        type=int,
                        default=256,
                        help='elmo embeddings size (default: 256)')
    parser.add_argument('--model_name',
                        type=str,
                        default='attention',
                        help='model name (default: attention)')
    parser.add_argument(
        '--tunable',
        action='store_true',
        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    processed_annotations = pickle.load(open('data/data/p_annotations.p',
                                             'rb'))

    prompts = pd.read_csv('data/data/prompts_merged.csv')

    prompts_dictionary = {}
    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [
            row['Outcome'], row['Intervention'], row['Comparator']
        ]

    for article_key in processed_annotations:
        for article_item in processed_annotations[article_key]:
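            # the last element of each annotation is its PromptID; extend the annotation
            # with that prompt's [Outcome, Intervention, Comparator]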
            article_item += prompts_dictionary[article_item[-1]]

    train = []
    valid = []
    test = []

    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))

    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))

    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    elmo_token_indexer = {
        'elmo': ELMoTokenCharactersIndexer(),
        'tokens': SingleIdTokenIndexer()
    }

    reader = EIDatasetReader(elmo_token_indexer, processed_annotations)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    urls = [
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_options.json',
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_weights.hdf5'
    ]

    elmo_token_embedding = ElmoTokenEmbedder(urls[0],
                                             urls[1],
                                             dropout=args.dropout,
                                             requires_grad=args.tunable,
                                             projection_dim=args.emb_size)

    word_embeddings = BasicTextFieldEmbedder({'elmo': elmo_token_embedding},
                                             allow_unmatched_keys=True)

    model = Baseline(word_embeddings, vocab)

    global cuda_device
    cuda_device = args.cuda_device

    if torch.cuda.is_available():
        logger.info('Running on GPU')
        model = model.cuda(cuda_device)
    else:
        logger.info('Running on CPU')
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('article', 'num_fields')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model,
                            test_data,
                            iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
word_embeddings = BasicTextFieldEmbedder({"tokens": bert_embedder},
                                         allow_unmatched_keys=True)
if torch.cuda.is_available():
    cuda_device = 0
else:
    cuda_device = -1

print(cuda_device)
mymodel = BERTWino(word_embeddings, vocab, cuda_device)
if cuda_device >= 0:
    mymodel = mymodel.cuda(cuda_device)

optimizer = optim.Adam(mymodel.parameters(), lr=LR)

iterator = BucketIterator(batch_size=BATCH,
                          sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

trainer = Trainer(model=mymodel,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  num_epochs=EPOCHS,
                  cuda_device=cuda_device)
indexer = PretrainedBertIndexer(pretrained_model="bert-base-cased",
                                do_lowercase=False
                                #max_pieces=config.max_seq_length
                                )

trainer.train()
Example #29
def main():
    args = get_args()

    # TODO: add char n-gram embeddings
    if args.embedding == 'elmo':
        token_indexer = ELMoTokenCharactersIndexer()
    else:
        token_indexer = SingleIdTokenIndexer()
    # Kaggle's multi-label "Toxic Comment Classification Challenge"
    reader = JigsawDatasetReader(tokenizer=None,
                                 token_indexers={"tokens": token_indexer},
                                 max_seq_len=200)

    dataset_root = Path('../../data/jigsaw')
    train_dataset, dev_dataset = (reader.read(
        dataset_root / fname) for fname in ["train.csv", "test_proced.csv"])

    print(
        f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}"
    )

    # Build the vocabulary from the datasets
    # if args.embedding == 'elmo':
    #     vocab = Vocabulary()
    # else:
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)

    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    # Build the token embedding
    token_embedding = None
    print(f"embedding dim: {args.embedding_dim}")
    if args.embedding == 'random':
        token_embedding = Embedding(num_embeddings=vocab_dim,
                                    embedding_dim=args.embedding_dim)
    elif args.embedding == 'glove':
        glove_embeddings_file = '~/nlp/pretrainedEmbeddings/glove/glove.6B.100d.txt'
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({
                                                    'pretrained_file': glove_embeddings_file,
                                                    'embedding_dim': args.embedding_dim,
                                                    'trainable': False
                                                }))
    elif args.embedding == 'elmo':
        # pretrained elmo LM model, transformed from bilm-tf with dump_weights in bin/training.py
        options_file = '~/nlp/pretrainedEmbeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json'
        weight_file = '~/nlp/pretrainedEmbeddings/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

        token_embedding = ElmoTokenEmbedder(options_file,
                                            weight_file,
                                            requires_grad=True,
                                            do_layer_norm=False)

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    if args.embedding == 'elmo':
        args.embedding_dim = word_embeddings.get_output_dim()

    # Build the seq2vec encoder
    if args.encoder == 'lstm':
        hidden_dim = 256
        encoder = PytorchSeq2VecWrapper(
            torch.nn.LSTM(args.embedding_dim,
                          hidden_dim,
                          bidirectional=True,
                          batch_first=True))
    elif args.encoder == 'cnn':
        encoder = CnnEncoder(
            embedding_dim=args.embedding_dim,
            num_filters=128,
            ngram_filter_sizes=(2, 3, 4, 5, 6, 7),
        )
    else:
        encoder = None

    # Build the main classification network
    if args.network is None:
        model = MultiLabelClassifier(
            word_embeddings,
            0.5,
            encoder,
            0.2,
            vocab=vocab,
            out_dim=6,
        )
    elif args.network == 'bcn':
        # TODO: instantiate the classifier network directly in code instead of via params
        bcn_params = {
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "pretrained_file":
                        "/home/lirui/nlp/document-qa/data/glove/glove.840B.300d.txt",
                        "type": "embedding",
                        "embedding_dim": 300,
                        "trainable": False
                    }
                }
            },
            "embedding_dropout": 0.5,
            "pre_encode_feedforward": {
                "input_dim": 300,
                "num_layers": 1,
                "hidden_dims": [300],
                "activations": ["relu"],
                "dropout": [0.25]
            },
            "encoder": {
                "type": "lstm",
                "input_size": 300,
                "hidden_size": 300,
                "num_layers": 1,
                "bidirectional": True
            },
            "integrator": {
                "type": "lstm",
                "input_size": 1800,
                "hidden_size": 300,
                "num_layers": 1,
                "bidirectional": True
            },
            "integrator_dropout": 0.1,
            # "elmo": {
            #     "options_file": "/home/lirui/nlp/learning_allenNLP/learning_allennlp/models/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json",
            #     "weight_file": "/home/lirui/nlp/learning_allenNLP/learning_allennlp/models/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
            #     "do_layer_norm": False,
            #     "dropout": 0.0,
            #     "num_output_representations": 1
            # },
            # "use_input_elmo": False,
            # "use_integrator_output_elmo": False,
            "output_layer": {
                "input_dim": 2400,
                "num_layers": 3,
                "output_dims": [1200, 600, 5],
                "pool_sizes": 4,
                "dropout": [0.2, 0.3, 0.0]
            }
        }
        model = BiattentiveClassificationNetwork.from_params(
            vocab, params=Params(bcn_params))

    # Training settings
    gpu_id = args.gpu_id if torch.cuda.is_available() else -1
    if gpu_id > -1: model.cuda(gpu_id)

    # Build the iterator and index it with the vocab
    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=dev_dataset,
        grad_norm=5.0,
        # validation_metric='+accuracy',
        cuda_device=gpu_id,
        patience=5,
        num_epochs=args.n_epochs)
    trainer.train()
def main():
    parser = argparse.ArgumentParser(
        description='Evidence sentence classifier')
    parser.add_argument('--epochs',
                        type=int,
                        default=5,
                        help='upper epoch limit (default: 5)')
    parser.add_argument('--patience',
                        type=int,
                        default=1,
                        help='trainer patience  (default: 1)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=8,
                        help='batch size (default: 8)')
    parser.add_argument(
        '--loss',
        type=str,
        default='hinge',
        help='loss function to train the model - choose bce or hinge (default: hinge)'
    parser.add_argument(
        '--hinge_margin',
        type=float,
        default=0.5,
        help='the margin for the hinge loss, if used (default: 0.5)')
    parser.add_argument('--model_name',
                        type=str,
                        default='ev_classifier_bert',
                        help='model name (default: ev_classifier_bert)')
    parser.add_argument(
        '--tunable',
        action='store_true',
        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    if args.loss not in ['bce', 'hinge']:
        print('Loss must be bce or hinge')
        return

    bert_token_indexer = {
        'bert': PretrainedBertIndexer('scibert/vocab.txt', max_pieces=512)
    }

    pipeline_train = pickle.load(open('data/train_instances.p', 'rb'))
    pipeline_val = pickle.load(open('data/val_instances.p', 'rb'))
    pipeline_test = pickle.load(open('data/test_instances.p', 'rb'))

    pipeline_reader = PipelineDatasetReader(bert_token_indexer)
    p_train = pipeline_reader.read(pipeline_train)
    p_val = pipeline_reader.read(pipeline_val)
    p_test = pipeline_reader.read(pipeline_test)

    p_vocab = Vocabulary.from_instances(p_train + p_val + p_test)

    classifier_train = pickle.load(open('data/classifier_train.p', 'rb'))
    classifier_val = pickle.load(open('data/classifier_val.p', 'rb'))

    reader = EvidenceDatasetReader(bert_token_indexer)
    train_data = reader.read(classifier_train)
    valid_data = reader.read(classifier_val)

    bert_token_embedding = PretrainedBertEmbedder('scibert/weights.tar.gz',
                                                  requires_grad=args.tunable)

    word_embeddings = BasicTextFieldEmbedder({"bert": bert_token_embedding},
                                             {"bert": ['bert']},
                                             allow_unmatched_keys=True)

    model = Classifier(word_embeddings=word_embeddings,
                       vocab=p_vocab,
                       loss=args.loss,
                       hinge_margin=args.hinge_margin)

    cuda_device = list(range(torch.cuda.device_count()))

    if torch.cuda.is_available():
        model = model.cuda()
    else:
        cuda_device = -1

    t_total = len(train_data) // args.epochs

    optimizer = BertAdam(model.parameters(),
                         lr=2e-5,
                         warmup=0.1,
                         t_total=t_total)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('comb_evidence', 'num_tokens')],
                              padding_noise=0.1,
                              biggest_batch_first=True)
    iterator.index_with(p_vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_data,
        validation_dataset=valid_data,
        patience=args.patience,
        validation_metric='+accuracy',
        num_epochs=args.epochs,
        cuda_device=cuda_device,
        # learning_rate_scheduler=scheduler,
        serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))
def main():
    parser = utils.opt_parser.get_trainer_opt_parser()
    parser.add_argument('models',
                        nargs='*',
                        help='pretrained models for the same setting')
    parser.add_argument('--test', action="store_true", help='use testing mode')
    parser.add_argument('--emb-dim',
                        type=int,
                        help='basic embedding dimension')
    parser.add_argument('--act-max-layer',
                        type=int,
                        help='maximum number of stacked layers')
    parser.add_argument('--use-act',
                        action="store_true",
                        help='Use adaptive computation time for decoder')
    parser.add_argument('--act-loss-weight',
                        type=float,
                        help="the loss of the act weights")

    parser.add_argument('--enc-layers', type=int, help="layers in encoder")
    parser.add_argument('--act-mode',
                        choices=['basic', 'random', 'mean_field'])
    parser.add_argument('--encoder', choices=['transformer', 'lstm', 'bilstm'])
    parser.add_argument(
        '--decoder',
        choices=['lstm', 'rnn', 'gru', 'ind_rnn', 'n_lstm', 'n_gru'],
    )
    parser.add_argument('--dec-cell-height',
                        type=int,
                        help="the height for n_layer lstm/gru")

    args = parser.parse_args()

    reader = data_adapter.GeoQueryDatasetReader()
    training_set = reader.read(config.DATASETS[args.dataset].train_path)
    try:
        validation_set = reader.read(config.DATASETS[args.dataset].dev_path)
    except Exception:
        validation_set = None

    vocab = allennlp.data.Vocabulary.from_instances(training_set)
    if args.epoch:
        config.TRAINING_LIMIT = args.epoch
    if args.device:
        config.DEVICE = args.device
    st_ds_conf = get_updated_settings(args)

    model = get_model(vocab, st_ds_conf)

    if args.models:
        model.load_state_dict(torch.load(args.models[0]))

    if not args.test or not args.models:
        iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")
                                                ],
                                  batch_size=st_ds_conf['batch_sz'])
        iterator.index_with(vocab)

        optim = torch.optim.Adam(model.parameters(),
                                 lr=config.ADAM_LR,
                                 betas=config.ADAM_BETAS,
                                 eps=config.ADAM_EPS)

        savepath = os.path.join(
            config.SNAPSHOT_PATH, args.dataset, 'ada_trans2seq',
            datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + "--" +
            args.memo)
        if not os.path.exists(savepath):
            os.makedirs(savepath, mode=0o755)

        trainer = allennlp.training.Trainer(
            model=model,
            optimizer=optim,
            iterator=iterator,
            train_dataset=training_set,
            validation_dataset=validation_set,
            serialization_dir=savepath,
            cuda_device=config.DEVICE,
            num_epochs=config.TRAINING_LIMIT,
            grad_clipping=config.GRAD_CLIPPING,
        )

        trainer.train()

    else:
        testing_set = reader.read(config.DATASETS[args.dataset].test_path)
        model.eval()

        if config.DEVICE > -1:
            model = model.cuda(config.DEVICE)

        predictor = allennlp.predictors.SimpleSeq2SeqPredictor(model, reader)

        for instance in tqdm.tqdm(testing_set, total=len(testing_set)):
            print('SRC: ', instance.fields['source_tokens'].tokens)
            print(
                'GOLD:', ' '.join(
                    str(x)
                    for x in instance.fields['target_tokens'].tokens[1:-1]))
            del instance.fields['target_tokens']
            output = predictor.predict_instance(instance)
            print('PRED:', ' '.join(output['predicted_tokens']))
Example #32
def get_accuracy_detection(model,
                           dev_dataset,
                           vocab,
                           trigger_token_ids=None,
                           snli=False,
                           get_threshold=False,
                           verbose=False):
    """
    When trigger_token_ids is None, gets accuracy on the dev_dataset. Otherwise, gets accuracy with
    triggers prepended for the whole dev_dataset.
    """
    model.get_metrics(reset=True)
    model.eval()  # model should be in eval() already, but just in case

    clean_dataset = []
    adv_dataset = []
    for data in dev_dataset:
        fields = {}
        fields['tokens'] = data['tokens']
        fields['label'] = LabelField(0, skip_indexing=True)
        fields['adv'] = LabelField(0, skip_indexing=True)
        clean_dataset.append(Instance(fields))

        fields = {}
        fields['tokens'] = data['tokens']
        fields['label'] = LabelField(1, skip_indexing=True)
        fields['adv'] = LabelField(1, skip_indexing=True)
        adv_dataset.append(Instance(fields))

    if snli:
        iterator = BucketIterator(batch_size=128,
                                  sorting_keys=[("premise", "num_tokens")])
    else:
        iterator = BucketIterator(batch_size=128,
                                  sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    print_string = []
    for idx in trigger_token_ids:
        print_string += [vocab.get_token_from_index(idx)]

    logits = []
    labels = []
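    # score the clean instances without triggers first, then the adversarial copies
    # with the trigger tokens prepended; logits and labels from both passes are concatenated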
    for batch in lazy_groups_of(iterator(clean_dataset,
                                         num_epochs=1,
                                         shuffle=False),
                                group_size=1):
        output = evaluate_batch(model, batch, None, snli)
        logits.append(output['logits'].detach().cpu().numpy())
        labels.append(output['labels'].detach().cpu().numpy())

    for batch in lazy_groups_of(iterator(adv_dataset,
                                         num_epochs=1,
                                         shuffle=False),
                                group_size=1):
        output = evaluate_batch(model, batch, trigger_token_ids, snli)
        logits.append(output['logits'].detach().cpu().numpy())
        labels.append(output['labels'].detach().cpu().numpy())

    logits = np.concatenate(logits, 0)
    labels = np.concatenate(labels, 0)

    num = int(len(labels) / 2)
    if not model.use_cosine:
        if len(logits.shape) > 1:
            preds_int = np.argmax(logits, 1)
            preds_int[preds_int > 0] = 1
            scores = preds_int
        else:
            if "use" in str(type(model)).lower() and model.threshold:
                best_threshold = model.threshold
                preds_int = logits <= best_threshold
                scores = preds_int
                print(logits)
            else:
                fpr, tpr, thresholds = roc_curve(labels, logits)
                gmeans = np.sqrt(tpr * (1 - fpr))
                idx = np.argmax(gmeans)
                best_threshold = thresholds[idx]
                print("threshold", best_threshold)
                print("Median", np.median(logits))
                print("TPR:", tpr[idx])
                print("FPR:", fpr[idx])
                preds_int = logits >= best_threshold
                scores = logits

    else:
        preds_int = (logits >= 0.5)  # need to find threshold
        scores = logits

    acc = accuracy_score(labels, preds_int)
    auc = roc_auc_score(labels, scores)

    remain_clean = np.where(preds_int[:num] == 0)[0]
    remain_adv = np.where(preds_int[num:] == 0)[0]

    return acc, auc, remain_clean, remain_adv
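
# A minimal, hypothetical usage sketch for the helper above; the model, dataset,
# vocabulary and trigger ids are assumed to come from the surrounding training
# script and are not defined in this snippet.
def report_detection_metrics(model, dev_dataset, vocab, trigger_token_ids):
    acc, auc, remain_clean, remain_adv = get_accuracy_detection(
        model, dev_dataset, vocab, trigger_token_ids=trigger_token_ids, snli=False)
    print("detection accuracy: {:.3f}, AUC: {:.3f}".format(acc, auc))
    print("clean examples predicted as clean:", len(remain_clean))
    print("adversarial examples predicted as clean (missed):", len(remain_adv))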
Example #33
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})


lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = LstmTagger(word_embeddings, lstm, vocab)

if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
optimizer = optim.SGD(model.parameters(), lr=0.1)


iterator = BucketIterator(batch_size=2)
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=100,
                  cuda_device=cuda_device)
trainer.train()


predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
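
# The argmax above yields integer tag ids; as a hedged follow-up (assuming the tags
# live in the default 'labels' namespace used by SequenceLabelField), they can be
# mapped back to tag strings through the vocabulary:
print([vocab.get_token_from_index(i, 'labels') for i in tag_ids])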
Example #34
def main():
    parser = utils.opt_parser.get_trainer_opt_parser()
    parser.add_argument('models',
                        nargs='*',
                        help='pretrained models for the same setting')
    parser.add_argument('--enc-layers',
                        type=int,
                        default=1,
                        help="encoder layer number defaulted to 1")
    parser.add_argument('--test', action="store_true", help='use testing mode')
    parser.add_argument('--use-dev', action="store_true")

    args = parser.parse_args()

    reader = data_adapter.GeoQueryDatasetReader()
    training_set = reader.read(config.DATASETS[args.dataset].train_path)
    if args.use_dev:
        validation_set = reader.read(config.DATASETS[args.dataset].dev_path)

    vocab = allennlp.data.Vocabulary.from_instances(training_set)
    st_ds_conf = config.SEQ2SEQ_CONF[args.dataset]
    if args.epoch:
        config.TRAINING_LIMIT = args.epoch
    bsz = st_ds_conf['batch_sz']
    emb_sz = st_ds_conf['emb_sz']

    src_embedder = BasicTextFieldEmbedder(
        token_embedders={
            "tokens": Embedding(vocab.get_vocab_size('nltokens'), emb_sz)
        })

    encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(emb_sz,
                      emb_sz,
                      num_layers=args.enc_layers,
                      batch_first=True))

    model = allennlp.models.SimpleSeq2Seq(
        vocab,
        source_embedder=src_embedder,
        encoder=encoder,
        max_decoding_steps=st_ds_conf['max_decoding_len'],
        attention=allennlp.modules.attention.DotProductAttention(),
        beam_size=8,
        target_namespace="lftokens",
        use_bleu=True)

    if args.models:
        model.load_state_dict(torch.load(args.models[0]))

    if not args.test or not args.models:
        iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")],
                                  batch_size=bsz)
        iterator.index_with(vocab)

        optim = torch.optim.Adam(model.parameters())

        savepath = os.path.join(
            config.SNAPSHOT_PATH, args.dataset, 'seq2seq',
            datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + "--" +
            args.memo)
        if not os.path.exists(savepath):
            os.makedirs(savepath, mode=0o755)

        trainer = allennlp.training.Trainer(
            model=model,
            optimizer=optim,
            iterator=iterator,
            train_dataset=training_set,
            validation_dataset=validation_set if args.use_dev else None,
            serialization_dir=savepath,
            cuda_device=args.device,
            num_epochs=config.TRAINING_LIMIT,
        )

        trainer.train()

    else:
        testing_set = reader.read(config.DATASETS[args.dataset].test_path)
        model.eval()

        predictor = allennlp.predictors.SimpleSeq2SeqPredictor(model, reader)

        for instance in testing_set:
            print('SRC: ', instance.fields['source_tokens'].tokens)
            print(
                'GOLD:', ' '.join(
                    str(x)
                    for x in instance.fields['target_tokens'].tokens[1:-1]))
            print(
                'PRED:', ' '.join(
                    predictor.predict_instance(instance)['predicted_tokens']))
Example #35
def train_epoch(model, train_dataset, validation_dataset, batch_size,
                optimizer, log_period, validation_period, save_dir, log_dir,
                cuda):
    """
    Train the model for one epoch.
    """
    # Set model to train mode (turns on dropout and such).
    model.train()
    # Create objects for calculating metrics.
    span_start_accuracy_metric = CategoricalAccuracy()
    span_end_accuracy_metric = CategoricalAccuracy()
    span_accuracy_metric = BooleanAccuracy()
    squad_metrics = SquadEmAndF1()
    # Create Tensorboard logger.
    writer = SummaryWriter(log_dir)

    # Build iterator, and have it bucket batches by passage / question length.
    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("passage", "num_tokens"),
                                            ("question", "num_tokens")])
    num_training_batches = iterator.get_num_batches(train_dataset)
    # Get a generator of train batches.
    train_generator = tqdm(iterator(train_dataset,
                                    num_epochs=1,
                                    cuda_device=0 if cuda else -1),
                           total=num_training_batches,
                           leave=False)
    log_period_losses = 0

    for batch in train_generator:
        # Extract the relevant data from the batch.
        passage = batch["passage"]["tokens"]
        question = batch["question"]["tokens"]
        span_start = batch["span_start"]
        span_end = batch["span_end"]
        metadata = batch.get("metadata", {})

        # Run data through model to get start and end logits.
        output_dict = model(passage, question)
        start_logits = output_dict["start_logits"]
        end_logits = output_dict["end_logits"]
        softmax_start_logits = output_dict["softmax_start_logits"]
        softmax_end_logits = output_dict["softmax_end_logits"]

        # Calculate loss for start and end indices.
        loss = nll_loss(softmax_start_logits, span_start.view(-1))
        loss += nll_loss(softmax_end_logits, span_end.view(-1))
        log_period_losses += loss.item()

        # Backprop and take a gradient step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        model.global_step += 1

        # Calculate categorical span start and end accuracy.
        span_start_accuracy_metric(start_logits, span_start.view(-1))
        span_end_accuracy_metric(end_logits, span_end.view(-1))
        # Compute the best span, and calculate overall span accuracy.
        best_span = get_best_span(start_logits, end_logits)
        span_accuracy_metric(best_span, torch.stack([span_start, span_end],
                                                    -1))
        # Calculate EM and F1 scores
        calculate_em_f1(best_span, metadata, passage.size(0), squad_metrics)

        if model.global_step % log_period == 0:
            # Calculate metrics on train set.
            loss = log_period_losses / log_period
            span_start_accuracy = span_start_accuracy_metric.get_metric(
                reset=True)
            span_end_accuracy = span_end_accuracy_metric.get_metric(reset=True)
            span_accuracy = span_accuracy_metric.get_metric(reset=True)
            em, f1 = squad_metrics.get_metric(reset=True)
            tqdm_description = _make_tqdm_description(loss, em, f1)
            # Log training statistics to progress bar
            train_generator.set_description(tqdm_description)
            # Log training statistics to Tensorboard
            log_to_tensorboard(writer, model.global_step, "train", loss,
                               span_start_accuracy, span_end_accuracy,
                               span_accuracy, em, f1)
            log_period_losses = 0

        if model.global_step % validation_period == 0:
            # Calculate metrics on validation set.
            (loss, span_start_accuracy, span_end_accuracy, span_accuracy, em,
             f1) = evaluate(model, validation_dataset, batch_size, cuda)
            # Save a checkpoint.
            save_name = ("{}_step_{}_loss_{:.3f}_"
                         "em_{:.3f}_f1_{:.3f}.pth".format(
                             model.__class__.__name__, model.global_step, loss,
                             em, f1))
            save_model(model, save_dir, save_name)
            # Log validation statistics to Tensorboard.
            log_to_tensorboard(writer, model.global_step, "validation", loss,
                               span_start_accuracy, span_end_accuracy,
                               span_accuracy, em, f1)
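
# The snippet relies on a log_to_tensorboard helper that is not shown here; a minimal
# sketch compatible with the calls above (the scalar tag names are assumptions, not the
# original implementation) could look like this:
def log_to_tensorboard(writer, step, prefix, loss, span_start_accuracy,
                       span_end_accuracy, span_accuracy, em, f1):
    """Write scalar metrics for one logging period under a common train/validation prefix."""
    writer.add_scalar(prefix + "/loss", loss, step)
    writer.add_scalar(prefix + "/span_start_accuracy", span_start_accuracy, step)
    writer.add_scalar(prefix + "/span_end_accuracy", span_end_accuracy, step)
    writer.add_scalar(prefix + "/span_accuracy", span_accuracy, step)
    writer.add_scalar(prefix + "/EM", em, step)
    writer.add_scalar(prefix + "/F1", f1, step)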
Example #36
def main():
    print("===experiment starts===")
    exp_start_time = time.time()
    P = Params()
    opts = P.opts
    experiment_logdir = experiment_logger(args=opts)
    print("experiment_logdir:", experiment_logdir)
    P.dump_params(experiment_dir=experiment_logdir)
    cuda_devices = cuda_device_parser(str_ids=opts.cuda_devices)
    TRAIN_WORLDS, DEV_WORLDS, TEST_WORLDS = worlds_loader(args=opts)

    vocab = Vocabulary()
    iterator_for_training_and_evaluating_mentions = BucketIterator(batch_size=opts.batch_size_for_train,
                                                                   sorting_keys=[('context', 'num_tokens')])
    iterator_for_training_and_evaluating_mentions.index_with(vocab)

    embloader = EmbLoader(args=opts)
    emb_mapper, emb_dim, textfieldEmbedder = embloader.emb_returner()
    tokenIndexing = TokenIndexerReturner(args=opts)
    global_tokenizer = tokenIndexing.berttokenizer_returner()
    global_tokenIndexer = tokenIndexing.token_indexer_returner()

    mention_encoder = Pooler_for_mention(args=opts, word_embedder=textfieldEmbedder)
    entity_encoder = Pooler_for_title_and_desc(args=opts, word_embedder=textfieldEmbedder)
    model = Biencoder(args=opts, mention_encoder=mention_encoder, entity_encoder=entity_encoder, vocab=vocab)
    model = model.cuda()
    optimizer = optim.Adam(filter(lambda param: param.requires_grad, model.parameters()), lr=opts.lr, eps=opts.epsilon,
                           weight_decay=opts.weight_decay, betas=(opts.beta1, opts.beta2), amsgrad=opts.amsgrad)
    devEvalEpochs = [j for j in range(1, 1000)] if opts.add_hard_negatives else \
                    [1, 3, 5] + [k * 10 for k in range(1, 100)]

    for epoch in range(opts.num_epochs):
        oneep_train_start = time.time()
        for world_name in TRAIN_WORLDS:
            reader = WorldsReader(args=opts, world_name=world_name, token_indexers=global_tokenIndexer, tokenizer=global_tokenizer)

            if opts.add_hard_negatives:
                with torch.no_grad():
                    mention_encoder.eval(), entity_encoder.eval()
                    hardNegativeSearcher = HardNegativesSearcherForEachEpochStart(args=opts, world_name=world_name,
                                                                                  reader=reader,
                                                                                  embedder=textfieldEmbedder,
                                                                                  mention_encoder=mention_encoder,
                                                                                  entity_encoder=entity_encoder, vocab=vocab,
                                                                                  berttokenizer=global_tokenizer,
                                                                                  bertindexer=global_tokenIndexer)
                    hardNegativeSearcher.hardNegativesSearcherandSetter()

            trains = reader.read('train')
            mention_encoder.train(), entity_encoder.train()
            trainer = Trainer(model=model, optimizer=optimizer,
                              iterator=iterator_for_training_and_evaluating_mentions, train_dataset=trains,
                              cuda_device=cuda_devices, num_epochs=1
                              )
            trainer.train()

        if epoch + 1 in devEvalEpochs:
            print('\n===================\n', 'TEMP DEV EVALUATION@ Epoch', epoch + 1,'\n===================\n')
            t_entire_h1c, t_entire_h10c, t_entire_h50c, t_entire_h64c, t_entire_h100c, t_entire_h500c, t_entire_datapoints \
                = oneLineLoaderForDevOrTestEvaluation(
                dev_or_test_flag='dev',
                opts=opts,
                global_tokenIndexer=global_tokenIndexer,
                global_tokenizer=global_tokenizer,
                textfieldEmbedder=textfieldEmbedder,
                mention_encoder=mention_encoder,
                entity_encoder=entity_encoder,
                vocab=vocab,
                experiment_logdir=experiment_logdir,
                finalEvalFlag=0,
                trainEpoch=epoch+1)
            devEvalExperimentEntireDevWorldLog(experiment_logdir, t_entire_h1c, t_entire_h10c,
                                               t_entire_h50c, t_entire_h64c, t_entire_h100c,
                                               t_entire_h500c, t_entire_datapoints,
                                               epoch=epoch)
        oneep_train_end = time.time()
        print('epoch {0} train time'.format(epoch+1), oneep_train_end - oneep_train_start, 'sec')

    print('====training finished=======')

    with torch.no_grad():
        model.eval()
        print('===FINAL Evaluation starts===')

        for dev_or_test_flag in ['dev','test']:
            print('\n===================\n', dev_or_test_flag, 'EVALUATION', '\n===================\n')
            entire_h1c, entire_h10c, entire_h50c, entire_h64c, entire_h100c, entire_h500c, entire_datapoints \
                = oneLineLoaderForDevOrTestEvaluation(dev_or_test_flag=dev_or_test_flag,
                                                      opts=opts,
                                                      global_tokenIndexer=global_tokenIndexer,
                                                      global_tokenizer=global_tokenizer,
                                                      textfieldEmbedder=textfieldEmbedder,
                                                      mention_encoder=mention_encoder,
                                                      entity_encoder=entity_encoder,
                                                      vocab=vocab,
                                                      experiment_logdir=experiment_logdir,
                                                      finalEvalFlag=1,
                                                      trainEpoch=-1)

            dev_or_test_finallog(entire_h1c, entire_h10c, entire_h50c, entire_h64c, entire_h100c,
                                 entire_h500c, entire_datapoints, dev_or_test_flag, experiment_logdir,
                                 )

    exp_end_time = time.time()
    print('===experiment finished', exp_end_time - exp_start_time, 'sec')
    print(experiment_logdir)
Example #37
from embeddings import get_token_utils, get_embedder
from dataloaders import ReutersDataSetReader, NewsGroupsDataSetReader

token_indexers, tokenizer = get_token_utils()
# reader = ReutersDataSetReader(tokenizer=tokenizer,  # TODO: key for token_indexers
#                               token_indexers={'tokens': token_indexers})
# train_ds, test_ds = [reader.read(fname) for fname in ['train.json', 'test.json']]
reader = NewsGroupsDataSetReader(
    tokenizer=tokenizer,  # TODO: key for token_indexers
    token_indexers={'tokens': token_indexers})
train_ds, test_ds = [reader.read(fname) for fname in ['train', 'test']]
val_ds = None

voc = Vocabulary()

iterator = BucketIterator(batch_size=config.batch_size,
                          sorting_keys=[('sentence', 'num_tokens')])
iterator.index_with(vocab=voc)

# 2. Build the model

word_embeddings = get_embedder()

encoder = get_encoder(voc, word_embeddings.get_output_dim())

model = BaseModelWithoutKnowledge(voc=voc,
                                  word_embeddings=word_embeddings,
                                  encoder=encoder,
                                  out_sz=reader.label_length,
                                  multi=False)
model = model.cuda(cuda_device) if cuda_device > -1 else model
# 3. Training
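# The snippet is truncated here; a minimal, assumed training setup in the spirit of the
# other examples on this page (torch.optim imported as optim and allennlp's Trainer are
# assumed to be imported above; the learning rate and epoch count are placeholders):
optimizer = optim.Adam(model.parameters(), lr=1e-3)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_ds,
                  validation_dataset=val_ds,
                  cuda_device=cuda_device,
                  num_epochs=10)
trainer.train()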
"""
############### Instantiate the model and optimizer ##################
"""

model = Ncut.NameCountryModel(cf_a, vocab)
optimizer = optim.SGD(model.parameters(), lr=0.01)
cf_a.optimizer = optimizer

model.to(device = device, dtype = dtype)
"""
############ Iterator that will get the samples for the problem #############
"""
batch_size=10
batch_size_validation = 100

iterator = BucketIterator(batch_size=batch_size, sorting_keys=[("text_field", "num_tokens")])
iterator.index_with(vocab)

iterator_validation = BucketIterator(batch_size=batch_size_validation, sorting_keys=[("text_field", "num_tokens")])
iterator_validation.index_with(vocab)

num_batches = int(np.floor(len(train_dataset)/batch_size))
num_batches_validation = int(np.floor(len(validation_dataset)/batch_size_validation))
# Create the iterator over the data:
batches_iterable = iterator(train_dataset)
batches_iterable_validation = iterator_validation(validation_dataset)

"""
##############################################################################
######################### TRAINING #######################################
Probably should not use this one because we want more features for the Bayesian elements.
"""
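# A minimal manual-loop sketch (instead of the stock Trainer), assuming the model follows
# the usual AllenNLP convention of returning a dict with a "loss" entry when gold labels
# are present; the epoch count is a placeholder, device placement is glossed over, and
# the exact output keys depend on NameCountryModel.
num_epochs = 5
for epoch in range(num_epochs):
    for _ in range(num_batches):
        batch = next(batches_iterable)
        optimizer.zero_grad()
        output = model(**batch)
        output["loss"].backward()
        optimizer.step()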
def tokenizer(x):
    return [w.text for w in SpacyWordSplitter(language='en_core_web_sm',
                                               pos_tags=False).split_words(x)[:config.max_seq_len]]

reader = JigsawDataReader(tokenizer=tokenizer, token_indexers={'tokens': token_indexers})

train_ds, test_ds = (reader.read(DATA_PATH + w) for w in ["train.csv", "test_proced.csv"])
val_ds = None

vars(train_ds[0].fields['tokens'])

from allennlp.data.vocabulary import Vocabulary
vocab = Vocabulary.from_instances(train_ds, max_vocab_size=config.max_vocab_size)

from allennlp.data.iterators import BucketIterator
iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[('tokens', 'num_tokens')])

iterator.index_with(vocab)

batch = next(iter(iterator(train_ds)))

from allennlp.modules.seq2vec_encoders import Seq2VecEncoder,PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model

class BaslinModel(Model):
    def __init__(self, word_embeddings, encoder, out_sz=len(label_cols)):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
Example #40
#### For embedding the tokens we'll just use the BasicTextFieldEmbedder which takes a mapping from index names to embeddings. If you go back to where we defined our DatasetReader, the default parameters included a single index called "tokens", so our mapping just needs an embedding corresponding to that index. We use the Vocabulary to find how many embeddings we need and our EMBEDDING_DIM parameter to specify the output dimension. It's also possible to start with pre-trained embeddings (for example, GloVe vectors), but there's no need to do that on this tiny toy dataset.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
#### We next need to specify the sequence encoder. The need for PytorchSeq2SeqWrapper here is slightly unfortunate (and if you use configuration files, https://github.com/allenai/allennlp/blob/master/tutorials/tagger/README.md#using-config-files, you won't need to worry about it) but here it's required to add some extra functionality (and a cleaner interface) to the built-in PyTorch module. In AllenNLP we do everything batch first, so we specify that as well.
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

#### Finally, we can instantiate the model.
model = LstmTagger(word_embeddings, lstm, vocab)

#### Now we're ready to train the model. The first thing we'll need is an optimizer. We can just use PyTorch's stochastic gradient descent.
optimizer = optim.SGD(model.parameters(), lr=0.1)

#### And we need a DataIterator that handles batching for our datasets. The BucketIterator sorts instances by the specified fields in order to create batches with similar sequence lengths. Here we indicate that we want to sort the instances by the number of tokens in the sentence field.
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])
#### We also specify that the iterator should make sure its instances are indexed using our vocabulary; that is, that their strings have been converted to integers using the mapping we previously created.
iterator.index_with(vocab)
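
#### As a quick sanity check (not part of the original tutorial), you can pull one batch
#### from the indexed iterator and look at the padded token tensor it produces.
check_batch = next(iter(iterator(train_dataset)))
print(check_batch["sentence"]["tokens"].shape)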

#### Now we instantiate our Trainer and run it. Here we tell it to run for 1000 epochs and to stop training early if it ever spends 10 epochs without the validation metric improving. The default validation metric is loss (which improves by getting smaller), but it's also possible to specify a different metric and direction (e.g. accuracy should get bigger).
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000)

#### When we launch it it will print a progress bar for each epoch that includes both the "loss" and the "accuracy" metric. If our model is good, the loss should go down and the accuracy up as we train.
trainer.train()
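
#### After training, the tagger can be queried through a predictor, in the same spirit as
#### the shorter snippet earlier on this page; SentenceTaggerPredictor, numpy (np) and the
#### dataset reader are assumed to be available from the truncated top of this example.
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
tag_logits = predictor.predict("The dog ate the apple")['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
print([vocab.get_token_from_index(i, 'labels') for i in tag_ids])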
Example #41
        word = vocab.get_token_from_index(i, 'tokens')
        if word in word_vector.vocab:
            pretrained_weight[vocab.get_token_index(word)] = word_vector[word]
    del word_vector

    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=args.embedding_size,
        weight=torch.from_numpy(pretrained_weight).float())
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    model = ATAE(args, word_embeddings, vocab)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.learning_rate,
                           weight_decay=1e-5)
    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[("trigger_0", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=eval_dataset,
        num_epochs=args.epochs,
        patience=args.patience,  # stop training before loss raise
        cuda_device=args.cuda_device,  # cuda device id
    )

    # start train
    metrics = trainer.train()
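
    # Trainer.train() returns a metrics dict in this AllenNLP version; a small, hedged
    # follow-up just reports it (specific keys such as "best_validation_loss" depend on
    # whether a validation set and early stopping were configured).
    print("final training metrics:", metrics)
    print("best validation loss:", metrics.get("best_validation_loss"))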