Example 1
def evaluate(model, batches):

    mrr_value = []
    model.train(False)
    for iteration in range(len(batches)):

        batch = batches[iteration]
        batch_candidates = batch["candidates"]
        batch_answer_indices = batch['answer_indices']

        for index, query in enumerate(batch['queries']):

            # query tokens
            batch_query = variable(torch.LongTensor(query), volatile=True)
            batch_query_length = [batch['qlengths'][index]]
            batch_query_ner = variable(torch.LongTensor(batch['q_ner'][index]))
            batch_query_pos = variable(torch.LongTensor(batch['q_pos'][index]))

            #Sort the candidates by length
            batch_candidate_lengths = np.array(
                batch_candidates["anslengths"][index])
            candidate_sort = np.argsort(batch_candidate_lengths)[::-1].copy()
            batch_candidates_sorted = variable(torch.LongTensor(
                batch_candidates["answers"][index][candidate_sort, ...]),
                                               volatile=True)
            batch_candidate_lengths_sorted = batch_candidate_lengths[
                candidate_sort]
            batch_candidate_ner_sorted = variable(
                torch.LongTensor(batch_candidates['ner'][index][candidate_sort,
                                                                ...]))
            batch_candidate_pos_sorted = variable(
                torch.LongTensor(batch_candidates['pos'][index][candidate_sort,
                                                                ...]))

            batch_len = len(batch_candidate_lengths_sorted)
            batch_candidate_unsort = variable(torch.LongTensor(
                np.argsort(candidate_sort)),
                                              volatile=True)
            batch_metrics = variable(torch.FloatTensor(
                batch['metrics'][index]),
                                     volatile=True)
            indices = model.eval(
                batch_query, batch_query_ner, batch_query_pos,
                batch_query_length, batch_candidates_sorted,
                batch_candidate_ner_sorted, batch_candidate_pos_sorted,
                batch_candidate_lengths_sorted, batch_candidate_unsort,
                batch_answer_indices[index], batch_metrics, batch_len)

            mrr_value.append(computeMRR(indices, batch_answer_indices, index))

    mean_rr = np.mean(mrr_value)
    print("MRR :{0}".format(mean_rr))
    model.train(True)
    return mean_rr
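
The computeMRR helper called above is not part of the snippet. A minimal sketch consistent with its call site, assuming indices is the model's candidate ranking (best first, a 1-D LongTensor wrapped in a Variable) and the gold candidate id sits at batch_answer_indices[index]; the tensor layout is an assumption:

import numpy as np

def computeMRR(indices, batch_answer_indices, index):
    # Assumed layout: `indices` ranks candidate ids best-first.
    ranking = indices.data.cpu().view(-1).numpy()
    gold = batch_answer_indices[index]
    rank = int(np.where(ranking == gold)[0][0]) + 1  # 1-based rank of gold
    return 1.0 / rank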
Example 2
def main(args):

    # Set random seeds for reproducibility
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    with open(args.train_path, "rb") as file:
        train_summaries = pickle.load(file, encoding='utf-8')

    # with open(args.valid_path, "rb") as file:
    #     valid_summaries = pickle.load(file, encoding='utf-8')

    for summary in train_summaries:
        convert_document(summary)

    # for summary in valid_summaries:
    #     convert_document(summary)

    elmo_instance = Elmo(options_url, weights_url, 1)
    if use_cuda:
        elmo_instance.cuda()

    begin = timer()
    total_answers = 0
    for summary in train_summaries[:]:
        # token lists and the padded batch are each duplicated, enlarging
        # the workload for the timing run
        answers = [[elmo_tokenize(word) for word in answer] * 2
                   for answer in summary.answers]
        answers = pad_elmo(answers)
        answers = answers * 2
        batch = variable(torch.LongTensor(answers))
        a = elmo_instance(batch)  # output unused; the loop only measures time
        total_answers += len(answers)

    end = timer()
    print("Total time elapsed: {}".format(end - begin))
    print("Time per thousand answers: {}".format(
        (end - begin) * 1000 / total_answers))
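
pad_elmo and elmo_tokenize are helpers defined outside the snippet. A minimal sketch of the padding step, assuming elmo_tokenize returns one fixed-width row of character ids per word (50 ids, ELMo's character window) and shorter answers are right-padded with all-zero rows:

def pad_elmo(answers, char_dim=50):
    # Right-pad each answer (a list of per-word character-id rows) with
    # zero rows so the whole batch can become one LongTensor.
    max_len = max(len(answer) for answer in answers)
    pad_row = [0] * char_dim
    return [answer + [pad_row] * (max_len - len(answer))
            for answer in answers]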
Example 3
def train_epochs(model, vocab):
    clip_threshold = args.clip_threshold
    eval_interval = args.eval_interval

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    train_loss = 0
    train_denom = 0
    validation_history = []
    bad_counter = 0
    best_mrr = -1.0

    patience = 30

    valid_batches = create_batches(valid_documents, args.batch_length,
                                   args.job_size, vocab)
    #train_batches = create_batches(train_documents, args.batch_length, args.job_size, vocab)
    #train_batch_for_validation = get_random_batch_from_training(train_batches, len(valid_batches))
    test_batches = create_batches(test_documents, args.batch_length,
                                  args.job_size, vocab)
    mrr_value = []

    for epoch in range(args.num_epochs):

        print("Creating train batches")
        train_batches = create_batches(train_documents, args.batch_length,
                                       args.job_size, vocab)

        print("Starting epoch {}".format(epoch))

        saved = False
        for iteration in range(len(train_batches)):
            optimizer.zero_grad()
            if (iteration + 1) % eval_interval == 0:
                print("iteration {}".format(iteration + 1))
                print("train loss: {}".format(train_loss / train_denom))

                if iteration != 0:
                    average_rr = evaluate(model, valid_batches)
                    validation_history.append(average_rr)

                    mean_rr = np.mean(mrr_value)
                    print("Training MRR :{0}".format(mean_rr))
                    mrr_value = []

                    print("Validation: MRR:{0}".format(average_rr))

                    if (iteration + 1) % (eval_interval * 5) == 0:
                        if average_rr >= max(validation_history):
                            saved = True
                            print(
                                "Saving best model seen so far itr number {0}"
                                .format(iteration))
                            torch.save(model, args.model_path)
                            #torch.save(model.state_dict(), args.model_path)
                            print("Best on Validation: MRR:{0}".format(
                                average_rr))
                            bad_counter = 0
                        else:
                            bad_counter += 1
                        if bad_counter > patience:
                            print("Early Stopping")
                            print("Testing started")
                            evaluate(model, test_batches)
                            exit(0)

            batch = train_batches[iteration]
            #view_batch(batch,loader.vocab)
            batch_query_lengths = batch['qlengths']
            batch_candidates = batch["candidates"]
            batch_answer_indices = batch['answer_indices']
            batch_size = len(batch_query_lengths)
            loss_total = variable(torch.zeros(batch_size))

            for index, query in enumerate(batch['queries']):

                # query tokens
                batch_query = variable(torch.LongTensor(query))
                batch_query_length = [batch['qlengths'][index]]
                batch_query_ner = variable(
                    torch.LongTensor(batch['q_ner'][index]))
                batch_query_pos = variable(
                    torch.LongTensor(batch['q_pos'][index]))

                #Sort the candidates by length
                batch_candidate_lengths = np.array(
                    batch_candidates["anslengths"][index])
                candidate_sort = np.argsort(
                    batch_candidate_lengths)[::-1].copy()
                batch_candidates_sorted = variable(
                    torch.LongTensor(
                        batch_candidates["answers"][index][candidate_sort,
                                                           ...]))
                batch_candidate_lengths_sorted = batch_candidate_lengths[
                    candidate_sort]
                batch_candidate_ner_sorted = variable(
                    torch.LongTensor(
                        batch_candidates['ner'][index][candidate_sort, ...]))
                batch_candidate_pos_sorted = variable(
                    torch.LongTensor(
                        batch_candidates['pos'][index][candidate_sort, ...]))

                batch_len = len(batch_candidate_lengths_sorted)
                batch_candidate_unsort = variable(
                    torch.LongTensor(np.argsort(candidate_sort)))
                batch_metrics = variable(
                    torch.FloatTensor(batch['metrics'][index]))

                gold_index = variable(
                    torch.LongTensor([batch_answer_indices[index]]))
                negative_indices = [idx for idx in range(batch_len)]
                negative_indices.pop(batch_answer_indices[index])
                negative_indices = variable(torch.LongTensor(negative_indices))

                loss, indices = model(
                    batch_query, batch_query_ner, batch_query_pos,
                    batch_query_length, batch_candidates_sorted,
                    batch_candidate_ner_sorted, batch_candidate_pos_sorted,
                    batch_candidate_lengths_sorted, batch_candidate_unsort,
                    gold_index, negative_indices, batch_metrics, batch_len)
                loss_total[index] = loss

                mrr_value.append(
                    computeMRR(indices, batch_answer_indices, index))

            mean_loss = torch.mean(loss_total, 0)
            mean_loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), clip_threshold)
            optimizer.step()

            if args.use_cuda:
                train_loss += mean_loss.data.cpu().numpy()[0] * batch_size

            else:
                train_loss += mean_loss.data.numpy()[0] * batch_size

            train_denom += batch_size

        if not saved:
            print("Saving model after epoch {0}".format(epoch))
            torch.save(model, args.model_path + ".dummy")

    print("All epochs done")
    print("Testing started")
    if not saved:
        model = torch.load(args.model_path + ".dummy")
    else:
        model = torch.load(args.model_path)
    evaluate(model, test_batches)
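
Every example here builds inputs through a variable() wrapper that is not shown. A plausible sketch for the PyTorch 0.3-era API these snippets target (autograd Variable, volatile inference flag, optional CUDA transfer); the module-level use_cuda flag mirrors the args.use_cuda checks above and is an assumption:

import torch
from torch.autograd import Variable

use_cuda = torch.cuda.is_available()

def variable(tensor, volatile=False):
    # Move to GPU when available, then wrap for autograd.
    if use_cuda:
        tensor = tensor.cuda()
    return Variable(tensor, volatile=volatile)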
Example 4
def evaluate(model,
             batches,
             candidates_embed_docid,
             context_per_docid,
             candidates_per_docid,
             fout=None):
    mrr_value = []
    model.train(False)
    for iteration in range(len(batches)):

        batch = batches[iteration]
        batch_doc_ids = batch['doc_ids']
        batch_q_tokens = batch['q_tokens']
        batch_candidates = batch["candidates"]
        batch_answer_indices = batch['answer_indices']
        batch_reduced_context_indices = batch['chunk_indices']
        for index, query_embed in enumerate(batch['q_embed']):

            fout.write("\nQ: {0}".format(" ".join(batch_q_tokens[index])))
            # query tokens
            batch_query = variable(torch.FloatTensor(query_embed),
                                   volatile=True)
            batch_query_length = np.array([batch['qlengths'][index]])
            batch_question_mask = variable(
                torch.FloatTensor(
                    np.array([1 for x in range(batch_query_length[0])])))

            # Sort the candidates by length
            batch_candidate_lengths = np.array(
                batch_candidates["anslengths"][index])
            batch_candidate_mask = np.array(batch_candidates['mask'][index])
            candidate_sort = np.argsort(batch_candidate_lengths)[::-1].copy()

            doc_id = batch_doc_ids[index]
            batch_candidates_embed_sorted = variable(
                torch.FloatTensor(
                    candidates_embed_docid[doc_id][candidate_sort, ...]))
            batch_candidate_lengths_sorted = batch_candidate_lengths[
                candidate_sort]
            batch_candidate_masks_sorted = variable(
                torch.FloatTensor(batch_candidate_mask[candidate_sort]))

            # context tokens
            ## if using reduced context
            if args.reduced:
                context_embeddings = context_per_docid[doc_id]
                reduced_context_embeddings = []
                ranges = batch_reduced_context_indices[index]
                for r in ranges:
                    reduced_context_embeddings += context_embeddings[
                        r[0]:r[1]].tolist()
                batch_context = variable(
                    torch.FloatTensor(reduced_context_embeddings))
            else:
                batch_context = variable(
                    torch.FloatTensor(context_per_docid[doc_id]))

            batch_context_length = np.array([batch_context.size(0)])
            batch_context_mask = variable(
                torch.FloatTensor(
                    np.array([1 for x in range(batch_context_length[0])])))

            batch_len = len(batch_candidate_lengths_sorted)
            batch_candidate_unsort = variable(torch.LongTensor(
                np.argsort(candidate_sort)),
                                              volatile=True)

            indices = model.eval(
                batch_query, batch_query_length, batch_question_mask,
                batch_context, batch_context_length, batch_context_mask,
                batch_candidates_embed_sorted, batch_candidate_lengths_sorted,
                batch_candidate_masks_sorted, batch_candidate_unsort)

            if args.use_cuda:
                indices = indices.data.cpu()

            else:
                indices = indices.data

            position_gold_sorted = (
                indices == batch_answer_indices[index]).nonzero().numpy()[0][0]
            rank = position_gold_sorted + 1  # 1-based rank of the gold answer

            mrr_value.append(1.0 / rank)

            if fout is not None:
                candidates = candidates_per_docid[doc_id]
                fout.write("\nRank: {0} / {1}   Gold: {2}\n".format(
                    rank, len(candidates), " ".join(
                        candidates[indices[position_gold_sorted].numpy()[0]])))
                for cand in range(10):
                    fout.write("C: {0}\n".format(" ".join(
                        candidates[indices[cand].numpy()[0]])))

    mean_rr = np.mean(mrr_value)
    print("MRR :{0}".format(mean_rr))
    model.train(True)
    return mean_rr
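
The sort-by-length / unsort pattern in these loops exists because pack_padded_sequence in this PyTorch generation requires sequences in decreasing length order, so candidates are reordered before the RNN and their scores are restored to the original order afterwards. A self-contained illustration of the index math:

import numpy as np

lengths = np.array([3, 7, 5])
candidate_sort = np.argsort(lengths)[::-1].copy()  # [1, 2, 0]: longest first
candidate_unsort = np.argsort(candidate_sort)      # inverse permutation
assert (lengths[candidate_sort][candidate_unsort] == lengths).all()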
Example 5
def train_epochs(model, vocab):

    fout = codecs.open(args.debug_file, "w", encoding='utf-8')
    clip_threshold = args.clip_threshold
    eval_interval = args.eval_interval

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    train_loss = 0
    train_denom = 0
    validation_history = []
    bad_counter = 0

    patience = 30

    valid_batches = create_batches(valid_documents, args.batch_length,
                                   args.job_size, vocab)
    test_batches = create_batches(test_documents, args.batch_length,
                                  args.job_size, vocab)

    mrr_value = []
    for epoch in range(args.num_epochs):

        print("Creating train batches")
        train_batches = make_bucket_batches(train_documents, args.batch_length,
                                            vocab)
        print("Starting epoch {}".format(epoch))
        fout.write("==========Epoch {0}=========\n".format(epoch))

        saved = False
        for iteration in range(len(train_batches)):
            optimizer.zero_grad()
            if (iteration + 1) % eval_interval == 0:
                print("iteration: {0} train loss: {1}".format(
                    iteration + 1, train_loss / train_denom))

                if iteration != 0:
                    average_rr = evaluate(model, valid_batches,
                                          valid_candidates_embed_docid,
                                          valid_context_per_docid,
                                          valid_candidate_per_docid, fout)
                    validation_history.append(average_rr)
                    train_average_rr = np.mean(mrr_value)
                    if (iteration + 1) % (eval_interval) == 0:
                        print("Train MRR:{0}  Validation MRR:{1}".format(
                            train_average_rr, average_rr))

                        mrr_value = []
                        if average_rr >= max(validation_history):
                            saved = True
                            print(
                                "Saving best model seen so far itr number {0}".
                                format(iteration))
                            torch.save(model, args.model_path)
                            print("Best on Validation: MRR:{0}".format(
                                average_rr))
                            bad_counter = 0
                        else:
                            bad_counter += 1
                        if bad_counter > patience:
                            print("Early Stopping")
                            print("Testing started")
                            model = torch.load(args.model_path)
                            evaluate(model, test_batches,
                                     test_candidates_embed_docid,
                                     test_context_per_docid,
                                     test_candidate_per_docid, None)
                            exit(0)

            batch = train_batches[iteration]
            # view_batch(batch,loader.vocab)
            batch_query_lengths = batch['qlengths']
            batch_candidates = batch["candidates"]
            batch_doc_ids = batch['doc_ids']
            batch_reduced_context_indices = batch['chunk_indices']
            batch_answer_indices = batch['answer_indices']
            batch_size = len(batch_query_lengths)
            losses = variable(torch.zeros(batch_size))
            for index, query_embed in enumerate(batch['q_embed']):
                # query tokens
                batch_query = variable(torch.FloatTensor(query_embed))
                batch_query_length = np.array([batch['qlengths'][index]])
                batch_question_mask = variable(
                    torch.FloatTensor(
                        np.array([1 for x in range(batch_query_length[0])])))

                # Sort the candidates by length (only required if using an RNN)
                batch_candidate_lengths = np.array(
                    batch_candidates["anslengths"][index])
                batch_candidate_mask = np.array(
                    batch_candidates['mask'][index])
                candidate_sort = np.argsort(
                    batch_candidate_lengths)[::-1].copy()

                # get candidates_embed from doc_id
                doc_id = batch_doc_ids[index]
                batch_candidates_embed_sorted = variable(
                    torch.FloatTensor(
                        train_candidates_embed_docid[doc_id][candidate_sort,
                                                             ...]))

                batch_candidate_lengths_sorted = batch_candidate_lengths[
                    candidate_sort]
                batch_candidate_unsort = variable(
                    torch.LongTensor(np.argsort(candidate_sort)))
                batch_candidate_masks_sorted = variable(
                    torch.FloatTensor(batch_candidate_mask[candidate_sort]))

                batch_len = len(batch_candidate_lengths)

                # context tokens
                ## if using reduced context
                if args.reduced:
                    context_embeddings = train_context_per_docid[doc_id]
                    reduced_context_embeddings = []
                    ranges = batch_reduced_context_indices[index]
                    for r in ranges:
                        reduced_context_embeddings += context_embeddings[
                            r[0]:r[1]].tolist()
                    batch_context = variable(
                        torch.FloatTensor(reduced_context_embeddings))
                else:
                    batch_context = variable(
                        torch.FloatTensor(train_context_per_docid[doc_id]))

                batch_context_length = np.array([batch_context.size(0)])
                batch_context_mask = variable(
                    torch.FloatTensor(
                        np.array([1 for x in range(batch_context_length[0])])))

                gold_index = variable(
                    torch.LongTensor([batch_answer_indices[index]]))
                negative_indices = [idx for idx in range(batch_len)]
                negative_indices.pop(batch_answer_indices[index])
                negative_indices = variable(torch.LongTensor(negative_indices))

                loss, indices = model(batch_query, batch_query_length,
                                      batch_question_mask, batch_context,
                                      batch_context_length, batch_context_mask,
                                      batch_candidates_embed_sorted,
                                      batch_candidate_lengths_sorted,
                                      batch_candidate_masks_sorted,
                                      batch_candidate_unsort, gold_index,
                                      negative_indices)

                losses[index] = loss
                mrr_value.append(
                    train_mrr(index, indices, batch_answer_indices))


            mean_loss = losses.mean(0)
            mean_loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), clip_threshold)
            optimizer.step()
            if args.use_cuda:
                train_loss += mean_loss.data.cpu().numpy()[0] * batch_size

            else:
                train_loss += mean_loss.data.numpy()[0] * batch_size

            train_denom += batch_size

        if not saved:
            print("Saving model after epoch {0}".format(epoch))
            torch.save(model, args.model_path + ".dummy")

    print("All epochs done")
    model = torch.load(args.model_path)
    evaluate(model, test_batches, test_candidates_embed_docid,
             test_context_per_docid, test_candidate_per_docid)
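
train_mrr is another helper outside the snippet. A minimal sketch consistent with its call site, under the same best-first ranking assumption as the computeMRR sketch above:

def train_mrr(index, indices, batch_answer_indices):
    gold = batch_answer_indices[index]
    ranking = indices.data.cpu().view(-1).numpy().tolist()
    return 1.0 / (ranking.index(gold) + 1)  # reciprocal 1-based rank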
Example 6
def train_epochs(model, vocab):
    clip_threshold = args.clip_threshold
    eval_interval = args.eval_interval

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    train_loss = 0
    train_denom = 0
    validation_history = []
    bad_counter = 0
    best_mrr = -1.0

    patience = 30

    valid_batches = create_batches(valid_documents, args.batch_length,
                                   args.job_size, vocab)[:200]
    test_batches = create_batches(test_documents, args.batch_length,
                                  args.job_size, vocab)

    for epoch in range(args.num_epochs):

        print("Creating train batches")
        train_batches = create_batches(train_documents, args.batch_length,
                                       args.job_size, vocab)
        print("Starting epoch {}".format(epoch))

        saved = False
        for iteration in range(len(train_batches)):
            optimizer.zero_grad()
            if (iteration + 1) % eval_interval == 0:
                print("iteration {}".format(iteration + 1))
                print("train loss: {}".format(train_loss / train_denom))

                if iteration != 0:
                    average_rr = evaluate(model, valid_batches)
                    validation_history.append(average_rr)

                    if (iteration + 1) % (eval_interval * 5) == 0:
                        if average_rr >= max(validation_history):
                            saved = True
                            print(
                                "Saving best model seen so far itr number {0}".
                                format(iteration))
                            torch.save(model, args.model_path)
                            print("Best on Validation: MRR:{0}".format(
                                average_rr))
                            bad_counter = 0
                        else:
                            bad_counter += 1
                        if bad_counter > patience:
                            print("Early Stopping")
                            print("Testing started")
                            evaluate(model, test_batches)
                            exit(0)

            batch = train_batches[iteration]
            # view_batch(batch,loader.vocab)
            batch_query_lengths = batch['qlengths']

            batch_answer_indices = batch['answer_indices']
            batch_size = len(batch_query_lengths)

            batch_query = variable(torch.LongTensor(batch['queries']))
            batch_query_length = np.array([batch['qlengths']])
            batch_question_mask = variable(torch.FloatTensor(batch['q_mask']))

            # Sort the candidates by length (only required if using an RNN)

            # context tokens, one document per query in the batch
            for index in range(batch_size):
                batch_context = variable(
                    torch.LongTensor(batch['contexts'][index]))
                batch_context_length = np.array([batch['clengths'][index]])
                batch_context_mask = variable(
                    torch.FloatTensor(batch['context_mask'][index]))

        if not saved:
            print("Saving model after epoch {0}".format(epoch))
            torch.save(model, args.model_path + ".dummy")

    print("All epochs done")
Example 7
def evaluate(model, batches):
    mrr_value = []
    model.train(False)
    for iteration in range(len(batches)):

        batch = batches[iteration]
        batch_candidates = batch["candidates"]
        batch_answer_indices = batch['answer_indices']

        for index, query in enumerate(batch['queries']):

            # query tokens
            batch_query = variable(torch.LongTensor(query), volatile=True)
            batch_query_length = [batch['qlengths'][index]]
            batch_question_mask = variable(
                torch.FloatTensor(batch['q_mask'][index]))
            # batch_query_ner = variable(torch.LongTensor(batch['q_ner'][index]))
            # batch_query_pos = variable(torch.LongTensor(batch['q_pos'][index]))

            # Sort the candidates by length
            batch_candidate_lengths = np.array(
                batch_candidates["anslengths"][index])
            batch_candidate_mask = np.array(batch_candidates['mask'][index])
            candidate_sort = np.argsort(batch_candidate_lengths)[::-1].copy()
            batch_candidates_sorted = variable(torch.LongTensor(
                batch_candidates["answers"][index][candidate_sort, ...]),
                                               volatile=True)
            batch_candidate_lengths_sorted = batch_candidate_lengths[
                candidate_sort]
            batch_candidate_masks_sorted = variable(
                torch.FloatTensor(batch_candidate_mask[candidate_sort]))
            # batch_candidate_ner_sorted = variable(torch.LongTensor(batch_candidates['ner'][index][candidate_sort, ...]))
            # batch_candidate_pos_sorted = variable(
            # 	torch.LongTensor(batch_candidates['pos'][index][candidate_sort, ...]))

            # context tokens
            batch_context = variable(torch.LongTensor(
                batch['contexts'][index]))
            batch_context_length = np.array([batch['clengths'][index]])
            batch_context_mask = variable(
                torch.FloatTensor(batch['context_mask'][index]))

            batch_len = len(batch_candidate_lengths_sorted)
            batch_candidate_unsort = variable(torch.LongTensor(
                np.argsort(candidate_sort)),
                                              volatile=True)

            indices = model.eval(
                batch_query, batch_query_length, batch_question_mask,
                batch_context, batch_context_length, batch_context_mask,
                batch_candidates_sorted, batch_candidate_lengths_sorted,
                batch_candidate_masks_sorted, batch_candidate_unsort)

            if args.use_cuda:
                indices = indices.data.cpu()

            else:
                indices = indices.data

            position_gold_sorted = (
                indices == batch_answer_indices[index]).nonzero().numpy()[0][0]

            rank = position_gold_sorted + 1  # 1-based rank of the gold answer

            mrr_value.append(1.0 / rank)

    mean_rr = np.mean(mrr_value)
    print("MRR :{0}".format(mean_rr))
    model.train(True)
    return mean_rr
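
A final note on model.eval(...) being called with arguments: nn.Module.eval() normally takes none, so the models in these snippets evidently override it as an inference-time forward pass that returns candidates ranked best-first. A hypothetical sketch of that pattern (CandidateScorer and its dot-product scoring rule are illustrative, not the actual model):

import torch
import torch.nn as nn

class CandidateScorer(nn.Module):
    def __init__(self, dim):
        super(CandidateScorer, self).__init__()
        self.project = nn.Linear(dim, dim)

    def score(self, query, candidates):
        # relevance of each candidate row to a single query vector
        return candidates.matmul(self.project(query))

    def eval(self, query, candidates):
        # Deliberately shadows nn.Module.eval: return best-first indices.
        _, indices = torch.sort(self.score(query, candidates),
                                descending=True)
        return indices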