# NOTE: the helpers below (MergeParagraphs, NltkPlusStopWords,
# ShallowOpenWebRanker, TriviaQaOpenDataset, flatten_iterable, bcolors)
# come from the document-qa (docqa) codebase, and strip_accents_unicode
# from sklearn; the snippet was extracted without its import header.
import numpy as np


def show_open_paragraphs(start: int, end: int):
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = ShallowOpenWebRanker(6)
    stop_words = stop.words

    print("Loading train")
    corpus = TriviaQaOpenDataset()
    train = corpus.get_dev()
    np.random.shuffle(train)

    for q in train:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        paras = []
        for d in q.all_docs:
            doc = corpus.evidence.get_document(d.doc_id)
            paras += splitter.split_annotated(doc, d.answer_spans)

        ranked = ranker.prune(q.question, paras)
        if len(ranked) < start:
            continue
        ranked = ranked[start:end]

        print(" ".join(q.question))
        print(q.answer.all_answers)
        # `ranked` was already sliced to [start:end], so enumerate from `start`
        # rather than re-indexing with the absolute rank (an IndexError waiting
        # to happen whenever start > 0).
        for rank, para in enumerate(ranked, start=start):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d" % (para.start, rank))
            if len(para.answer_spans) == 0:
                continue  # paragraph contains no answer span
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for i, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[i] = bcolors.ERROR + text[i] + bcolors.ENDC
            print(" ".join(text))
        input()  # wait for a keypress before showing the next question
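For reference, a minimal way to drive the inspection loop above, assuming the docqa package and a preprocessed TriviaQA open corpus are available (neither is shown in this snippet):

if __name__ == "__main__":
    # Hypothetical invocation: page through the top-5 ranked paragraphs
    # for each (shuffled) dev question, pausing on Enter.
    show_open_paragraphs(0, 5)
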
def main():
    data = TriviaQaOpenDataset()
    # data = TriviaQaWebDataset()
    print("Loading...")
    all_questions = data.get_dev()

    questions = [
        q for q in all_questions if any(
            len(x.answer_spans) > 0 for x in q.all_docs)
    ]
    print(
        "%d/%d (%.4f) have an answer" % (len(questions), len(all_questions),
                                         len(questions) / len(all_questions)))

    np.random.shuffle(questions)

    # Split each document into ~400-token paragraphs and keep the 20 best
    # by TF-IDF similarity to the question.
    pre = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400),
                                            TopTfIdf(NltkPlusStopWords(), 20),
                                            require_an_answer=False)
    print("Done")

    # The trailing 2 and 1000 are presumably the worker count and chunk size.
    out = preprocess_par(questions[:2000], data.evidence, pre, 2, 1000)

    n_counts = np.zeros(20)   # questions that have a paragraph at rank i
    n_any = np.zeros(20)      # rank-i paragraphs that contain an answer
    n_any_all = np.zeros(20)  # questions with an answer in the top i+1 paragraphs

    for q in out.data:
        for i, p in enumerate(q.paragraphs):
            n_counts[i] += 1
            n_any[i] += len(p.answer_spans) > 0

        for i, p in enumerate(q.paragraphs):
            if len(p.answer_spans) > 0:
                n_any_all[i:] += 1
                break

    print(n_any_all / out.true_len)  # top-k answer coverage
    print(n_any / n_counts)          # per-rank answer rate
    print(n_counts)                  # paragraphs available at each rank
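To make the three printed arrays concrete, here is a small self-contained sketch of the same bookkeeping on toy data (all values hypothetical): n_any / n_counts is the chance that the rank-i paragraph contains an answer, while n_any_all accumulates, for each cutoff, how many questions have an answer somewhere in their top-(i+1) paragraphs.

import numpy as np

# has_answer[q][i]: does question q's rank-i paragraph contain an answer span?
has_answer = [[False, True, False],
              [True, False, True],
              [False, False, False]]

n = max(len(row) for row in has_answer)
n_counts, n_any, n_any_all = np.zeros(n), np.zeros(n), np.zeros(n)
for row in has_answer:
    for i, hit in enumerate(row):
        n_counts[i] += 1
        n_any[i] += hit
    for i, hit in enumerate(row):
        if hit:
            n_any_all[i:] += 1  # counts toward every cutoff >= i
            break

print(n_any / n_counts)             # per-rank answer rate: [0.33 0.33 0.33]
print(n_any_all / len(has_answer))  # top-k coverage:       [0.33 0.67 0.67]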
Example #3
# Snippet excerpted mid-script: args, model_name, pd (pandas), CORPUS_DIR and
# join are defined earlier in the original file. By symmetry with the
# sample_rest check below, the clipped opening line is presumably:
if args.sample_first != 1.0:
    model_name += '___SF' + str(args.sample_first)

if args.sample_rest != 1.0:
    model_name += '___SR' + str(args.sample_rest)

all_train_questions = []
all_dev_questions = []
all_filemaps = {}
for ind, dataset_name in enumerate(args.datasets.split(',')):
    print('loading ' + dataset_name)
    source_dir = join(CORPUS_DIR, "triviaqa", "web-open", dataset_name)

    dataset = TriviaQaOpenDataset(source_dir)
    # just loading the pkl that was saved in build_span_corpus
    if args.sample_first == 1.0 or ind == 0:
        all_dev_questions += dataset.get_dev()

    # How many entries of --datasets share this dataset's base name once the
    # '-G'/'-O' suffixes are stripped:
    base_names = pd.Series(args.datasets.replace('-G', '').replace('-O', '').split(','))
    num_of_contexts = (base_names == dataset_name.replace('-G', '').replace('-O', '')).sum()

    train = dataset.get_train()

    # Filter out questions that have no answer span in any of their documents:
    train_with_ans = [
        question for question in train
        if any(len(doc.answer_spans) > 0 for doc in question.all_docs)
    ]

    print("number of questions with an answer: %d" % len(train_with_ans))
Example #4
# As above, the docqa helpers (WithIndicators, extract_paragraph, display_para,
# ExtractedParagraphWithAnswers, show_squad_errors, ...) and the argparse /
# pandas / numpy imports are assumed from the original module.
def main():
    parser = argparse.ArgumentParser(
        description='Show cases where a higher-confidence answer has a lower F1')
    parser.add_argument('answers', help='answer file (CSV)')
    parser.add_argument('question_source', help='"open", "web" or "squad"')
    args = parser.parse_args()

    print("Loading answers..")
    answer_df = pd.read_csv(args.answers)

    print("Loading questions..")
    if args.question_source == "open":
        corpus = TriviaQaOpenDataset()
        questions = {q.question_id: q for q in corpus.get_dev()}
    elif args.question_source == "web":
        corpus = TriviaQaWebDataset()
        questions = {}
        for q in corpus.get_dev():
            for d in q.all_docs:
                questions[(q.question_id, d.doc_id)] = q
    elif args.question_source == "squad":
        show_squad_errors(args.answers)
        return
    else:
        raise ValueError("Unknown question source: %s" % args.question_source)

    pre = WithIndicators()

    answer_df.sort_values(["question_id", "rank"], inplace=True)

    if args.question_source == "open":
        iter = answer_df.groupby(["question_id"])
    else:
        iter = answer_df.groupby(["question_id", "doc_id"])

    grouped = list(iter)
    np.random.shuffle(grouped)

    for key, group in grouped:
        q = questions[key]
        q = questions[key]
        cur_best_score = group.text_f1.iloc[0]
        cur_best_conf = group.predicted_score.iloc[0]
        cur_best_ix = group.index[0]
        for i in range(1, len(group)):
            ix = group.index[i]
            conf = group.predicted_score[ix]
            if conf > cur_best_conf:
                score = group.text_f1[ix]
                if score < cur_best_score:
                    # We hurt ourselves!
                    print("Oh no!")
                    print(" ".join(q.question))
                    print(q.answer.all_answers)
                    print("Best score was %.4f (conf=%.4f), but now is %.4f (conf=%.4f)" % (
                        cur_best_score, cur_best_conf, score, conf
                    ))
                    d1 = [d for d in q.all_docs if d.doc_id == group.doc_id[cur_best_ix]][0]
                    p1 = extract_paragraph(corpus.evidence.get_document(d1.doc_id), group.para_start[cur_best_ix], group.para_end[cur_best_ix])
                    s, e = group.para_start[cur_best_ix], group.para_end[cur_best_ix]
                    # Keep only answer spans inside [s, e), re-based to the paragraph start.
                    answers = d1.answer_spans[np.logical_and(d1.answer_spans[:, 0] >= s, d1.answer_spans[:, 1] < e)] - s
                    p1 = pre.encode_extracted_paragraph(q.question, ExtractedParagraphWithAnswers(
                        p1,  group.para_start[cur_best_ix], group.para_end[cur_best_ix], answers))

                    d2 = [d for d in q.all_docs if d.doc_id == group.doc_id[ix]][0]
                    p2 = extract_paragraph(corpus.evidence.get_document(d2.doc_id), group.para_start[ix], group.para_end[ix])
                    s, e = group.para_start[ix], group.para_end[ix]
                    answers = d2.answer_spans[np.logical_and(d2.answer_spans[:, 0] >= s, d2.answer_spans[:, 1] < e)] - s
                    p2 = pre.encode_extracted_paragraph(q.question, ExtractedParagraphWithAnswers(
                        p2,  group.para_start[ix], group.para_end[ix], answers))

                    p1_s, p1_e = group.predicted_start[cur_best_ix], group.predicted_end[cur_best_ix]
                    p2_s, p2_e = group.predicted_start[ix], group.predicted_end[ix]
                    print(" ".join(display_para(p1.text, p1.answer_spans, q.question, p1_s, p1_e)))
                    print()
                    print(" ".join(display_para(p2.text, p2.answer_spans, q.question, p2_s, p2_e)))
                    input()
                else:
                    cur_best_score = score
                    cur_best_ix = ix
                    cur_best_conf = conf
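
The confidence scan above can be exercised without the corpus. A self-contained sketch on one toy answer group (column names match the answer file; the numbers are made up): it flags any row whose predicted_score beats the running best while its text_f1 is lower, which is exactly the "Oh no!" case the loop reports.

import pandas as pd

group = pd.DataFrame({
    "predicted_score": [0.40, 0.55, 0.70],
    "text_f1":         [0.80, 0.90, 0.30],
})

best_score = group.text_f1.iloc[0]
best_conf = group.predicted_score.iloc[0]
for ix in group.index[1:]:
    conf = group.predicted_score[ix]
    if conf > best_conf:
        score = group.text_f1[ix]
        if score < best_score:
            # Trusting the higher confidence would have lowered the F1.
            print("regression: conf %.2f > %.2f but F1 %.2f < %.2f"
                  % (conf, best_conf, score, best_score))
        else:
            best_score, best_conf = score, conf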