Example #1
def get_decomposed(orig_question, prediction, prediction2, is_bridge,
                   with_key):
    while '  ' in orig_question:
        orig_question = orig_question.replace('  ', ' ')
    if is_bridge:
        question1 = prediction2 if with_key else prediction
        question2 = orig_question.replace(prediction, '[ANSWER]')
        assert '[ANSWER]' in question2
        for token in [', [ANSWER]', '[ANSWER] ,', '[ANSWER] who', \
                        '[ANSWER] when', '[ANSWER] where', '[ANSWER] which', \
                        '[ANSWER] that', '[ANSWER] whose']:
            if token in question2:
                if token == '[ANSWER] whose':
                    question2 = question2.replace(token, " [ANSWER] 's ")
                else:
                    question2 = question2.replace(token, ' [ANSWER] ')
    else:
        orig_question_tokens = orig_question.split(' ')
        prediction_tokens = prediction.split(' ')
        start, end = None, None
        for i in range(len(orig_question_tokens) - len(prediction_tokens) + 1):
            if orig_question_tokens[i:i + len(prediction_tokens
                                              )] == prediction_tokens:
                start, end = i, i + len(prediction_tokens)
                break
        if start is None and end is None:
            for i in range(
                    len(orig_question_tokens) - len(prediction_tokens) + 1):
                text = ' '.join(orig_question_tokens[i:i +
                                                     len(prediction_tokens)])
                if normalize_answer(text) == normalize_answer(prediction):
                    start, end = i, i + len(prediction_tokens)
                    break
        if start is None and end is None:
            for i in range(
                    len(orig_question_tokens) - len(prediction_tokens) + 1):
                text = ' '.join(orig_question_tokens[i:i +
                                                     len(prediction_tokens)])
                if normalize_answer(text).startswith(
                        normalize_answer(prediction)):
                    start, end = i, len(orig_question_tokens)
                    print("==== to long question ====")
                    print(' '.join(orig_question_tokens))
                    print(' '.join(orig_question_tokens[start:end]))
                    break

        assert start is not None and end is not None
        question1, question2 = intersection_convert_to_queries(
            orig_question_tokens, start, end - 1)
        question1, question2 = ' '.join(question1), ' '.join(question2)

    return orig_question, question1, question2
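
The bridge branch of get_decomposed only rewrites strings, so it can be tried in isolation. A minimal sketch, assuming get_decomposed from Example #1 is in scope; the question text and the two span predictions below are made-up stand-ins for a span predictor's output:

# Hypothetical inputs, purely for illustration.
orig_q = "What is the population of the city that hosted the 1936 Summer Olympics?"
pred = "the city that hosted the 1936 Summer Olympics"   # predicted bridge span
pred2 = "which city hosted the 1936 Summer Olympics"     # same span rephrased with 'which'

_, q1, q2 = get_decomposed(orig_q, pred, pred2, is_bridge=True, with_key=True)
print(q1)  # which city hosted the 1936 Summer Olympics
print(q2)  # What is the population of [ANSWER]?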
Example #2
def evaluate_question_detector(questions: List[HotpotQuestion], word_tokenize, detector,
                               reference_detector=None, compute_f1s=False):
    """ Just for debugging """
    n_no_docs = 0
    answer_per_q = []
    answer_f1s = []

    for question_ix, q in enumerate(tqdm(questions)):
        if q.answer in {'yes', 'no'} and q.q_type == 'comparison':
            continue
        tokenized_aliases = [word_tokenize(q.answer)]
        detector.set_question(tokenized_aliases)

        output = []
        for i, par in enumerate(q.supporting_facts):

            for s, e in detector.any_found(par.sentences):
                output.append((i, s, e))

            # Fall back to the reference detector when the primary detector finds nothing.
            if len(output) == 0 and reference_detector is not None:
                reference_detector.set_question(tokenized_aliases)
                detected = []
                for j, ref_par in enumerate(q.supporting_facts):
                    for s, e in reference_detector.any_found(ref_par.sentences):
                        detected.append((j, s, e))

                if len(detected) > 0:
                    print("Found a difference")
                    print(q.answer)
                    print(tokenized_aliases)
                    for p, s, e in detected:
                        token = flatten_iterable(q.supporting_facts[p].sentences)[s:e]
                        print(token)

        answer_per_q.append(output)

        if compute_f1s:
            f1s = []
            for p, s, e in output:
                token = flatten_iterable(q.supporting_facts[p].sentences)[s:e]
                answer = normalize_answer(" ".join(token))
                f1, _, _ = f1_score(answer, normalize_answer(q.answer))
                f1s.append(f1)
            answer_f1s.append(f1s)

    n_answers = sum(len(x) for x in answer_per_q)
    print("Found %d answers (av %.4f)" % (n_answers, n_answers / len(answer_per_q)))
    print("%.4f docs have answers" % np.mean([len(x) > 0 for x in answer_per_q]))
    if len(answer_f1s) > 0:
        print("Average f1 is %.4f" % np.mean(flatten_iterable(answer_f1s)))
Example #3
def _normalize_answer(text):
    if '<title>' in text:
        text = text.replace('<title>', '')
    if '</title>' in text:
        text = text.replace('</title>', '')

    # Fragments of a truncated closing tag that may remain at the start of the text.
    list1 = ['/title>'[i:] for i in range(len('/title>'))]
    # Fragments of a truncated opening/closing tag that may remain at the end of the text.
    list2 = ['</title>'[:-i] for i in range(1, len('</title>'))] + \
            ['<title>'[:-i] for i in range(1, len('<title>'))]

    for prefix in list1:
        if text.startswith(prefix):
            text = text[len(prefix):]

    for prefix in list2:
        if text.endswith(prefix):
            text = text[:-len(prefix)]

    # If a parenthesis is unmatched, keep the longest fragment around it.
    if '(' in text and ')' not in text:
        texts = [t.strip() for t in text.split('(')]
        text = texts[np.argmax([len(t) for t in texts])]
    if ')' in text and '(' not in text:
        texts = [t.strip() for t in text.split(')')]
        text = texts[np.argmax([len(t) for t in texts])]

    text = normalize_answer(text)
    return text
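
A quick illustration of the tag stripping, assuming _normalize_answer is in scope together with the usual SQuAD-style normalize_answer (lower-casing, dropping punctuation and articles) that the rest of the module relies on; the input strings are made up:

# Made-up predictions carrying leftover <title> markup from the retrieved context.
print(_normalize_answer("<title>Mount Everest</tit"))  # mount everest
print(_normalize_answer("itle>The Beatles</title>"))   # beatles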
Example #4
def run(json_file_name, answer_file_name, eval_file_name):
    import json
    with open(json_file_name) as f:
        data = json.load(f)

    from qa.my_main import DecompRC
    model = DecompRC(batch_size=50)
    for d in data:
        id = d['_id']
        a = normalize_answer(d['answer'])
        q = d['question']
        p = d['context']
        (q1_b, q2_b), (q1_i, q2_i) = model.get_output("span-predictor", q, p)
        print("Q  : {}".format(q))
        print("A  : {}".format(a))
        print("Q1: {}".format(q1_b))
        first_answers = one_hop_answers(p, q1_b, 5)
        print("A1: {}".format([shit[0] for shit in first_answers]))
        next_question = q2_b.replace("[ANSWER]", first_answers[0][0])
        print("Q2: {}".format(next_question))
        second_answers = one_hop_answers(p, next_question, 5)
        print("A2: {}".format([shit[0] for shit in second_answers]))
        print("========================================")
        input()  # pause until Enter is pressed before the next example
Example #5
def run(json_file_name, answer_file_name, eval_file_name):
    import json
    import csv
    with open(json_file_name) as f:
        data = json.load(f)

    from qa.my_main import DecompRC
    model = DecompRC(batch_size=50)
    fscores = [0, 0, 0]
    ems = [0, 0, 0]
    precision = [0, 0, 0]
    recall = [0, 0, 0]
    # Skip everything up to and including this question id, then resume evaluation.
    SEETHISID = "5a81b2505542995ce29dcc32"
    FLAG = False
    for d in data:
        id = d['_id']
        if not FLAG:
            if SEETHISID == id:
                FLAG = True
            continue
        a = normalize_answer(d['answer'])
        q = d['question']
        p = d['context']
        if len(p) == 0:
            continue
        (q1_b, q2_b), (q1_i, q2_i) = model.get_output("span-predictor", q, p)
        print("Q  : {}".format(q))
        print("A  : {}".format(a))
        first_answer, _ = best_one_hop_answer(p, q)
        next_question = q2_b.replace("[ANSWER]", first_answer)
        bridge_answer, bridge_score = best_one_hop_answer(p, next_question)
        bridge_answer = normalize_answer(bridge_answer)
        print("A-B: {}".format(bridge_answer))

        common_answers = []
        k = 10
        while len(common_answers) == 0:
            first_answers = best_k_answers(p, q1_i, k)
            second_answers = best_k_answers(p, q2_i, k)
            second_answers_set = set([tup[0] for tup in second_answers])
            common_answers = [
                tup for tup in first_answers if tup[0] in second_answers_set
            ]
            k += 10
        intersec_answer = common_answers[0][0]
        intersec_score = common_answers[0][1]
        for ca in common_answers:
            if ca[1] > intersec_score:
                intersec_score = ca[1]
                intersec_answer = ca[0]
        intersec_answer = normalize_answer(intersec_answer)
        print("A-I: {}".format(intersec_answer))
        ultimate_answer = bridge_answer
        if intersec_score > bridge_score:
            ultimate_answer = intersec_answer
        print("A-C: {}".format(ultimate_answer))
        print("========================================")

        f1, prcsn, rcll = f1_score(bridge_answer, a)
        fscores[0] += f1
        precision[0] += prcsn
        recall[0] += rcll
        ems[0] += bridge_answer == a

        f1, prcsn, rcll = f1_score(intersec_answer, a)
        fscores[1] += f1
        precision[1] += prcsn
        recall[1] += rcll
        ems[1] += intersec_answer == a

        f1, prcsn, rcll = f1_score(ultimate_answer, a)
        fscores[2] += f1
        precision[2] += prcsn
        recall[2] += rcll
        ems[2] += ultimate_answer == a

        with open(answer_file_name, mode='a') as file:
            row = [
                id, q, a, bridge_answer, bridge_score, intersec_answer,
                intersec_score, ultimate_answer
            ]
            writer = csv.writer(file)
            writer.writerow(row)

    N = len(data)
    fscores = [i / N for i in fscores]
    ems = [i / N for i in ems]
    precision = [i / N for i in precision]
    recall = [i / N for i in recall]
    with open(eval_file_name, mode='a') as file:
        writer = csv.writer(file)
        writer.writerow(fscores)
        writer.writerow(precision)
        writer.writerow(recall)
        writer.writerow(ems)
Example #6
def get_span_prediction(examples, features, result, with_keyword):

    prelim_predictions = []
    yn_predictions = []

    assert len(examples) == 1
    example = examples[0]

    feature = sorted(features, key=lambda f: f.unique_id)[0]
    gold_start_positions = feature.start_position
    gold_end_positions = feature.end_position
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["start_index", "end_index", "keyword_index", "logit"])

    if len(result) != 1:
        # Debugging hook: drop into IPython if more than one result comes back.
        from IPython import embed
        embed()
    result = result[0]

    switch = np.argmax(result.switch)
    if switch == 1:
        prelim_predictions.append(
            _PrelimPrediction(start_index=-1,
                              end_index=-1,
                              keyword_index=-1,
                              logit=result.switch[1]))
    elif switch == 0:
        scores = []
        start_logits = result.start_logits[:len(feature.tokens)]
        end_logits = result.end_logits[:len(feature.tokens)]
        if with_keyword:
            keyword_logits = result.keyword_logits[:len(feature.tokens)]
            # Score every (start, end, keyword) triple; j and k are offsets relative
            # to the start index i, so the keyword always falls inside the span.
            for (i, s) in enumerate(start_logits):
                for (j, e) in enumerate(end_logits[i:]):
                    for (k, key) in enumerate(keyword_logits[i:i + j + 1]):
                        scores.append(((i, i + j, i + k), s + e + key))
        else:
            for (i, s) in enumerate(start_logits):
                for (j, e) in enumerate(end_logits[i:]):
                    scores.append(((i, i + j, i), s + e))

        scores = sorted(scores, key=lambda x: x[1], reverse=True)

        for (start_index, end_index, keyword_index), score in scores:
            # Discard spans that run off the passage, break original-token
            # boundaries, or cover three tokens or fewer.
            if start_index >= len(feature.tokens):
                continue
            if end_index >= len(feature.tokens):
                continue
            if not (start_index <= keyword_index <= end_index):
                continue
            if start_index not in feature.token_to_orig_map or end_index not in feature.token_to_orig_map:
                continue
            if start_index - 1 in feature.token_to_orig_map and feature.token_to_orig_map[
                    start_index - 1] == feature.token_to_orig_map[start_index]:
                continue
            if end_index + 1 in feature.token_to_orig_map and feature.token_to_orig_map[
                    end_index + 1] == feature.token_to_orig_map[end_index]:
                continue
            if end_index < start_index:
                continue
            length = end_index - start_index
            if length <= 2:
                continue
            prelim_predictions.append(
                _PrelimPrediction(start_index=start_index,
                                  end_index=end_index,
                                  keyword_index=keyword_index,
                                  logit=score))
    else:
        raise NotImplementedError()

    prelim_predictions = sorted(prelim_predictions,
                                key=lambda x: x.logit,
                                reverse=True)

    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "text2", "logit"])

    seen_predictions = {}
    nbest = []

    def get_text(start_index, end_index, keyword_index):
        if start_index == end_index == -1:
            final_text = example.all_answers[-1]
            final_text2 = final_text
        else:
            feature = features[0]

            tok_tokens = feature.tokens[start_index:(end_index + 1)]
            orig_doc_start = feature.token_to_orig_map[start_index]
            orig_doc_end = feature.token_to_orig_map[end_index]
            orig_doc_keyword = feature.token_to_orig_map[keyword_index]

            orig_tokens = feature.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
            orig_tokens2 = orig_tokens.copy()
            # Replace the article nearest before the keyword with 'which' to form the sub-question.
            for i in range(orig_doc_keyword, orig_doc_keyword - 5, -1):
                if i - orig_doc_start < 0: break
                if orig_tokens[i - orig_doc_start] in ['the', 'a', 'an']:
                    orig_tokens2[i - orig_doc_start] = 'which'
                    assert orig_tokens[i - orig_doc_start] != 'which'
                    break

            tok_text = " ".join(tok_tokens)

            # De-tokenize WordPieces that have been split off.
            tok_text = tok_text.replace(" ##", "")
            tok_text = tok_text.replace("##", "")

            # Clean whitespace
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())

            final_text = get_final_text(tok_text, " ".join(orig_tokens))
            final_text2 = get_final_text(tok_text, " ".join(orig_tokens2))

        return final_text, final_text2

    for pred in prelim_predictions:
        prediction, prediction2 = get_text(pred.start_index, pred.end_index,
                                           pred.keyword_index)
        orig_question = ' '.join(example.doc_tokens)

        if with_keyword:
            question1 = prediction2 if with_keyword else prediction
            question2 = orig_question.replace(prediction, '[ANSWER]')
            assert '[ANSWER]' in question2
            for token in [', [ANSWER]', '[ANSWER] ,', '[ANSWER] who', \
                          '[ANSWER] when', '[ANSWER] where', '[ANSWER] which', \
                          '[ANSWER] that', '[ANSWER] whose']:
                if token in question2:
                    if token == '[ANSWER] whose':
                        question2 = question2.replace(token, " [ANSWER] 's ")
                    else:
                        question2 = question2.replace(token, ' [ANSWER] ')
        else:
            orig_question_tokens = orig_question.split(' ')
            prediction_tokens = prediction.split(' ')
            start, end = None, None
            for i in range(
                    len(orig_question_tokens) - len(prediction_tokens) + 1):
                if orig_question_tokens[i:i + len(prediction_tokens
                                                  )] == prediction_tokens:
                    start, end = i, i + len(prediction_tokens)
                    break
            if start is None and end is None:
                for i in range(
                        len(orig_question_tokens) - len(prediction_tokens) +
                        1):
                    text = ' '.join(
                        orig_question_tokens[i:i + len(prediction_tokens)])
                    if normalize_answer(text) == normalize_answer(prediction):
                        start, end = i, i + len(prediction_tokens)
                        break
            if start is None and end is None:
                for i in range(
                        len(orig_question_tokens) - len(prediction_tokens) +
                        1):
                    text = ' '.join(
                        orig_question_tokens[i:i + len(prediction_tokens)])
                    if normalize_answer(text).startswith(
                            normalize_answer(prediction)):
                        start, end = i, len(orig_question_tokens)
                        print("==== to long question ====")
                        print(' '.join(orig_question_tokens))
                        print(' '.join(orig_question_tokens[start:end]))
                        break

            try:
                assert start is not None and end is not None
            except Exception:
                print(orig_question)
                print(prediction)
            try:
                question1, question2 = intersection_convert_to_queries(
                    orig_question_tokens, start, end - 1)
            except Exception:
                embed()
                assert False
            question1, question2 = ' '.join(question1), ' '.join(question2)

        def postprocess(question):
            question = question.strip()
            while '  ' in question:
                question = question.replace('  ', ' ')
            if not question.endswith('?'):
                question += '?'
            while question.replace(' ', '').endswith('??'):
                question = question[:-1]
            return question

        return postprocess(question1), postprocess(question2)