# Shared imports for the snippets below. `common`, `fever_db`, `check_sentences`,
# `text_clean`, `c_scorer`, `easy_tokenize`, `get_words_inside_parenthese` and
# `SENT_LINE` are project-local modules/helpers from the FEVER fact-verification
# pipeline these examples come from.
import json
import random

from tqdm import tqdm


def inference_build(item, cursor, contain_first_sentence=False):
    doc_t_list = [it[0] for it in item['prioritized_docids']]
    t_claim = ' '.join(item['claim_tokens'])
    eid = item['id']

    b_list = []
    for doc_id in doc_t_list:
        if '-LRB-' in doc_id and common.doc_id_to_tokenized_text(doc_id) not in t_claim:
            # Use a fresh dict so the `item` argument is not shadowed.
            new_item = dict()
            new_item['selection_id'] = str(eid) + '###' + str(doc_id)
            example = common.doc_id_to_tokenized_text(doc_id)
            description_sent = ''
            if contain_first_sentence:
                r_list, id_list = fever_db.get_all_sent_by_doc_id(cursor, doc_id, with_h_links=False)
                for sent, sent_id in zip(r_list, id_list):
                    # Sentence ids look like '[doc_id](-.-)[line_number]';
                    # line 0 is the page's lead (description) sentence.
                    if int(sent_id.split('(-.-)')[1]) == 0:
                        description_sent = sent

            new_item['query'] = example + ' ' + description_sent
            new_item['text'] = t_claim
            new_item['selection_label'] = 'hidden'  # label unknown at inference time

            b_list.append(new_item)

    return b_list
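# A minimal usage sketch for inference_build (hypothetical data: real `item`
# dicts come from the upstream document-retrieval step and `cursor` from the
# project's FEVER SQLite database; `fever_db_path` is an assumed variable):
#
#     import sqlite3
#     conn = sqlite3.connect(fever_db_path)
#     item = {'id': 1001,
#             'claim_tokens': ['Savages', 'was', 'directed', 'by', 'Oliver', 'Stone', '.'],
#             'prioritized_docids': [('Savages_-LRB-2012_film-RRB-', 1.0)]}
#     batch = inference_build(item, conn.cursor(), contain_first_sentence=True)
#     # batch[0]['selection_id'] == '1001###Savages_-LRB-2012_film-RRB-'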
Example 2
def make_examples(eid,
                  positive_list,
                  negative_list,
                  t_claim,
                  cursor,
                  contain_first_sentence=False):
    def build_example(doc_id, label):
        # One candidate document id -> one selection example.
        ex = dict()
        ex['selection_id'] = str(eid) + '###' + str(doc_id)
        title_text = common.doc_id_to_tokenized_text(doc_id)
        description_sent = ''
        if contain_first_sentence:
            r_list, id_list = fever_db.get_all_sent_by_doc_id(
                cursor, doc_id, with_h_links=False)
            for sent, sent_id in zip(r_list, id_list):
                # Sentence ids look like '[doc_id](-.-)[line_number]'.
                if int(sent_id.split('(-.-)')[1]) == 0:
                    description_sent = sent

        ex['query'] = title_text + ' ' + description_sent
        ex['text'] = t_claim
        ex['selection_label'] = label
        return ex

    pos_examples = [build_example(pos_e, 'true') for pos_e in positive_list]
    # Negatives are pre-sampled by the caller.
    neg_examples = [build_example(neg_e, 'false') for neg_e in negative_list]

    return pos_examples, neg_examples
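# Hedged usage sketch (hypothetical ids and claim; `conn` as in the sketch above):
#
#     pos, neg = make_examples(1001,
#                              ['Savages_-LRB-2012_film-RRB-'],
#                              ['Savages_-LRB-band-RRB-'],
#                              'Savages was directed by Oliver Stone .',
#                              conn.cursor(),
#                              contain_first_sentence=True)
#     # pos[0]['selection_label'] == 'true'; neg[0]['selection_label'] == 'false'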
Example 3
def evidence_list_to_text(cursor,
                          evidences,
                          contain_head=True,
                          id_tokenized=False):
    current_evidence_text = []
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

    # Sentinel so the page title is emitted at most once per document.
    cur_head = 'DO NOT INCLUDE THIS FLAG'

    for doc_id, line_num in evidences:

        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

        if contain_head and cur_head != doc_id:
            cur_head = doc_id

            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(
                    doc_id)

            if line_num != 0:
                # Line 0 is the lead sentence and normally starts with the
                # title, so the head marker is added only for later lines.
                current_evidence_text.append(f"{t_doc_id_natural_format} <t>")

        # The evidence sentence itself is always appended.
        current_evidence_text.append(e_text)

    return ' '.join(current_evidence_text)
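# The result interleaves '<t>'-marked page titles with evidence sentences, e.g.
# (hypothetical evidence; line 0 gets no title because the lead sentence
# already starts with it):
#
#     evidence_list_to_text(cursor, [('Oliver_Stone', 2),
#                                    ('Savages_-LRB-2012_film-RRB-', 0)])
#     # -> 'Oliver Stone <t> <line 2 of Oliver_Stone> <line 0 of Savages ...>'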
def enforce_disabuigation_into_retrieval_result_v2(disabuigation_r_list, r_list, prob_sh=0.5):
    # Index the disambiguation-model outputs by '[id]###[doc_id]' selection id.
    disabuigation_dict = dict()
    for item in disabuigation_r_list:
        disabuigation_dict[item['selection_id']] = item

    for item in r_list:
        the_id = item['id']
        # Overwrite the heuristic priority with the disambiguation model's
        # probability for every parenthesized-title candidate it scored.
        for i, (doc_id, priority) in enumerate(item['prioritized_docids']):
            if '-LRB-' in doc_id:  # only these titles need disambiguation
                query_id = str(the_id) + '###' + doc_id
                if query_id in disabuigation_dict:
                    query_selection = disabuigation_dict[query_id]
                    item['prioritized_docids'][i] = [doc_id, query_selection['prob']]

        # Exact title matches in the claim override the model score entirely.
        t_claim = ' '.join(item['claim_tokens'])
        item['predicted_docids'] = []
        for k, it in enumerate(item['prioritized_docids']):
            if '-LRB-' in it[0] and common.doc_id_to_tokenized_text(it[0]) in t_claim:
                item['prioritized_docids'][k] = [it[0], 5.0]
                if it[0] not in item['predicted_docids']:
                    item['predicted_docids'].append(it[0])

        # Remaining candidates are selected only if their score clears prob_sh.
        for it in sorted(item['prioritized_docids'], key=lambda x: (-x[1], x[0])):
            if it[0] not in item['predicted_docids'] and it[1] >= prob_sh:
                item['predicted_docids'].append(it[0])
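# Net effect on each candidate, summarizing the logic above:
#   1. exact tokenized-title match in the claim -> score pinned to 5.0, always selected
#   2. parenthesized title scored by the model  -> selected iff prob >= prob_sh
#   3. everything else                          -> keeps its heuristic priority,
#                                                  selected iff priority >= prob_sh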
Example 5
def evidence_list_to_text_list(cursor, evidences, contain_head=True):
    # One text per evidence, so len(evidences) == len(current_evidence_text_list).
    current_evidence_text_list = []
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

    cur_head = 'DO NOT INCLUDE THIS FLAG'

    for doc_id, line_num in evidences:

        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

        cur_text = ""

        if contain_head and cur_head != doc_id:
            cur_head = doc_id

            t_doc_id_natural_format = common.doc_id_to_tokenized_text(doc_id)

            if line_num != 0:
                cur_text = f"{t_doc_id_natural_format} <t> "

        # The evidence sentence itself is always included.
        cur_text = cur_text + e_text

        current_evidence_text_list.append(cur_text)

    assert len(evidences) == len(current_evidence_text_list)
    return current_evidence_text_list
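# Same head-prepending logic as evidence_list_to_text above, but one string per
# evidence is returned instead of a single joined string.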
def resample_answer_with_priority(d_list, top_k=5):
    print("Build results file...")
    for item in tqdm(d_list):
        # Fine-grained re-ranking: among candidates tied at priority 1.0,
        # bump film pages slightly above album pages.
        found_keys = item['prioritized_docids']

        for i, (doc_id, priority) in enumerate(found_keys):
            if priority == 1.0:
                doc_id_tokens = common.doc_id_to_tokenized_text(doc_id).split(' ')
                paren_words = get_words_inside_parenthese(doc_id_tokens)

                if 'film' in paren_words:
                    found_keys[i] = (doc_id, priority + 0.2)

                if 'album' in paren_words:
                    found_keys[i] = (doc_id, priority + 0.1)

        item['prioritized_docids'] = found_keys

        # Keep the top_k highest-priority doc ids (ties broken by doc id);
        # note that set() discards the ranking order.
        item['predicted_docids'] = list(set(
            [k for k, v in sorted(item['prioritized_docids'],
                                  key=lambda x: (-x[1], x[0]))][:top_k]))
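# `get_words_inside_parenthese` is defined elsewhere in the project; a plausible
# sketch of its behavior, assuming the -LRB-/-RRB- bracket tokens used above:
def get_words_inside_parenthese_sketch(doc_id_tokens):
    """['Savages', '-LRB-', '2012', 'film', '-RRB-'] -> ['2012', 'film']"""
    words, inside = [], False
    for tok in doc_id_tokens:
        if tok == '-LRB-':
            inside = True
        elif tok == '-RRB-':
            inside = False
        elif inside:
            words.append(tok)
    return words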
def convert_to_formatted_sent(zipped_s_id_list, evidence_set):
    sent_list = []
    for sent, sid in zipped_s_id_list:
        sent_item = dict()

        cur_sent = sent
        doc_id, ln = sid.split('(-.-)')[0], int(sid.split('(-.-)')[1])

        t_doc_id_natural_format = common.doc_id_to_tokenized_text(doc_id)

        # Prepend the page title unless this is the lead sentence (line 0) or
        # the sentence already contains the title.
        if ln != 0 and t_doc_id_natural_format.lower() not in sent.lower():
            cur_sent = f"{t_doc_id_natural_format} <t> " + sent

        sent_item['text'] = cur_sent
        sent_item['sid'] = doc_id + SENT_LINE + str(ln)
        # sid is '[doc_id]<SENT_LINE>[line_number]'
        if evidence_set is not None:
            if (doc_id, ln) in evidence_set:
                sent_item['selection_label'] = "true"
            else:
                sent_item['selection_label'] = "false"
        else:
            sent_item['selection_label'] = "hidden"

        sent_list.append(sent_item)

    return sent_list
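# Example sid round-trip (hypothetical sentence id): the input
# 'Savages_-LRB-2012_film-RRB-(-.-)0' splits into doc_id
# 'Savages_-LRB-2012_film-RRB-' and ln 0, and the emitted sid becomes
# 'Savages_-LRB-2012_film-RRB-' + SENT_LINE + '0'.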
def item_resorting(d_list):
    for item in d_list:
        t_claim = ' '.join(item['claim_tokens'])
        item['predicted_docids'] = []
        # Exact title matches in the claim go first.
        for it in item['prioritized_docids']:
            if '-LRB-' in it[0] and common.doc_id_to_tokenized_text(
                    it[0]) in t_claim:
                item['predicted_docids'].append(it[0])

        for it in sorted(item['prioritized_docids'],
                         key=lambda x: (-x[1], x[0])):
            if it[0] not in item['predicted_docids']:
                item['predicted_docids'].append(it[0])
Example 9
def disabuigation_training_build_v0(item,
                                    cursor,
                                    contain_first_sentence=False,
                                    only_found=True):
    doc_t_list = [it[0] for it in item['prioritized_docids']]
    evidence_group = check_sentences.check_and_clean_evidence(item)
    all_true_t_list = set()
    t_claim = ' '.join(item['claim_tokens'])
    for ground_truth_evid in evidence_group:
        true_t_list = set([it[0] for it in ground_truth_evid])
        all_true_t_list = set.union(all_true_t_list, true_t_list)
    all_true_t_list = list(all_true_t_list)

    positive_list = []
    negative_list = []
    eid = item['id']
    found_pos = False

    # Positives: gold documents whose parenthesized title is not already
    # spelled out verbatim in the claim.
    for doc_id in all_true_t_list:
        if '-LRB-' in doc_id and common.doc_id_to_tokenized_text(
                doc_id) not in t_claim:
            positive_list.append(doc_id)
            found_pos = True

    # Negatives: 6-8 randomly sampled retrieved parenthesized-title documents
    # that are not gold. With only_found=True they are sampled only for items
    # that produced at least one positive.
    if found_pos or not only_found:
        random.shuffle(doc_t_list)
        num_neg = random.randint(6, 8)

        for doc_id in doc_t_list[:num_neg]:
            if '-LRB-' in doc_id and doc_id not in all_true_t_list:
                negative_list.append(doc_id)

    return make_examples(eid,
                         positive_list,
                         negative_list,
                         t_claim,
                         cursor,
                         contain_first_sentence=contain_first_sentence)
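# Sampling only 6-8 negatives per claim, rather than all retrieved non-gold
# pages, presumably keeps the true/false labels roughly balanced; only_found
# restricts training to items that yield at least one positive.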
Example 10
def load_keyword_dict_v1_3(in_filename, filtering=False):
    # COLON cleaned
    id_to_key_dict = dict()
    with open(in_filename, encoding='utf-8', mode='r') as in_f:
        for line in tqdm(in_f):
            item = json.loads(line.strip())
            if filtering and text_clean.filter_document_id(item['docid']):
                continue
            # Each doc id maps to a single-element list containing its
            # tokenized title (the stored 'keys' field is ignored).
            id_to_key_dict[item['docid']] = [
                common.doc_id_to_tokenized_text(item['docid'])
            ]

    return id_to_key_dict
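# Expected input is one JSON object per line; a hypothetical line (real files
# come from the project's keyword-extraction step):
#
#     {"docid": "Savages_-LRB-2012_film-RRB-", "keys": ["savages"]}
#
# Note the stored 'keys' field is discarded in favor of the tokenized doc id.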
def filter_contain_parenthese_valid(item):
    # True iff at least one gold document has a parenthesized title, was
    # retrieved, and its tokenized title is not already verbatim in the claim,
    # i.e. the item is a valid disambiguation-training instance.
    doc_t_list = [it[0] for it in item['prioritized_docids']]
    evidence_group = check_sentences.check_and_clean_evidence(item)
    all_true_t_list = set()
    t_claim = ' '.join(item['claim_tokens'])
    for ground_truth_evid in evidence_group:
        true_t_list = set([it[0] for it in ground_truth_evid])
        all_true_t_list = set.union(all_true_t_list, true_t_list)
    all_true_t_list = list(all_true_t_list)
    for doc_id in all_true_t_list:
        if '-LRB-' in doc_id and doc_id in doc_t_list and common.doc_id_to_tokenized_text(
                doc_id) not in t_claim:
            return True

    return False
def item_resorting(d_list, top_k=None):
    for item in d_list:
        # Exact title matches in the claim are pinned to a high score and
        # selected first.
        t_claim = ' '.join(item['claim_tokens'])
        item['predicted_docids'] = []
        for k, it in enumerate(item['prioritized_docids']):
            if '-LRB-' in it[0] and common.doc_id_to_tokenized_text(it[0]) in t_claim:
                item['prioritized_docids'][k] = [it[0], 5.0]
                item['predicted_docids'].append(it[0])

        for it in sorted(item['prioritized_docids'], key=lambda x: (-x[1], x[0])):
            if it[0] not in item['predicted_docids']:
                item['predicted_docids'].append(it[0])

        if top_k is not None and len(item['predicted_docids']) > top_k:
            item['predicted_docids'] = item['predicted_docids'][:top_k]
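# Unlike the earlier item_resorting, this variant also pins exact title
# matches to score 5.0 and caps the output at top_k entries.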
Example 13
def convert_to_formatted_sent(zipped_s_id_list,
                              evidence_set,
                              contain_head=True,
                              id_tokenized=True):
    sent_list = []
    for sent, sid in zipped_s_id_list:
        sent_item = dict()

        cur_sent = sent
        doc_id, ln = sid.split('(-.-)')[0], int(sid.split('(-.-)')[1])

        if contain_head:
            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(
                    doc_id)

            # Prepend the page title unless this is the lead sentence or the
            # sentence already contains the title.
            if ln != 0 and t_doc_id_natural_format.lower() not in sent.lower():
                cur_sent = f"{t_doc_id_natural_format} <t> " + sent

        # The sentence item is built whether or not a head was prepended.
        sent_item['text'] = cur_sent
        sent_item['sid'] = doc_id + c_scorer.SENT_LINE + str(ln)
        # sid is '[doc_id]<SENT_LINE>[line_number]'
        if evidence_set is not None:
            if (doc_id, ln) in evidence_set:
                sent_item['selection_label'] = "true"
            else:
                sent_item['selection_label'] = "false"
        else:
            sent_item['selection_label'] = "hidden"

        sent_list.append(sent_item)

    return sent_list
def disabuigation_training_build(item, cursor, contain_first_sentence=False):
    doc_t_list = [it[0] for it in item['prioritized_docids']]
    evidence_group = check_sentences.check_and_clean_evidence(item)
    all_true_t_list = set()
    t_claim = ' '.join(item['claim_tokens'])
    for ground_truth_evid in evidence_group:
        true_t_list = set([it[0] for it in ground_truth_evid])
        all_true_t_list = set.union(all_true_t_list, true_t_list)
    all_true_t_list = list(all_true_t_list)

    positive_list = []
    negative_list = []
    eid = item['id']

    # Positives: gold documents whose parenthesized title is not already
    # spelled out verbatim in the claim.
    for doc_id in all_true_t_list:
        if '-LRB-' in doc_id and common.doc_id_to_tokenized_text(
                doc_id) not in t_claim:
            positive_list.append(doc_id)

    # Negatives: every retrieved parenthesized-title document that is not gold.
    for doc_id in doc_t_list:
        if '-LRB-' in doc_id and doc_id not in all_true_t_list:
            negative_list.append(doc_id)

    return make_examples(eid,
                         positive_list,
                         negative_list,
                         t_claim,
                         cursor,
                         contain_first_sentence=contain_first_sentence)
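# Hedged usage sketch (hypothetical: `train_item` is a FEVER training item that
# carries gold evidence, `conn` as in the earlier sketches):
#
#     pos, neg = disabuigation_training_build(train_item, conn.cursor(),
#                                             contain_first_sentence=True)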