Example #1
def evidence_list_to_text(cursor,
                          evidences,
                          contain_head=True,
                          id_tokenized=False):
    """Concatenate FEVER evidence sentences into one text string.

    `evidences` holds (doc_id, line_num) pairs. With `contain_head`, the
    tokenized page title plus a '<t>' marker is inserted before the
    sentences of each new document, unless that document's evidence
    starts at line 0.
    """
    current_evidence_text = []
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

    cur_head = 'DO NOT INCLUDE THIS FLAG'  # sentinel; never a real doc_id

    for doc_id, line_num in evidences:

        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

        if contain_head and cur_head != doc_id:
            cur_head = doc_id

            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(
                    doc_id)

            if line_num != 0:
                current_evidence_text.append(f"{t_doc_id_natural_format} <t>")

        # Important change (July 16): this append was moved one line below.
        current_evidence_text.append(e_text)


    return ' '.join(current_evidence_text)
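For orientation, here is a minimal usage sketch of the function above. Everything in it is illustrative: the (doc_id, line_num) pairs are made up, and fever_db.get_cursor() is assumed to open the project's FEVER evidence database, as in the later examples.

# Minimal usage sketch (illustrative inputs only).
cursor = fever_db.get_cursor()
evidences = [('Some_Page', 2), ('Some_Page', 0), ('Another_Page', 1)]
text = evidence_list_to_text(cursor, evidences,
                             contain_head=True, id_tokenized=False)
# 'text' is one string: evidence sentences in (doc_id, line_num) order,
# with each page title and a '<t>' marker inserted before a new document
# (unless that document's evidence starts at line 0).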
Example #2
def convert_to_normalized_format(cursor, e_list, contain_head=True):
    """Convert each evidence group in `e_list` into one normalized string."""
    r_list = []
    for evidences in e_list:
        current_evidence = []
        cur_head = 'DO NOT INCLUDE THIS FLAG'

        # Important: sort evidence by (doc_id, line_num) so sentences from
        # the same document are grouped in page order.
        evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

        for doc_id, line_num in evidences:

            _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

            if contain_head and cur_head != doc_id:
                cur_head = doc_id

                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))

                if line_num != 0:
                    current_evidence.append(f"{t_doc_id_natural_format} .")

            current_evidence.append(e_text)
        r_list.append(' '.join(current_evidence))

    return r_list
Example #3
def sample_for_verifiable(cursor, e_list, contain_head=True):
    """Like `convert_to_normalized_format`, but keeps the evidence pairs
    in their given order (no sorting)."""
    r_list = []
    for evidences in e_list:
        current_evidence = []
        cur_head = 'DO NOT INCLUDE THIS FLAG'
        for doc_id, line_num in evidences:

            _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

            if contain_head and cur_head != doc_id:
                cur_head = doc_id

                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))

                if line_num != 0:
                    current_evidence.append(f"{t_doc_id_natural_format} .")

            current_evidence.append(e_text)
        r_list.append(' '.join(current_evidence))

    return r_list
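Note that Example #3 is identical to Example #2 except that it skips the initial sort. With unsorted input, sentences from the same document can be interleaved, in which case cur_head changes on every document switch and the page title may be emitted more than once per document.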
Example #4
    def expand_from_preext_sent_rule(self):
        """Follow the hyperlinks inside pre-extracted, scored sentences and
        promote the link targets to prioritized document ids."""
        if not hasattr(self, 'cursor'):
            self.cursor = fever_db.get_cursor()
        if not hasattr(self, 'preext_sent_dict'):
            d_list = read_jsonl(config.RESULT_PATH / \
                "sent_retri_nn/2018_07_17_16-34-19_r/train_scale(0.1).jsonl")
            self.preext_sent_dict = {item['id']: item for item in d_list}
        item = self.item

        new_pdocids = []
        structured_docids_sent = {}
        sent_ids = self.preext_sent_dict[item['id']]['scored_sentids']
        for sent_id, score, probability in sent_ids:
            docid, sent_ind = sent_id.split('<SENT_LINE>')
            sent_ind = int(sent_ind)
            id_list, sent_list, sent_links = \
                fever_db.get_evidence(self.cursor,
                                      docid,
                                      sent_ind)
            # Links are stored as a flat JSON list; reshape into
            # (anchor_text, target) pairs and keep the targets, then undo
            # the bracket encoding and restore underscores in page ids.
            sent_links = json.loads(sent_links)
            all_links = np.array(sent_links).reshape(-1, 2)[:, 1]
            all_links = list(map(fever_db.reverse_convert_brc, all_links))
            all_links = list(map(lambda x: x.replace(' ', '_'), all_links))
            prio_docids = [(id_link, score) for id_link in all_links]
            new_pdocids.extend(prio_docids)
            structured_docids_sent.update({sent_id: prio_docids})
        item['prioritized_docids_sent'] = new_pdocids
        item['structured_docids_sent'] = structured_docids_sent
        return self
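The link-extraction step in the loop above (it recurs in Examples #5 and #7) can be checked in isolation. A self-contained sketch, assuming sent_links is a flat JSON array of alternating anchor-text/target entries, which is what reshape(-1, 2)[:, 1] implies; the concrete entries are made up for illustration:

import json
import numpy as np

# Stand-in for the sent_links value returned by fever_db.get_evidence().
sent_links = json.dumps(['capital', 'Capital_city', 'France', 'France'])

all_links = np.array(json.loads(sent_links)).reshape(-1, 2)[:, 1]
print(all_links.tolist())  # ['Capital_city', 'France']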
Example #5
    def expand_from_preext_sent_rule(self):
        """Variant of the rule above: extend `prioritized_docids` with the
        hyperlink targets of the predicted sentences, each scored 1.0."""
        if not hasattr(self, 'cursor'):
            self.cursor = fever_db.get_cursor()
        if not hasattr(self, 'preext_sent_dict'):
            d_list = load_data(config.RESULT_PATH / \
                "sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl")
            self.preext_sent_dict = {item['id']: item for item in d_list}
        item = self.item

        new_pdocids = copy(item['prioritized_docids'])
        sent_ids = self.preext_sent_dict[item['id']]['predicted_sentids']
        for sent_id in sent_ids:
            docid, sent_ind = sent_id.split('<SENT_LINE>')
            sent_ind = int(sent_ind)
            id_list, sent_list, sent_links = \
                fever_db.get_evidence(self.cursor,
                                      docid,
                                      sent_ind)
            # Same link extraction as in Example #4: reshape the flat JSON
            # list into (anchor_text, target) pairs and keep the targets.
            sent_links = json.loads(sent_links)
            all_links = np.array(sent_links).reshape(-1, 2)[:, 1]
            all_links = list(map(reverse_convert_brc, all_links))
            new_pdocids.extend([(id_link, 1.0) for id_link in all_links])
        item['prioritized_docids'] = new_pdocids
        return self
Example #6
def evidence_list_to_text_list(cursor, evidences, contain_head=True):
    # One text per evidence item, so len(evidences) == len(text_list).
    current_evidence_text_list = []
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

    cur_head = 'DO NOT INCLUDE THIS FLAG'

    for doc_id, line_num in evidences:

        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

        cur_text = ""

        if contain_head and cur_head != doc_id:
            cur_head = doc_id

            t_doc_id_natural_format = common.doc_id_to_tokenized_text(doc_id)

            if line_num != 0:
                cur_text = f"{t_doc_id_natural_format} <t> "

        # Important change (July 16): this append was moved one line below.
        cur_text = cur_text + e_text

        current_evidence_text_list.append(cur_text)

    assert len(evidences) == len(current_evidence_text_list)
    return current_evidence_text_list
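Unlike Example #1, this variant returns one string per evidence pair rather than a single concatenated string, which suits pipelines that score each sentence separately. A short usage sketch with the same illustrative inputs as before:

# Usage sketch (illustrative inputs, as in the sketch after Example #1).
cursor = fever_db.get_cursor()
evidences = [('Some_Page', 2), ('Some_Page', 0)]
texts = evidence_list_to_text_list(cursor, evidences, contain_head=True)
assert len(texts) == len(evidences)  # one text per evidence pair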
Example #7
    def expand_from_preext_sent_rule(self):
        """Same rule as in Example #4, but assumes `self.cursor` and
        `self.preext_sent_dict` are already initialized."""
        item = self.item

        new_pdocids = []
        structured_docids_sent = {}
        sent_ids = self.preext_sent_dict[item['id']]['scored_sentids']
        for sent_id, score, probability in sent_ids:
            docid, sent_ind = sent_id.split('<SENT_LINE>')
            sent_ind = int(sent_ind)
            id_list, sent_list, sent_links = fever_db.get_evidence(
                self.cursor, docid, sent_ind)
            # Extract hyperlink targets from the sentence's flat JSON link
            # list, then restore bracket characters and underscores.
            sent_links = json.loads(sent_links)
            all_links = np.array(sent_links).reshape(-1, 2)[:, 1]
            all_links = list(map(fever_db.reverse_convert_brc, all_links))
            all_links = list(map(lambda x: x.replace(' ', '_'), all_links))
            prio_docids = [(id_link, score) for id_link in all_links]
            new_pdocids.extend(prio_docids)
            structured_docids_sent.update({sent_id: prio_docids})
        item['prioritized_docids_sent'] = new_pdocids
        item['structured_docids_sent'] = structured_docids_sent
        return self
Example #8
def get_full_list(tokenized_data_file,
                  additional_data_file,
                  pred=False,
                  top_k=None):
    """
    This method will select all the sentence from upstream doc retrieval and label the correct evident as true
    :param tokenized_data_file: Remember this is tokenized data with original format containing 'evidence'
    :param additional_data_file:    This is the data after document retrieval.
                                    This file need to contain *"predicted_docids"* field.
    :return:
    """
    cursor = fever_db.get_cursor()
    d_list = load_jsonl(tokenized_data_file)

    if not isinstance(additional_data_file, list):
        additional_d_list = load_jsonl(additional_data_file)
    else:
        additional_d_list = additional_data_file

    if top_k is not None:
        print("Upstream document number truncated to:", top_k)
        trucate_item(additional_d_list, top_k=top_k)

    additional_data_dict = {
        add_item['id']: add_item for add_item in additional_d_list
    }

    full_data_list = []

    for item in tqdm(d_list):
        doc_ids = additional_data_dict[item['id']]["predicted_docids"]

        if not pred:
            if item['evidence'] is not None:
                e_list = utils.check_sentences.check_and_clean_evidence(item)
                all_evidence_set = set(
                    itertools.chain.from_iterable(
                        [evids.evidences_list for evids in e_list]))
            else:
                all_evidence_set = None
            r_list = []
            id_list = []

            if all_evidence_set is not None:
                for doc_id, ln in all_evidence_set:
                    _, text, _ = fever_db.get_evidence(cursor, doc_id, ln)
                    r_list.append(text)
                    id_list.append(doc_id + '(-.-)' + str(ln))

        else:  # If pred, then reset to not containing ground truth evidence.
            all_evidence_set = None
            r_list = []
            id_list = []

        for doc_id in doc_ids:
            cur_r_list, cur_id_list = fever_db.get_all_sent_by_doc_id(
                cursor, doc_id, with_h_links=False)
            # Merge into the running lists, skipping duplicate sentence ids.
            for sent_text, sent_id in zip(cur_r_list, cur_id_list):
                if sent_id not in id_list:
                    r_list.append(sent_text)
                    id_list.append(sent_id)

        assert len(id_list) == len(set(id_list))  # check duplicate
        assert len(r_list) == len(id_list)

        zipped_s_id_list = list(zip(r_list, id_list))
        # Sort using id
        zipped_s_id_list = sorted(zipped_s_id_list,
                                  key=lambda x: (x[1][0], x[1][1]))

        all_sent_list = convert_to_formatted_sent(zipped_s_id_list,
                                                  all_evidence_set,
                                                  contain_head=True,
                                                  id_tokenized=True)
        cur_id = item['id']
        for i, sent_item in enumerate(all_sent_list):
            sent_item['selection_id'] = str(cur_id) + "<##>" + str(
                sent_item['sid'])
            sent_item['query'] = item['claim']

            if 'label' in item.keys():
                sent_item['claim_label'] = item['label']

            full_data_list.append(sent_item)

    return full_data_list
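Finally, a hedged sketch of driving get_full_list end to end. Both file paths are placeholders rather than paths from the source; the tokenized file must carry an 'evidence' field (and optionally 'label'), and the retrieval file must carry 'predicted_docids'.

# End-to-end usage sketch (placeholder paths).
full_data = get_full_list('data/train.tokenized.jsonl',
                          'results/doc_retrieval/train.jsonl',
                          pred=False,
                          top_k=5)
# Each returned item carries 'selection_id', 'query' (the claim), and,
# when the source item is labeled, 'claim_label'.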