Example #1
def evidence_list_to_text(cursor,
                          evidences,
                          contain_head=True,
                          id_tokenized=False):
    """Join evidence sentences into one text string, optionally prefixing
    each document's sentences with its (tokenized) title."""
    current_evidence_text = []
    # Sort so evidence from the same document is grouped in line order.
    evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

    # Sentinel that never matches a real doc_id.
    cur_head = 'DO NOT INCLUDE THIS FLAG'

    for doc_id, line_num in evidences:

        _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

        if contain_head and cur_head != doc_id:
            cur_head = doc_id

            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(
                    doc_id)

            if line_num != 0:
                # Prepend the tokenized title (followed by the '<t>' marker)
                # only when the evidence is not the first line of the page.
                current_evidence_text.append(f"{t_doc_id_natural_format} <t>")

        current_evidence_text.append(e_text)

    return ' '.join(current_evidence_text)
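
A minimal usage sketch, assuming a plain sqlite3 connection to the FEVER evidence database (the path and evidence pairs below are illustrative; the project may provide its own connection helper in fever_db):

import sqlite3

# Hypothetical setup: the database path and evidence pairs are illustrative.
conn = sqlite3.connect('data/fever/fever.db')
cursor = conn.cursor()
evidences = [('Barack_Obama', 3), ('Barack_Obama', 0)]

# Evidence is sorted and grouped by document; the tokenized title plus the
# '<t>' marker is inserted before non-first-line evidence.
text = evidence_list_to_text(cursor, evidences,
                             contain_head=True, id_tokenized=False)
print(text)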
Example #2
def convert_to_normalized_format(cursor, e_list, contain_head=True):
    """Convert each group of evidence into a single normalized text string."""
    r_list = []
    for evidences in e_list:
        current_evidence = []
        # Sentinel that never matches a real doc_id.
        cur_head = 'DO NOT INCLUDE THIS FLAG'

        # Sort so evidence from the same document is grouped in line order;
        # this lets the cur_head check emit each title at most once.
        evidences = sorted(evidences, key=lambda x: (x[0], x[1]))

        for doc_id, line_num in evidences:

            _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

            if contain_head and cur_head != doc_id:
                cur_head = doc_id

                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))

                if line_num != 0:
                    # Prepend the tokenized title only when the evidence is
                    # not the first line of the page.
                    current_evidence.append(f"{t_doc_id_natural_format} .")

            current_evidence.append(e_text)
        r_list.append(' '.join(current_evidence))

    return r_list
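
The (doc_id, line_num) sort is what lets the cur_head check emit each document title at most once; a tiny sketch of the grouping effect, with made-up ids:

evidences = [('B_Doc', 2), ('A_Doc', 1), ('B_Doc', 0)]
print(sorted(evidences, key=lambda x: (x[0], x[1])))
# -> [('A_Doc', 1), ('B_Doc', 0), ('B_Doc', 2)]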
Example #3
def sample_for_verifiable(cursor, e_list, contain_head=True):
    """Like convert_to_normalized_format, but the evidence is kept in its
    original order rather than sorted."""
    r_list = []
    for evidences in e_list:
        current_evidence = []
        # Sentinel that never matches a real doc_id.
        cur_head = 'DO NOT INCLUDE THIS FLAG'
        for doc_id, line_num in evidences:

            _, e_text, _ = fever_db.get_evidence(cursor, doc_id, line_num)

            if contain_head and cur_head != doc_id:
                cur_head = doc_id

                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))

                if line_num != 0:
                    # Prepend the tokenized title only when the evidence is
                    # not the first line of the page.
                    current_evidence.append(f"{t_doc_id_natural_format} .")

            current_evidence.append(e_text)
        r_list.append(' '.join(current_evidence))

    return r_list
Example #4
    def pageview_spiral_aside_rule(self):
        """Re-rank each 'aside' doc-id group by Wikipedia pageview count,
        assigning linearly decaying priorities."""
        if not hasattr(self, 'wiki_pv'):
            # Lazily load the pageview dictionary on first use.
            print("Loading wiki pageview dict")
            self.wiki_pv = WikiPageviews()

        item = self.item
        changed = False
        for key, group_prio_docids in item['structured_docids_aside'].items():
            group_docids = [it[0] for it in group_prio_docids]
            if len(group_docids) > 1:
                changed = True
                all_scores = map(lambda x: self.wiki_pv[fever_db.convert_brc(x)],
                                 group_docids)
                all_scores = np.array(list(all_scores))
                # Rank group members by pageview count, descending.
                prios = np.argsort(all_scores)[::-1]
                new_gpd = []
                for i, p in enumerate(prios):
                    # Priority decays linearly with rank: 1.0, 0.8, 0.6, ...
                    new_gpd.append((group_prio_docids[p][0],
                                    max(1.0 - i*0.2, 0)))
                item['structured_docids_aside'][key] = new_gpd

        if changed:
            # Flatten the grouped (doc_id, priority) pairs into a single
            # deduplicated list.
            finded_keys = {i for ii in item['structured_docids_aside'].values()
                           for i in ii}
            item['prioritized_docids_aside'] = list(finded_keys)
        return self
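
The re-ranking step in isolation, as a minimal sketch with made-up pageview counts:

import numpy as np

scores = np.array([120, 5, 43])   # hypothetical pageview counts per doc
order = np.argsort(scores)[::-1]  # indices by score, descending: [0, 2, 1]
# Priorities decay linearly with rank: 1.0 for the most viewed, then 0.8, ...
priorities = [(int(idx), max(1.0 - rank * 0.2, 0))
              for rank, idx in enumerate(order)]
print(priorities)  # [(0, 1.0), (2, 0.8), (1, 0.6)]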
Example #5
def tokenize_doc_id(doc_id, tokenizer):
    """Convert a doc id to natural text and tokenize it, returning the
    words and lemmas (the tokenizer is expected to be CoreNLP-style)."""
    doc_id_natural_format = fever_db.convert_brc(doc_id).replace('_', ' ')
    tokenized_doc_id = e_tokenize(doc_id_natural_format, tokenizer)
    t_doc_id_natural_format = tokenized_doc_id.words()
    lemmas = tokenized_doc_id.lemmas()
    return t_doc_id_natural_format, lemmas
Example #6
def parse_doc_id(doc_id, tokenizer=None):
    """Convert a doc id to natural text; if a tokenizer is given, also
    return its words and lemmas."""
    doc_id = convert_brc(doc_id)
    doc_id = doc_id.replace('_', ' ')

    tokens = None
    lemmas = None

    if tokenizer is not None:
        tok_r = tokenizer.tokenize(doc_id)
        tokens = tok_r.words()
        lemmas = tok_r.lemmas()

    return tokens, lemmas
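
For context, convert_brc undoes the bracket encoding that FEVER document ids use for punctuation. A rough sketch of the expected behavior (the real implementation lives in utils.fever_db and may cover more codes):

def convert_brc_sketch(doc_id):
    # FEVER-style ids encode punctuation as -LRB-, -RRB-, -COLON-, etc.
    return (doc_id.replace('-LRB-', '(')
                  .replace('-RRB-', ')')
                  .replace('-COLON-', ':'))

print(convert_brc_sketch('The_Dark_Knight_-LRB-film-RRB-').replace('_', ' '))
# -> 'The Dark Knight (film)'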
Example #7
def pageview_analysis():
    """Compare Wikipedia pageview counts for ground-truth evidence docs
    against retrieved docs on the dev set."""
    from chaonan_src._doc_retrieval.item_rules import ItemRuleBuilder
    from chaonan_src._utils.doc_utils import read_jsonl
    from utils.fever_db import convert_brc

    wiki_pv = WikiPageviews()
    d_list = read_jsonl(
        "../../../results/doc_retri/docretri.titlematch/dev.jsonl")
    gt_evidences, pre_evidences = [], []
    for item in d_list:
        gt_evidences.extend(ItemRuleBuilder\
            .get_all_docid_in_evidence(item['evidence']))
        pre_evidences.extend([it[0] for it in item['prioritized_docids']])
    gt_evidences = set(gt_evidences)
    pre_evidences = set(pre_evidences)

    gt_count = [wiki_pv[convert_brc(it)] for it in gt_evidences]
    pre_count = [wiki_pv[convert_brc(it)] for it in pre_evidences]

    # Drop into an interactive shell to inspect the counts, then hard-exit.
    from IPython import embed
    embed()
    import os
    os._exit(1)
Example #8
def did_to_keys(doc_id, tokenizer=None):
    """Turn a doc id into deduplicated lookup keys; with a tokenizer, also
    return lemmas and entity groups."""
    doc_id = convert_brc(doc_id)
    doc_id = doc_id.replace('_', ' ')
    id_keys = []
    lemmas = None
    entities = None
    if tokenizer is not None:
        tok_r = tokenizer.tokenize(doc_id)
        to_key = ' '.join(tok_r.words())
        id_keys.append(to_key)

        lemmas = tok_r.lemmas()
        entities = tok_r.entity_groups()

    return list(set(id_keys)), lemmas, entities
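
Illustrative call with a made-up id: without a tokenizer the function returns no keys and None for lemmas and entities; with one, it returns a single whitespace-joined key plus lemmas and entity groups.

keys, lemmas, entities = did_to_keys('Barack_Obama')
print(keys, lemmas, entities)  # -> [] None None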
Example #9
def convert_to_formatted_sent(zipped_s_id_list,
                              evidence_set,
                              contain_head=True,
                              id_tokenized=True):
    """Convert (sentence, sid) pairs into dicts carrying the (possibly
    title-prefixed) text, a normalized sid, and a selection label."""
    sent_list = []
    for sent, sid in zipped_s_id_list:
        sent_item = dict()

        cur_sent = sent
        parts = sid.split('(-.-)')
        doc_id, ln = parts[0], int(parts[1])

        if contain_head:
            if not id_tokenized:
                doc_id_natural_format = fever_db.convert_brc(doc_id).replace(
                    '_', ' ')
                t_doc_id_natural_format = ' '.join(
                    easy_tokenize(doc_id_natural_format))
            else:
                t_doc_id_natural_format = common.doc_id_to_tokenized_text(
                    doc_id)

            if ln != 0 and t_doc_id_natural_format.lower() not in sent.lower():
                # Prepend the tokenized title unless this is the first line
                # of the page or the sentence already contains the title.
                cur_sent = f"{t_doc_id_natural_format} <t> " + sent

        sent_item['text'] = cur_sent
        # sid is '[doc_id]<SENT_LINE>[line_number]'
        sent_item['sid'] = doc_id + c_scorer.SENT_LINE + str(ln)
        if evidence_set is not None:
            sent_item['selection_label'] = (
                "true" if (doc_id, ln) in evidence_set else "false")
        else:
            sent_item['selection_label'] = "hidden"

        sent_list.append(sent_item)

    return sent_list
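
A minimal usage sketch (the sentence, sid, and evidence set are illustrative; note the '(-.-)' separator inside the sid, and that id_tokenized=True relies on the project's common.doc_id_to_tokenized_text):

zipped = [('He was the 44th president .', 'Barack_Obama(-.-)5')]
evidence_set = {('Barack_Obama', 5)}
for s in convert_to_formatted_sent(zipped, evidence_set,
                                   contain_head=True, id_tokenized=True):
    print(s['sid'], s['selection_label'])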
Example #10
    def pageview_rule(self):
        """Assign high priority to frequently viewed pages."""
        if not hasattr(self, 'wiki_pv'):
            # Lazily load the pageview dictionary on first use.
            print("Loading wiki pageview dict")
            self.wiki_pv = WikiPageviews()

        item = self.item

        for key, group_prio_docids in item['structured_docids'].items():
            group_docids = [it[0] for it in group_prio_docids]
            all_scores = map(lambda x: self.wiki_pv[convert_brc(x)],
                             group_docids)
            all_scores = np.array(list(all_scores))
            # Rank group members by pageview count, descending.
            prios = np.argsort(all_scores)[::-1]
            new_gpd = []
            for i, p in enumerate(prios):
                # Here the raw pageview count itself becomes the new
                # priority (cf. the rank-decay in pageview_spiral_aside_rule).
                new_gpd.append((group_prio_docids[p][0], int(all_scores[p])))
            item['structured_docids'][key] = new_gpd

        try:
            # Flatten the grouped (doc_id, priority) pairs into a single
            # deduplicated list.
            finded_keys = {i for ii in item['structured_docids'].values()
                           for i in ii}
            item['prioritized_docids'] = list(finded_keys)
        except Exception:
            # Debugging aid: drop into an interactive shell on failure,
            # then hard-exit.
            from IPython import embed
            embed()
            import os
            os._exit(1)
        return self
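
The flattening inside the try block, in isolation with hypothetical data:

structured = {'k1': [('Doc_A', 120), ('Doc_B', 43)],
              'k2': [('Doc_A', 120)]}
flat = {pair for pairs in structured.values() for pair in pairs}
print(sorted(flat))  # [('Doc_A', 120), ('Doc_B', 43)]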