예제 #1
0
    def expand_from_preext_sent_rule(self):
        """Expand prioritized doc-ids via hyperlinks in pre-extracted evidence
        sentences.

        Lazily opens a DB cursor and loads the pre-extracted sentence file on
        first use.  For each scored sentence of the current item, the
        sentence's wiki links are fetched from the DB and every link target
        becomes a candidate doc-id carrying that sentence's score.

        Writes ``item['prioritized_docids_sent']`` (flat ``(docid, score)``
        list) and ``item['structured_docids_sent']`` (``sent_id`` -> list of
        ``(docid, score)``).  Returns ``self`` for chaining.
        """
        # Lazy one-time initialization, shared across repeated calls.
        if not hasattr(self, 'cursor'):
            self.cursor = fever_db.get_cursor()
        if not hasattr(self, 'preext_sent_dict'):
            d_list = read_jsonl(config.RESULT_PATH / \
                "sent_retri_nn/2018_07_17_16-34-19_r/train_scale(0.1).jsonl")
            self.preext_sent_dict = {item['id']: item for item in d_list}
        item = self.item

        new_pdocids = []
        structured_docids_sent = {}
        # Each entry is (sent_id, score, probability); sent_id encodes
        # "<docid><SENT_LINE><sentence-index>".
        sent_ids = self.preext_sent_dict[item['id']]['scored_sentids']
        for sent_id, score, probability in sent_ids:
            docid, sent_ind = sent_id.split('<SENT_LINE>')
            sent_ind = int(sent_ind)
            id_list, sent_list, sent_links = \
                fever_db.get_evidence(self.cursor,
                                      docid,
                                      sent_ind)
            # sent_links is a JSON-encoded flat list of (anchor_text,
            # link_target) pairs; keep column 1 (the targets).  The original
            # code called np.array twice on the same data; once suffices.
            all_links = np.array(json.loads(sent_links))
            all_links = all_links.reshape(-1, 2)[:, 1]
            all_links = list(map(fever_db.reverse_convert_brc, all_links))
            # Wiki doc-ids use underscores instead of spaces.
            all_links = list(map(lambda x: x.replace(' ', '_'), all_links))
            # Every linked page inherits the evidence sentence's score.
            prio_docids = [(id_link, score) for id_link in all_links]
            new_pdocids.extend(prio_docids)
            structured_docids_sent[sent_id] = prio_docids
        item['prioritized_docids_sent'] = new_pdocids
        item['structured_docids_sent'] = structured_docids_sent
        return self
예제 #2
0
    def disambiguous_from_preext_sent_rule(self):
        # Disambiguate doc-id candidates using pre-extracted sentence results.
        # NOTE(review): the body looks unfinished -- `sent_ids` is assigned but
        # never used, and the item lookup key is the empty string '', which is
        # almost certainly a placeholder (presumably 'scored_sentids'; verify
        # against expand_from_preext_sent_rule above).
        if not hasattr(self, 'cursor'):
            self.cursor = fever_db.get_cursor()
        if not hasattr(self, 'preext_sent_dict'):
            # Lazily load the pre-extracted sentence file, indexed by item id.
            d_list = read_jsonl(config.RESULT_PATH / \
                "sent_retri_nn/2018_07_17_16-34-19_r/train_sent.jsonl")
            self.preext_sent_dict = {item['id']: item for item in d_list}
        item = self.item

        # Only items with an excessive candidate count (> 60) are considered.
        if len(item['prioritized_docids']) > 60:
            sent_ids = self.preext_sent_dict[item['id']]['']  # TODO confirm key
        return self
예제 #3
0
def tf_idf_rank(args, top_k=5):
    """Re-rank each item's prioritized docs with an online TF-IDF ranker.

    For every item in the dev list, the full text of each candidate document
    is pulled from the DB, a TF-IDF ranker is fit on those documents, and the
    item's claim tokens serve as the query.  Scores are written back to
    ``item['prioritized_docids']``; the top *top_k* doc-ids (score descending,
    doc-id as tie-breaker) become ``item['predicted_docids']``.

    On any failure the items processed so far are dumped to a uniquely-named
    file before the exception propagates, so partial work is not lost.

    :param args: ranker configuration (must provide ``hash_size``/``ngram``).
    :param top_k: number of documents kept as the final prediction.
    """
    dev_path = config.PRO_ROOT / \
               'results_old/doc_retri/docretri.basic.nopageview/dev.jsonl'

    cursor = get_cursor()
    d_list = read_jsonl(dev_path)

    d_list_test = d_list

    for i, item in enumerate(spcl(d_list_test)):
        all_ids = [it[0] for it in item['prioritized_docids']]

        try:
            # One concatenated string of sentences per candidate document.
            all_sent = []
            for doc_id in all_ids:
                r_list, _ = get_all_sent_by_doc_id(cursor,
                                                   doc_id,
                                                   with_h_links=False)
                all_sent.append(' '.join(r_list))

            ranker = OnlineTfidfDocRanker(args, args.hash_size, args.ngram,
                                          all_sent)
        except Exception as e:
            # Dump finished items so a long run can be resumed/inspected.
            if i - 1 >= 0:
                print(f'Early quit at {i-1} because of {e}')
                save_path = config.RESULT_PATH / \
                            'doc_retri/docretri.tfidfrank/' \
                            f'dev_quit_dump_{uuid4()}.json'
                DocRetrievalExperiment.dump_results(d_list_test[:i], save_path)
            raise  # bare raise keeps the original traceback intact

        rank_ind, rank_score = \
            ranker.closest_docs(' '.join(item['claim_tokens']), k=100)
        # Default every candidate to 0, then overwrite the ranked ones.
        id_score_dict = {docid: 0 for docid in all_ids}
        for ri, rs in zip(rank_ind, rank_score):
            id_score_dict[all_ids[ri]] = rs
        item['prioritized_docids'] = list(id_score_dict.items())
        # Highest score first; doc-id breaks ties deterministically.
        ranked = sorted(item['prioritized_docids'],
                        key=lambda x: (-x[1], x[0]))
        item['predicted_docids'] = list({k for k, _ in ranked[:top_k]})

    save_path = config.RESULT_PATH / 'doc_retri/docretri.tfidfrank/dev.json'
    DocRetrievalExperiment.dump_results(d_list_test, save_path)
def main():
    """Two-step document retrieval on the FEVER dev set.

    Prints evaluation before and after sentence-link expansion, then drops
    into an IPython shell for interactive inspection.
    """
    experiment = DocRetrievalExperimentTwoStep()

    # Dev split; use config.FEVER_TRAIN_JSONL instead to run on train.
    items = read_jsonl(config.FEVER_DEV_JSONL)

    # Baseline: priority-based answers, evaluated as-is.
    experiment.sample_answer_with_priority(items)
    experiment.print_eval(items)

    # Second step: fold in the pre-computed sentence-retrieval scores.
    experiment.feed_sent_file(
        "../../results/"
        "sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl")
    experiment.find_sent_link_with_priority(items, predict=True)
    experiment.print_eval(items)

    # Interactive inspection, then hard-exit (skips cleanup/atexit hooks).
    from IPython import embed
    embed()
    import os
    os._exit(1)
예제 #5
0
def main():
    """Evaluate raw-pageview document retrieval.

    Predicts the top-5 docs per item by pageview count, prints retrieval
    metrics, then drops into an IPython shell for inspection.
    """
    import os
    from chaonan_src._config import old_result_path
    from chaonan_src._utils.doc_utils import read_jsonl
    from chaonan_src._utils.spcl import spcl

    pageview_path = config.RESULT_PATH / \
                    'doc_retri/docretri.rawpageview/dev.jsonl'

    items = read_jsonl(pageview_path)

    top_k = 5
    for item in spcl(items):
        # Highest pageview first; doc-id breaks ties deterministically.
        ranked = sorted(item['docid_pageviews'],
                        key=lambda pair: (-pair[1], pair[0]))
        item['predicted_docids'] = list({doc for doc, _ in ranked[:top_k]})

    DocRetrievalExperiment.print_eval(items)

    # Interactive inspection, then hard-exit (skips cleanup/atexit hooks).
    from IPython import embed
    embed()
    os._exit(1)
def pageview_analysis():
    """Compare pageview counts of ground-truth evidence docs against the
    retrieved candidate docs, then drop into IPython for analysis."""
    from chaonan_src._doc_retrieval.item_rules import ItemRuleBuilder
    from chaonan_src._utils.doc_utils import read_jsonl
    from utils.fever_db import convert_brc

    wiki_pv = WikiPageviews()
    items = read_jsonl(
        "../../../results/doc_retri/docretri.titlematch/dev.jsonl")

    gt_evidences = []
    pre_evidences = []
    for item in items:
        gt_evidences.extend(ItemRuleBuilder\
            .get_all_docid_in_evidence(item['evidence']))
        pre_evidences.extend(it[0] for it in item['prioritized_docids'])

    # De-duplicate both sides before looking up pageviews.
    gt_evidences = set(gt_evidences)
    pre_evidences = set(pre_evidences)

    gt_count = [wiki_pv[convert_brc(doc)] for doc in gt_evidences]
    pre_count = [wiki_pv[convert_brc(doc)] for doc in pre_evidences]

    # Interactive inspection, then hard-exit (skips cleanup/atexit hooks).
    from IPython import embed
    embed()
    import os
    os._exit(1)
예제 #7
0
 def feed_sent_score_result(self, path):
     """Load a sentence-score JSONL file from *path* and index its items by
     id into ``self.preext_sent_dict``."""
     entries = read_jsonl(path)
     self.preext_sent_dict = {entry['id']: entry for entry in entries}
예제 #8
0
            # NOTE(review): fragment of a larger loop -- `item`, `top_k`, `i`,
            # `d_list`, and `ws_second` come from the enclosing (unseen) scope.
            if predict:
                # Top-k doc-ids from the primary ranking: score descending,
                # doc-id as deterministic tie-breaker.
                porg = \
                    set([k for k, v \
                           in sorted(item['prioritized_docids'],
                                     key=lambda x: (-x[1], x[0]))][:top_k])
                # Same top-k selection over the auxiliary ("aside") ranking.
                paside = \
                    set([k for k, v \
                            in sorted(item['prioritized_docids_aside'],
                                      key=lambda x: (-x[1], x[0]))][:top_k])
                # Final prediction is the union; both parts also kept
                # separately for later analysis.
                item['predicted_docids'] = list(porg | paside)
                item['predicted_docids_origin'] = list(porg)
                item['predicted_docids_aside'] = list(paside)
            d_list[i] = item
        ws_second.close()


if __name__ == "__main__":
    from chaonan_src._utils.doc_utils import read_jsonl
    import config

    # Smoke-test retrieval on a small slice of the dev set.
    d_list = read_jsonl(config.FEVER_DEV_JSONL)
    sample = d_list[:20]

    DocRetrievalClient.sample_answer_with_priority(sample)
    DocRetrievalClient.print_eval(sample)

    DocRetrievalExperiment.print_eval(sample)
예제 #9
0
def eval():
    """Print retrieval metrics for a previously dumped toy dev result."""
    # NOTE(review): this function shadows the builtin ``eval``; consider
    # renaming (e.g. ``eval_tfidf_rank``) -- kept as-is since callers may
    # already reference this name.
    save_path = config.RESULT_PATH / 'doc_retri/docretri.tfidfrank/dev_toy.json'
    d_list = read_jsonl(save_path)
    DocRetrievalExperiment.print_eval(d_list)
예제 #10
0
 def __init__(self, jl_path, train=False):
     """Load items from the JSONL file at *jl_path*.

     Batching state starts unset (``initialized``/``batch_size``) and is
     expected to be filled in by a later initialization step.
     """
     self.train = train
     self.initialized = False
     self.batch_size = None
     self.d_list = read_jsonl(jl_path)