def sample_answer_with_priority(self, d_list, top_k=5):
    for item in spcl(d_list):
        # Apply the rule-based prioritizer to fill 'prioritized_docids'.
        self.item_rb.rules(item)
        # Keep the top-k doc ids, ranked by descending score with the
        # doc id as a tie-breaker.
        item['predicted_docids'] = list(set(
            [k for k, v in sorted(item['prioritized_docids'],
                                  key=lambda x: (-x[1], x[0]))][:top_k]))
Example #2
@classmethod
def sample_answer_with_priority(cls, d_list, top_k=5):
    # Same ranking as above, but each item is enriched by a remote
    # first-round server reached over a websocket.
    ws_first = websocket.create_connection(cls.first_round_path)

    for i in spcl(range(len(d_list))):
        ws_first.send(json.dumps(d_list[i]))
        item = json.loads(ws_first.recv())
        item['predicted_docids'] = list(set(
            [k for k, v in sorted(item['prioritized_docids'],
                                  key=lambda x: (-x[1], x[0]))][:top_k]))
        d_list[i] = item
    ws_first.close()
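The websocket variants move the rule processing into a separate server process
and only round-trip JSON items. A minimal sketch of that request/response
pattern with the websocket-client package; the localhost URL and the payload
are illustrative assumptions, not the project's actual endpoint:

import json
import websocket  # pip install websocket-client

ws = websocket.create_connection('ws://localhost:8765')  # hypothetical endpoint
ws.send(json.dumps({'claim': 'Some claim text.'}))
enriched = json.loads(ws.recv())  # server returns the item with added fields
ws.close()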
Example #3
def tf_idf_rank(args, top_k=5):
    dev_path = config.PRO_ROOT / \
               'results_old/doc_retri/docretri.basic.nopageview/dev.jsonl'

    cursor = get_cursor()
    d_list = read_jsonl(dev_path)

    d_list_test = d_list

    for i, item in enumerate(spcl(d_list_test)):
        all_sent = []
        all_ids = [it[0] for it in item['prioritized_docids']]

        try:
            # Collect the full text of every candidate document.
            for doc_id in all_ids:
                r_list, _ = get_all_sent_by_doc_id(cursor,
                                                   doc_id,
                                                   with_h_links=False)
                all_sent.append(' '.join(r_list))

            # Build an on-the-fly TF-IDF ranker over just these documents.
            ranker = OnlineTfidfDocRanker(args, args.hash_size, args.ngram,
                                          all_sent)
        except Exception as e:
            # Dump whatever has been processed so far before re-raising,
            # so a long run is not lost entirely.
            if i > 0:
                print(f'Early quit at {i - 1} because of {e}')
                save_path = config.RESULT_PATH / \
                            'doc_retri/docretri.tfidfrank/' \
                            f'dev_quit_dump_{uuid4()}.json'
                DocRetrievalExperiment.dump_results(d_list_test[:i], save_path)
            raise

        # Re-score the candidates against the claim; candidates the ranker
        # does not return keep a default score of 0.
        rank_ind, rank_score = \
            ranker.closest_docs(' '.join(item['claim_tokens']), k=100)
        id_score_dict = {docid: 0 for docid in all_ids}
        id_score_dict.update({all_ids[ri]: rs
                              for ri, rs in zip(rank_ind, rank_score)})
        item['prioritized_docids'] = list(id_score_dict.items())
        item['predicted_docids'] = list(set(
            [k for k, v in sorted(item['prioritized_docids'],
                                  key=lambda x: (-x[1], x[0]))][:top_k]))

    save_path = config.RESULT_PATH / 'doc_retri/docretri.tfidfrank/dev.json'
    DocRetrievalExperiment.dump_results(d_list_test, save_path)
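tf_idf_rank only needs args to carry hash_size and ngram for the
OnlineTfidfDocRanker. A hedged sketch of invoking it with DrQA-style values;
the numbers and the Namespace wiring are illustrative assumptions, not the
project's real argument parser:

import argparse

args = argparse.Namespace(hash_size=2 ** 24, ngram=2)  # assumed defaults
tf_idf_rank(args, top_k=5)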
Example #4
def find_sent_link_with_priority(self, d_list, top_k=5, predict=False):
    for item in spcl(d_list):
        self.item_rb.second_only_rules(item)
        # Drop aside candidates that already appear in the main pool.
        pids = [it[0] for it in item['prioritized_docids']]
        item['prioritized_docids_aside'] = \
            [it for it in item['prioritized_docids_aside']
             if it[0] not in pids]
        if predict:
            # Union the top-k of the main pool and the aside pool.
            porg = set([k for k, v
                        in sorted(item['prioritized_docids'],
                                  key=lambda x: (-x[1], x[0]))][:top_k])
            paside = set([k for k, v
                          in sorted(item['prioritized_docids_aside'],
                                    key=lambda x: (-x[1], x[0]))][:top_k])
            item['predicted_docids'] = list(porg | paside)
            item['predicted_docids_origin'] = list(porg)
            item['predicted_docids_aside'] = list(paside)
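The aside logic above first removes overlap with the main pool, then unions
the two top-k sets. A tiny self-contained illustration on toy ids (not real
data):

prioritized_docids = [('A', 2.0), ('B', 1.0)]
prioritized_docids_aside = [('B', 3.0), ('C', 0.5)]

# Aside candidates already present in the main pool are dropped first ...
pids = [it[0] for it in prioritized_docids]
aside = [it for it in prioritized_docids_aside if it[0] not in pids]
# aside == [('C', 0.5)]

# ... then the two top-k sets are merged into the final prediction.
porg = {k for k, v in sorted(prioritized_docids,
                             key=lambda x: (-x[1], x[0]))[:2]}
paside = {k for k, v in sorted(aside, key=lambda x: (-x[1], x[0]))[:2]}
print(sorted(porg | paside))  # ['A', 'B', 'C']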
Example #5
def main():
    import os
    from chaonan_src._config import old_result_path
    from chaonan_src._utils.doc_utils import read_jsonl
    from chaonan_src._utils.spcl import spcl

    # pageview_path = os.path.join(config.RESULT_PATH,
    #                              'doc_retri/docretri.rawpageview/train.jsonl')
    pageview_path = config.RESULT_PATH / \
                    'doc_retri/docretri.rawpageview/dev.jsonl'
    # ori_path = os.path.join(old_result_path,
    #                         'doc_retri/docretri.pageview/dev.jsonl')

    # d_list = read_jsonl(config.FEVER_DEV_JSONL)
    # item_rb_exp = ItemRuleRawPageview()
    # doc_exp = DocRetrievalExperiment(item_rb_exp)
    # doc_exp.sample_answer_with_priority(d_list)
    # doc_exp.dump_results(d_list, save_path)
    # doc_exp.print_eval(d_list)

    # d_list_ori = read_jsonl(ori_path)
    d_list = read_jsonl(pageview_path)

    # DocRetrievalExperiment.dump_results(d_list, pageview_path)
    # item_rb = ItemRuleRawPageview()

    top_k = 5
    for item in spcl(d_list):
        # Rank purely by raw pageview count, doc id as tie-breaker.
        item['predicted_docids'] = list(set(
            [k for k, v in sorted(item['docid_pageviews'],
                                  key=lambda x: (-x[1], x[0]))][:top_k]))
    # DocRetrievalExperiment.dump_results(d_list, save_path)
    DocRetrievalExperiment.print_eval(d_list)
    from IPython import embed
    embed()
    os._exit(1)
Example #6
@classmethod
def find_sent_link_with_priority(cls, d_list, top_k=5, predict=False):
    # Same two-pool logic as above, with items enriched by a remote
    # second-round server reached over a websocket.
    ws_second = websocket.create_connection(cls.second_round_path)
    for i in spcl(range(len(d_list))):
        ws_second.send(json.dumps(d_list[i]))
        item = json.loads(ws_second.recv())
        # Drop aside candidates that already appear in the main pool.
        pids = [it[0] for it in item['prioritized_docids']]
        item['prioritized_docids_aside'] = \
            [it for it in item['prioritized_docids_aside']
             if it[0] not in pids]
        if predict:
            porg = set([k for k, v
                        in sorted(item['prioritized_docids'],
                                  key=lambda x: (-x[1], x[0]))][:top_k])
            paside = set([k for k, v
                          in sorted(item['prioritized_docids_aside'],
                                    key=lambda x: (-x[1], x[0]))][:top_k])
            item['predicted_docids'] = list(porg | paside)
            item['predicted_docids_origin'] = list(porg)
            item['predicted_docids_aside'] = list(paside)
        d_list[i] = item
    ws_second.close()
Example #7
def count_pageviews():
    """Count pageviews across all dump files and save them to a dict."""
    import gzip
    import pickle
    from collections import defaultdict
    from glob import glob

    en_wiki_count = defaultdict(int)

    all_files = glob(save_path_root + '**/*.gz', recursive=True)
    for fname in spcl(all_files, every=1):
        with gzip.open(fname, mode='rb') as f:
            # Wikimedia pageview dump lines: "<project> <title> <count> <bytes>".
            for line in f:
                if line.startswith(b'en'):
                    lsplits = line.split()
                    if len(lsplits) != 4:
                        continue
                    _, docid, count, _ = lsplits
                    docid = docid.decode('utf-8')
                    en_wiki_count[docid] += int(count)

    print("All done! Cheers!")
    pv_dump_path = str(config.RESULT_PATH) + "/chaonan99/pageviews.pkl"
    with open(pv_dump_path, 'wb') as f:
        pickle.dump(dict(en_wiki_count), f)
    from IPython import embed
    embed()
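Once count_pageviews has run, the pickle holds a plain {docid: count} dict.
A minimal sketch of reading it back (same path as above; the file must
already exist):

import pickle

pv_dump_path = str(config.RESULT_PATH) + "/chaonan99/pageviews.pkl"
with open(pv_dump_path, 'rb') as f:
    en_wiki_count = pickle.load(f)
print(en_wiki_count.get('Japan', 0))  # 0 if a title never appeared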