Code Example #1
def precompute_forward_items_and_cache():
    # 3 places need to be switched when changing between train/dev/test !!!

    is_training = False
    doc_results = common.load_json(
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")
        # config.PRO_ROOT / "results/doc_retri_results/doc_retrieval_final_v8/hotpot_dev_doc_retrieval_v8_before_multihop_filtering.json")
        config.PRO_ROOT /
        "results/doc_retri_results/doc_retrieval_final_v8/hotpot_test_doc_retrieval_v8_before_multihop_filtering.json"
    )
    doc_results = results_multihop_filtering(doc_results,
                                             multihop_retrieval_top_k=3,
                                             strict_mode=True)

    # db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)

    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    # data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    # data_list = common.load_json(config.TRAIN_FILE)
    append_baseline_context(doc_results, data_list)

    fitem_list = build_full_wiki_document_forward_item(doc_results, data_list,
                                                       is_training,
                                                       t_db_cursor, True)

    print(len(fitem_list))
    common.save_jsonl(
        fitem_list, config.PDATA_ROOT / "content_selection_forward" /
        "hotpot_test_p_level_unlabeled.jsonl")
Code Example #2
File: raw_text_db.py  Project: tushar117/multihopQA
def iterative_build_raw_text():
    wiki_whole_db_cursor = get_cursor(str(config.WHOLE_WIKI_DB))
    wiki_whole_db_cursor.execute("SELECT * from unnamed")
    total_count = 0
    insert_data_list = []

    db_path = config.WHOLE_WIKI_RAW_TEXT
    conn = sqlite3.connect(str(db_path))
    saving_cursor = conn.cursor()

    for key, value in tqdm(wiki_whole_db_cursor, total=TOTAL_ARTICLE_COUNT):
        cur_item = json.loads(value)
        raw_text = get_raw_text(cur_item)

        article_title = cur_item['title']

        for p_num, paragraph in enumerate(raw_text):
            assert isinstance(paragraph, list)
            p_str = json.dumps(paragraph)
            insert_data_list.append((article_title, p_num, p_str))
            total_count += 1

        if len(insert_data_list) >= 5000:
            insert_many_raw_text_table(saving_cursor, insert_data_list)
            conn.commit()  # commit once per flushed batch rather than once per paragraph
            insert_data_list = []

        # if total_count >= 10000:
        #     break

    print(total_count)
    insert_many_raw_text_table(saving_cursor, insert_data_list)
    conn.commit()
    conn.close()
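
insert_many_raw_text_table is defined elsewhere in the project and is not shown here. As a minimal sketch, assuming a table keyed by (title, p_num) with the paragraph stored as a JSON string, it could look like the following (the table and column names are guesses, not taken from the repository):

def insert_many_raw_text_table(cursor, data_list):
    # data_list holds (article_title, p_num, p_str) tuples built in the loop above.
    cursor.executemany(
        "INSERT INTO raw_text_table (title, p_num, p_str) VALUES (?, ?, ?)",
        data_list)
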
Code Example #3
def get_sentence_pair(top_k, d_list, p_level_results_list, is_training, debug_mode=False):
    #
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    #
    # dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    dev_list = d_list

    # cur_dev_eval_results_list = common.load_jsonl(
    #     config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_document_level/2019_4_17/dev_p_level_bert_v1_results.jsonl")
    cur_dev_eval_results_list = p_level_results_list

    if debug_mode:
        dev_list = dev_list[:100]
        id_set = set([item['_id'] for item in dev_list])
        cur_dev_eval_results_list = [item for item in p_level_results_list if item['qid'] in id_set]

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, '_id')

    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(cur_dev_eval_results_list, copied_dev_o_dict,
                                                          'qid', 'fid', check=True)
    cur_results_dict_top2 = select_top_k_and_to_results_dict(copied_dev_o_dict, top_k=top_k, filter_value=None)
    # print(cur_results_dict_top2)
    fitems = build_sentence_forward_item(cur_results_dict_top2, dev_list, is_training=is_training,
                                         db_cursor=t_db_cursor)

    return fitems
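
A possible call pattern for get_sentence_pair, reconstructed from the commented-out lines inside the function (the results file path is only an example):

d_list = common.load_json(config.DEV_FULLWIKI_FILE)
p_level_results_list = common.load_jsonl(
    config.PRO_ROOT / "data/p_hotpotqa/hotpotqa_document_level/2019_4_17/dev_p_level_bert_v1_results.jsonl")
fitems = get_sentence_pair(top_k=2, d_list=d_list,
                           p_level_results_list=p_level_results_list,
                           is_training=False, debug_mode=True)
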
Code Example #4
File: qa_sampler.py  Project: tushar117/multihopQA
def inspect_upstream_eval():
    is_training = True
    debug_mode = True
    d_list = common.load_jsonl(config.OPEN_SQUAD_DEV_GT)
    in_file_name = config.PRO_ROOT / 'saved_models/05-12-08:44:38_mtr_open_qa_p_level_(num_train_epochs:3)/i(2000)|e(2)|squad|top10(0.6909176915799432)|top20(0.7103122043519394)|seed(12)_eval_results.jsonl'
    cur_eval_results_list = common.load_jsonl(in_file_name)
    top_k = 10
    filter_value = 0.1
    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)
    match_type = 'string'

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['question'] for item in d_list])
        cur_eval_results_list = [
            item for item in cur_eval_results_list if item['qid'] in id_set
        ]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'question')
    copied_d_o_dict = copy.deepcopy(d_o_dict)

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_d_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_top10 = od_sample_utils.select_top_k_and_to_results_dict(
        copied_d_o_dict,
        score_field_name='prob',
        top_k=top_k,
        filter_value=filter_value)

    forward_example_items = build_open_qa_forword_item(cur_results_dict_top10,
                                                       d_list, is_training,
                                                       t_cursor, match_type)

    print(forward_example_items)
Code Example #5
def whole_wiki_pages_analysis():
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    with SqliteDict(str(config.WHOLE_WIKI_DB),
                    flag='r',
                    encode=json.dumps,
                    decode=json.loads) as whole_wiki_db:
        for key, value in whole_tokenized_db_cursor:
            valid_page = True
            item = json.loads(value)
            # print(item)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])

            if abs_index == -1:
                valid_page = False
                # print(whole_wiki_db[article_title])
                # This page is not valid.

            article_term_list = []
            article_poss_list = []

            title_term_list = []
            title_poss_list = []

            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)

            for p_i, (paragraph_text, paragraph_poss) in enumerate(
                    zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text,
                                                paragraph_poss):
                    if p_i == 0:  # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue  # If the terms are in the title, we don't include them in the abstract and article term lists.
                    else:
                        if p_i == abs_index:  # If the terms are in abstract
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)

                        article_term_list.extend(sent_text)
                        article_poss_list.extend(sent_poss)

            print("Title:", title_term_list, title_poss_list)

            print(
                "Title:(ngram):",
                get_ngrams(title_term_list,
                           title_poss_list,
                           3,
                           included_tags=POS_INCLUDED))
Code Example #6
def prepare_forward_data(dataset_name, tag, is_training, upstream_top_k=20, distant_gt_top_k=2, down_sample_ratio=None,
                         debug=False):
    if dataset_name == 'webq' and tag == 'test':
        gt_d_list_path = config.OPEN_WEBQ_TEST_GT
    elif dataset_name == 'webq' and tag == 'train':
        gt_d_list_path = config.OPEN_WEBQ_TRAIN_GT
    elif dataset_name == 'curatedtrec' and tag == 'test':
        gt_d_list_path = config.OPEN_CURATEDTERC_TEST_GT
    elif dataset_name == 'curatedtrec' and tag == 'train':
        gt_d_list_path = config.OPEN_CURATEDTERC_TRAIN_GT
    elif dataset_name == 'squad' and tag == 'dev':
        gt_d_list_path = config.OPEN_SQUAD_DEV_GT
    elif dataset_name == 'squad' and tag == 'train':
        gt_d_list_path = config.OPEN_SQUAD_TRAIN_GT
    elif dataset_name == 'wikimovie' and tag == 'test':
        gt_d_list_path = config.OPEN_WIKIM_TEST_GT
    elif dataset_name == 'wikimovie' and tag == 'train':
        gt_d_list_path = config.OPEN_WIKIM_TRAIN_GT
    else:
        raise NotImplementedError()

    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)
    # debug = False
    # upstream_top_k = 20
    # distant_gt_top_k = 2
    # down_sample_ratio = None

    if dataset_name != 'wikimovie':
        upstream_d_list_before_filter = common.load_jsonl(
            config.PRO_ROOT / f"data/p_{dataset_name}/tf_idf_p_level/{dataset_name}_{tag}_para_tfidf.jsonl")
    else:
        upstream_d_list_before_filter = common.load_jsonl(
            config.PRO_ROOT / f"data/p_{dataset_name}/kwm_p_level/{dataset_name}_{tag}_kwm_tfidf.jsonl")

    if debug:
        upstream_d_list_before_filter = upstream_d_list_before_filter[:50]
    upstream_d_list = top_k_filter_score_list(upstream_d_list_before_filter, top_k=upstream_top_k)

    upstream_d_dict = list_dict_data_tool.list_to_dict(upstream_d_list, 'question')

    gt_d_list = common.load_jsonl(gt_d_list_path)
    gt_d_dict = list_dict_data_tool.list_to_dict(gt_d_list, 'question')
    distant_gt_item_list = get_distant_top_k_ground_truth(gt_d_dict, upstream_d_list_before_filter,
                                                          top_k=distant_gt_top_k)
    distant_gt_item_dict = list_dict_data_tool.list_to_dict(distant_gt_item_list, 'qid')

    fitems_list = build_p_level_forward_item(upstream_d_dict, distant_gt_item_dict, upstream_d_list, is_training,
                                             t_cursor)
    if is_training:
        return down_sample_neg(fitems_list, down_sample_ratio)
    else:
        return down_sample_neg(fitems_list, None)
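
For reference, prepare_forward_data might be called like this; the argument values are illustrative only:

train_fitems = prepare_forward_data('squad', 'train', is_training=True,
                                    upstream_top_k=20, distant_gt_top_k=2,
                                    down_sample_ratio=0.25, debug=False)
dev_fitems = prepare_forward_data('squad', 'dev', is_training=False, debug=True)
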
Code Example #7
def iterative_build():
    # wiki_abs_db_cursor = get_cursor(str(config.ABS_WIKI_DB))
    wiki_whole_db_cursor = get_cursor(str(config.WHOLE_WIKI_DB))
    wiki_whole_db_cursor.execute("SELECT * from unnamed")
    total_count = 0
    cur_count = 0

    with SqliteDict(str(config.WHOLE_PROCESS_FOR_RINDEX_DB), encode=json.dumps, decode=json.loads) as whole_rindex_db:
        for key, value in tqdm(wiki_whole_db_cursor, total=TOTAL_ARTICLE_COUNT):
            cur_item = json.loads(value)
            # print(cur_item)
            clean_text = item_get_clean_text(cur_item)
            # print(clean_text)
            new_item = dict()
            new_item['title'] = cur_item['title']

            flatten_article_tokens = []

            for p_i, paragraph in enumerate(clean_text):
                # flatten_paragraph_tokens = []
                # paragraph_poss = []
                for s_i, sentence in enumerate(paragraph):
                    flatten_article_tokens.extend(sentence)

                # flatten_article_tokens.extend(flatten_paragraph_tokens)

            flatten_article_poss = spacy_get_pos(flatten_article_tokens)

            cur_ptr = 0
            article_poss = []
            for p_i, paragraph in enumerate(clean_text):
                paragraph_poss = []
                for s_i, sentence in enumerate(paragraph):
                    sentence_poss = []
                    for _ in sentence:
                        sentence_poss.append(flatten_article_poss[cur_ptr])
                        cur_ptr += 1
                    paragraph_poss.append(sentence_poss)
                article_poss.append(paragraph_poss)

            new_item['clean_text'] = clean_text
            new_item['poss'] = article_poss
            whole_rindex_db[new_item['title']] = new_item

            cur_count += 1

            if cur_count % 5000 == 0:
                whole_rindex_db.commit()

        whole_rindex_db.commit()
        whole_rindex_db.close()
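
The pointer walk in iterative_build (flatten every sentence, POS-tag the flat token list once with spacy_get_pos, then re-nest the tags so they mirror the paragraph/sentence structure) can be expressed as a small standalone helper. The sketch below only illustrates that re-nesting pattern; it is not code from the project.

def realign_flat_tags(nested_tokens, flat_tags):
    # Re-nest a flat tag list so it mirrors nested_tokens' paragraph/sentence shape.
    cur_ptr = 0
    nested_tags = []
    for paragraph in nested_tokens:
        paragraph_tags = []
        for sentence in paragraph:
            paragraph_tags.append(flat_tags[cur_ptr:cur_ptr + len(sentence)])
            cur_ptr += len(sentence)
        nested_tags.append(paragraph_tags)
    assert cur_ptr == len(flat_tags)
    return nested_tags
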
Code Example #8
File: qa_sampler.py  Project: tushar117/multihopQA
def get_open_qa_item_with_upstream_paragraphs(d_list,
                                              cur_eval_results_list,
                                              is_training,
                                              tokenizer: BertTokenizer,
                                              max_context_length,
                                              max_query_length,
                                              doc_stride=128,
                                              debug_mode=False,
                                              top_k=10,
                                              filter_value=0.1,
                                              match_type='string'):
    t_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_RAW_TEXT)

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['question'] for item in d_list])
        cur_eval_results_list = [
            item for item in cur_eval_results_list if item['qid'] in id_set
        ]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, 'question')
    copied_d_o_dict = copy.deepcopy(d_o_dict)

    list_dict_data_tool.append_subfield_from_list_to_dict(
        cur_eval_results_list, copied_d_o_dict, 'qid', 'fid', check=True)

    cur_results_dict_top10 = od_sample_utils.select_top_k_and_to_results_dict(
        copied_d_o_dict,
        score_field_name='prob',
        top_k=top_k,
        filter_value=filter_value)

    forward_example_items = build_open_qa_forword_item(cur_results_dict_top10,
                                                       d_list, is_training,
                                                       t_cursor, match_type)
    forward_example_items = format_convert(forward_example_items, is_training)
    fitems_dict, read_fitems_list = span_preprocess_tool.eitems_to_fitems(
        forward_example_items, tokenizer, is_training, max_context_length,
        max_query_length, doc_stride, False)

    return fitems_dict, read_fitems_list, cur_results_dict_top10['pred_p_list']
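
A hedged usage sketch for get_open_qa_item_with_upstream_paragraphs, mirroring the inputs used in inspect_upstream_eval (Code Example #4); the tokenizer initialization, the length limits, and the eval-results path placeholder are assumptions:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
d_list = common.load_jsonl(config.OPEN_SQUAD_DEV_GT)
cur_eval_results_list = common.load_jsonl(p_level_eval_results_file)  # hypothetical path to a p-level eval results jsonl
fitems_dict, read_fitems_list, pred_p_list = get_open_qa_item_with_upstream_paragraphs(
    d_list, cur_eval_results_list, is_training=False, tokenizer=tokenizer,
    max_context_length=320, max_query_length=64, doc_stride=128,
    top_k=10, filter_value=0.1, match_type='string')
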
Code Example #9
def get_qa_item_with_upstream_sentence(d_list,
                                       sentence_level_results,
                                       is_training,
                                       tokenizer: BertTokenizer,
                                       max_context_length,
                                       max_query_length,
                                       doc_stride=128,
                                       debug_mode=False,
                                       top_k=5,
                                       filter_value=0.2):
    t_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    if debug_mode:
        d_list = d_list[:100]
        id_set = set([item['_id'] for item in d_list])
        sentence_level_results = [
            item for item in sentence_level_results if item['qid'] in id_set
        ]

    d_o_dict = list_dict_data_tool.list_to_dict(d_list, '_id')
    copied_d_o_dict = copy.deepcopy(d_o_dict)
    list_dict_data_tool.append_subfield_from_list_to_dict(
        sentence_level_results, copied_d_o_dict, 'qid', 'fid', check=True)

    cur_results_dict = select_top_k_and_to_results_dict(
        copied_d_o_dict,
        top_k=top_k,
        score_field_name='prob',
        filter_value=filter_value,
        result_field='sp')

    forward_example_items = build_qa_forword_item(cur_results_dict, d_list,
                                                  is_training, t_db_cursor)
    forward_example_items = format_convert(forward_example_items, is_training)
    fitems_dict, read_fitems_list = span_preprocess_tool.eitems_to_fitems(
        forward_example_items, tokenizer, is_training, max_context_length,
        max_query_length, doc_stride, False)

    return fitems_dict, read_fitems_list, cur_results_dict['sp']
Code Example #10
def iterative_build_raw_text():
    wiki_whole_db_cursor = get_cursor(str(config.WHOLE_WIKI_DB))
    wiki_whole_db_cursor.execute("SELECT * from unnamed")
    total_count = 0
    cur_count = 0

    with SqliteDict(str(config.WHOLE_WIKI_RAW_TEXT), encode=json.dumps, decode=json.loads) as whole_rindex_db:
        for key, value in tqdm(wiki_whole_db_cursor, total=TOTAL_ARTICLE_COUNT):
            cur_item = json.loads(value)
            raw_text = get_raw_text(cur_item)

            new_item = dict()
            new_item['title'] = cur_item['title']
            new_item['raw_text'] = raw_text
            whole_rindex_db[new_item['title']] = new_item

            cur_count += 1

            if cur_count % 5000 == 0:
                whole_rindex_db.commit()
                # break

        whole_rindex_db.commit()
        whole_rindex_db.close()
Code Example #11
File: pylucene_search.py  Project: michaelmoju/nlu_IR
def lucene_indexing():
    lucene.initVM()
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    indexDir = SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED)))
    analyzer = PorterStemmerAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    lprint("Building lucene index ...")
    with SqliteDict(str(config.WHOLE_WIKI_DB),
                    flag='r',
                    encode=json.dumps,
                    decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor,
                               total=config.TOTAL_ARTICLE_NUMBER_WHOLE):

            item = json.loads(value)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            # TODO: change it to extract abstract wiki?
            # get the first paragraph whose length is >= 50? so weird.
            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])

            if abs_index == -1:  # document too short
                valid_page = False

            # only title
            title_term_list = []
            title_poss_list = []

            # only abstract content
            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)

            for p_i, (paragraph_text, paragraph_poss) in enumerate(
                    zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text,
                                                paragraph_poss):
                    if p_i == 0:  # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue  # If the terms are in title, we don't include those terms in abstract and article term.
                    else:
                        if p_i == abs_index:  # If the terms are in abstract
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)

            added_title = article_title
            added_text = " ".join(title_term_list + abstract_term_list)

            doc = Document()
            doc.add(Field("title", added_title, StoredField.TYPE))
            doc.add(Field("text", added_text, TextField.TYPE_STORED))
            writer.addDocument(doc)
    writer.close()
Code Example #12
def whole_wiki_pages_title_raw_indexing():
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    title_abs_raw_indexdb = IndexDB()
    abs_file_name = config.PDATA_ROOT / "reverse_indexing/abs_rindexdb"

    content_indexdb = IndexDB()
    content_index_file_name = ''

    with SqliteDict(str(config.WHOLE_WIKI_DB),
                    flag='r',
                    encode=json.dumps,
                    decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
            valid_page = True
            item = json.loads(value)
            # print(item)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])

            if abs_index == -1:
                valid_page = False

                # print(whole_wiki_db[article_title])
                # This page is not valid.

            article_term_list = []
            article_poss_list = []

            title_term_list = []
            title_poss_list = []

            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)

            for p_i, (paragraph_text, paragraph_poss) in enumerate(
                    zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text,
                                                paragraph_poss):
                    if p_i == 0:  # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue  # If the terms are in the title, we don't include them in the abstract and article term lists.
                    else:
                        if p_i == abs_index:  # If the terms are in abstract
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)

                        article_term_list.extend(sent_text)
                        article_poss_list.extend(sent_poss)

            # print("Title:", title_term_list, title_poss_list)

            title_ngram = get_ngrams(title_term_list,
                                     title_poss_list,
                                     3,
                                     filter_fn=partial(filter_ngram,
                                                       mode='any'),
                                     included_tags=POS_INCLUDED)

            abs_ngram = get_ngrams(abstract_term_list,
                                   abstract_poss_list,
                                   3,
                                   filter_fn=partial(filter_ngram, mode='any'),
                                   included_tags=POS_INCLUDED)

            # print(article_title)
            # print(title_ngram)
            # print(abs_ngram)

            added_terms_num = 0
            for added_term in title_ngram + abs_ngram:
                title_abs_raw_indexdb.inverted_index.add(
                    added_term, article_title)
                added_terms_num += 1

            title_abs_raw_indexdb.document_length_table.add(
                article_title, added_terms_num)
            # break

        #     content_t_ngram = get_ngrams(title_term_list, title_poss_list, 3,
        #                                  filter_fn=partial(filter_ngram, mode='any'),
        #                                  included_tags=POS_INCLUDED)
        #
        #     content_c_ngram = get_ngrams(abstract_term_list, abstract_poss_list, 3,
        #                                  filter_fn=partial(filter_ngram, mode='any'),
        #                                  included_tags=POS_INCLUDED)
        #
        #     added_terms_num = 0
        #     for added_term in content_t_ngram + content_c_ngram:
        #         content_indexdb.inverted_index.add(added_term, article_title)
        #         added_terms_num += 1
        #
        #     content_indexdb.document_length_table.add(article_title, added_terms_num)
        #
        title_abs_raw_indexdb.save_to_file(abs_file_name)
Code Example #13
def whole_wiki_pages_title_raw_indexing_paragraph_level_unigram_size_limited_memory_saving(
):
    key_separator = '/'
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_p_level_unigram_rindexdb"

    count = 0
    # if limited_terms:
    #     limited_terms_set = load_wiki_abstract_terms(config.PRO_ROOT / "data/processed/wiki_abs_3gram_terms.txt")
    # else:
    #     limited_terms_set = []
    #
    # limited_terms_set = set(limited_terms_set)

    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []

        title_ngram = None

        assert len(article_clean_text) == len(article_poss)

        for p_i, (paragraph_text, paragraph_poss) in enumerate(
                zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []
            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:  # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue  # If the terms are in the title, we don't include them in the abstract and article term lists.
                else:  # p_i != 0
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list,
                                         title_poss_list,
                                         1,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)

            if p_i >= 100:
                break

            paragraph_ngram = get_ngrams(paragraph_term_list,
                                         paragraph_poss_list,
                                         1,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)

            if len(paragraph_ngram) == 0:
                continue

            added_terms_num = 0

            paragraph_key = key_separator.join((article_title, str(p_i)))

            for added_term in title_ngram + paragraph_ngram:
                # if added_term in limited_terms_set:
                #     wiki_p_level_indexdb.inverted_index.add(added_term, paragraph_key)
                #     added_terms_num += 1
                # elif ' ' not in added_term:
                hash_value_added_term = hash(added_term)
                hash_value_paragraph_key = hash(paragraph_key)
                wiki_p_level_indexdb.inverted_index.add(
                    hash_value_added_term, hash_value_paragraph_key)
                added_terms_num += 1
                # else:
                #     pass

            hash_value_paragraph_key = hash(paragraph_key)
            wiki_p_level_indexdb.document_length_table.add(
                hash_value_paragraph_key, added_terms_num)

            count += 1

        # if count >= 1000:
        #     break

    wiki_p_level_indexdb.save_to_file(file_name, memory_saving=True)
Code Example #14
def whole_wiki_pages_title_raw_indexing_article_level_to_indexdb():
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    # wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_a_level_persistent_indexdb.db"
    index_db = IndexingDB(file_name)
    index_db.create_tables()

    count = 0

    term_title_items_buffer_list: List[Tuple[str, str, int]] = []
    title_items_buffer_list: List[Tuple[str, int]] = []

    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
        article_term_title_dict: Dict[Tuple[str, str], int] = dict()
        article_title_dict: Dict[str, int] = dict()

        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []

        title_ngram = None

        article_ngram = []

        assert len(article_clean_text) == len(article_poss)

        for p_i, (paragraph_text, paragraph_poss) in enumerate(
                zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []
            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:  # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue  # If the terms are in the title, we don't include them in the abstract and article term lists.
                else:  # p_i != 0
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list,
                                         title_poss_list,
                                         2,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)
                continue

            paragraph_ngram = get_ngrams(paragraph_term_list,
                                         paragraph_poss_list,
                                         2,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)

            if len(paragraph_ngram) == 0:
                continue

            article_ngram.extend(paragraph_ngram)

            if p_i >= 60:
                break

        added_terms_num = 0

        for added_term in title_ngram + article_ngram:
            article_term_title_dict[(added_term, article_title)] = \
                article_term_title_dict.get((added_term, article_title), 0) + 1
            added_terms_num += 1

        article_title_dict[article_title] = added_terms_num
        count += 1

        if count >= 200:  # NOTE: this early break looks like a leftover debug limit (cf. the commented-out limits in the other builders)
            break

        for (term, article_title), ovalue in article_term_title_dict.items():
            term_title_items_buffer_list.append((term, article_title, ovalue))

        for article_title, ovalue in article_title_dict.items():
            title_items_buffer_list.append((article_title, ovalue))

        if len(term_title_items_buffer_list) >= 1000:  # Flush
            index_db.insert_many_items(term_title_items_buffer_list)
            index_db.insert_many_articles(title_items_buffer_list)
            term_title_items_buffer_list = []
            title_items_buffer_list = []

    index_db.insert_many_items(term_title_items_buffer_list)
    index_db.insert_many_articles(title_items_buffer_list)
    index_db.close()
Code Example #15
def whole_wiki_pages_title_raw_indexing_article_level(limited_terms=True):
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_a_level_limited_gram_rindexdb"

    if limited_terms:
        limited_terms_set = load_wiki_abstract_terms(
            config.PRO_ROOT / "data/processed/wiki_abs_3gram_terms.txt")
    else:
        limited_terms_set = []

    limited_terms_set = set(limited_terms_set)

    count = 0

    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []

        title_ngram = None

        assert len(article_clean_text) == len(article_poss)

        # article_term_list = []
        # article_poss_list = []
        article_ngram = []

        for p_i, (paragraph_text, paragraph_poss) in enumerate(
                zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []
            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:  # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue  # If the terms are in the title, we don't include them in the abstract and article term lists.
                else:  # p_i != 0
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list,
                                         title_poss_list,
                                         1,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)
                continue

            paragraph_ngram = get_ngrams(paragraph_term_list,
                                         paragraph_poss_list,
                                         1,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)
            if len(paragraph_ngram) == 0:
                continue

            article_ngram.extend(paragraph_ngram)

            if p_i >= 80:
                break

        added_terms_num = 0

        for added_term in title_ngram + article_ngram:
            if added_term in limited_terms_set:
                wiki_p_level_indexdb.inverted_index.add(
                    added_term, article_title)
                added_terms_num += 1
            elif ' ' not in added_term:
                wiki_p_level_indexdb.inverted_index.add(
                    added_term, article_title)
                added_terms_num += 1

        wiki_p_level_indexdb.document_length_table.add(article_title,
                                                       added_terms_num)

        count += 1

        # if count >= 5000:
        #     break

    wiki_p_level_indexdb.save_to_file(file_name)
Code Example #16
def init_results_v8(data_list,
                    gt_data_list,
                    terms_based_resutls,
                    g_score_dict,
                    match_filtering_k=3,
                    term_retrieval_top_k=5,
                    multihop_retrieval_top_k=None):
    # 2019-04-06
    # The complete v7 version of retrieval

    ner_set = get_title_entity_set()

    # dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print("Total data length:")
    print(len(data_list))

    # We load term-based results
    print("Load term-based results.")
    terms_based_results_dict = dict()
    for item in terms_based_resutls:
        terms_based_results_dict[item['qid']] = item

    # Load tf-idf_score function:
    # g_score_dict = dict()
    # load_from_file(g_score_dict,
    #                config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by the above functions or is a stopword
        else:
            # matched_key_word is the original matched span. we need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw,
                                         matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)
    #
    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by the above functions or is a stopword
        else:
            if kw in keyword_processor:
                # if the kw already exists in the kw_processor, we update its dict to add more disamb items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(
                    kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[
                            disamb_kw] = 'kwm_disamb'
            else:  # If not we add it to the keyword_processor_disamb, which is set to be lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw,
                                             matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    doc_pred_dict = {'sp_doc': dict(), 'raw_retrieval_set': dict()}
    # doc_pred_dict_p1 = {'sp_doc': dict(), 'raw_retrieval_set': dict()}

    for item in tqdm(data_list):
        question = item['question']
        qid = item['_id']

        query_terms = get_query_ngrams(question)
        valid_query_terms = [
            term for term in query_terms if term in g_score_dict
        ]

        retrieved_set = RetrievedSet()

        # This method will add the keyword match results in-place to retrieved_set.
        get_kw_matching_results(question, valid_query_terms, retrieved_set,
                                match_filtering_k, g_score_dict,
                                keyword_processor, keyword_processor_disamb)

        # Then we add term-based matching results
        added_count = 0
        for score, title in sorted(terms_based_results_dict[qid]['doc_list'],
                                   key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        # Add hyperlinked pages:
        finded_keys_set = set(
            retrieved_set.to_id_list()
        )  # For finding hyperlinked pages we use both keyword matching and the disambiguation group.
        # 3. We then add some hyperlinked titles.
        db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)

        for keyword_group in finded_keys_set:
            flatten_hyperlinks = []
            hyperlinks = wiki_db_tool.get_first_paragraph_hyperlinks(
                db_cursor, keyword_group)
            for hls in hyperlinks:
                flatten_hyperlinks.extend(hls)

            for hl in flatten_hyperlinks:
                potential_title = hl.href
                if potential_title in ner_set and not filter_word(
                        potential_title) and not filter_document_id(
                            potential_title
                        ):  # important bug fixing 'or' to 'and'
                    # hyperlinked_title.append(potential_title)

                    # if not filter_document_id(potential_title):
                    score = get_query_doc_score(valid_query_terms,
                                                potential_title, g_score_dict)
                    retrieved_set.add_item(
                        retrieval_utils.RetrievedItem(potential_title,
                                                      'kwm_disamb_hlinked'))
                    retrieved_set.score_item(potential_title,
                                             score,
                                             namespace=keyword_group +
                                             '-2-hop')

        for keyword_group in finded_keys_set:
            retrieved_set.sort_and_filter(keyword_group + '-2-hop',
                                          top_k=multihop_retrieval_top_k)

        doc_pred_dict['sp_doc'][qid] = retrieved_set.to_id_list()
        doc_pred_dict['raw_retrieval_set'][qid] = retrieved_set

    if gt_data_list is not None:
        ext_hotpot_eval.eval(doc_pred_dict, gt_data_list)
    return doc_pred_dict
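
The commented-out lines inside init_results_v8 hint at how its inputs are obtained; a rough calling sketch follows. The score file path comes from those comments, the term-based results file is a hypothetical placeholder, and passing the dev list as gt_data_list (since the HotpotQA dev file carries gold annotations) is an assumption.

data_list = common.load_json(config.DEV_FULLWIKI_FILE)
g_score_dict = dict()
load_from_file(g_score_dict,
               config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
terms_based_results = common.load_jsonl(terms_based_results_file)  # items with 'qid' and a scored 'doc_list'
doc_pred_dict = init_results_v8(data_list, data_list, terms_based_results,
                                g_score_dict, match_filtering_k=3,
                                term_retrieval_top_k=5,
                                multihop_retrieval_top_k=3)
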
Code Example #17
def inspect_oracle_answer_text(append_head=True):
    dev_list = common.load_json(config.DEV_FULLWIKI_FILE)
    db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)

    total, error = 0, 0

    for item in dev_list:
        qid = item['_id']
        query = item['question']
        answer = item['answer']
        o_contexts = item['context']
        supporting_facts = item['supporting_facts']

        # print(query)
        # print(answer)

        supporting_doc = set([fact[0] for fact in item['supporting_facts']])
        selected_fact = []
        sentid2sent_token_dict = dict()

        for doc in supporting_doc:
            # if doc in gt_doc:
            #     continue
            text_item = wiki_db_tool.get_item_by_key(db_cursor, key=doc)
            context = wiki_db_tool.get_first_paragraph_from_clean_text_item(
                text_item, flatten_to_paragraph=False, skip_first=True)
            for i, sentence_token in enumerate(context):
                # sentence_text = sentence_token
                if len(sentence_token) != 0:
                    selected_fact.append([doc, i])
                    sentid2sent_token_dict[(doc, i)] = sentence_token

        # shuffle doc ordering.
        supporting_doc = list(supporting_doc)
        random.shuffle(supporting_doc)
        # end

        shuffled_supporting_fact_list = []
        supporting_facts = sorted(supporting_facts, key=lambda x: (x[0], x[1]))
        for doc in supporting_doc:
            for fact in supporting_facts:
                if fact[0] == doc:
                    shuffled_supporting_fact_list.append(fact)

        assert len(shuffled_supporting_fact_list) == len(supporting_facts)

        # print(supporting_facts)
        # print(shuffled_supporting_fact_list)
        #
        # print("Sup Fact.")
        cur_doc = None
        context_token_list = []
        for doc, i in shuffled_supporting_fact_list:
            if (doc, i) not in sentid2sent_token_dict:
                print(f"Potential Error: {(doc, i)} not exists in DB.")
                continue
            # print((doc, i), sentid2sent_token_dict[(doc, i)])
            paragraph_token_list = sentid2sent_token_dict[(doc, i)]

            if cur_doc != doc and append_head and i != 0:
                context_token_list = context_token_list + doc.split(' ') + [
                    '.'
                ] + paragraph_token_list
            else:
                context_token_list += paragraph_token_list
            cur_doc = doc  # track the current doc so the title head is prepended only once per document

        # print(context_token_list)
        context_matcher = ContextAnswerMatcher(context_token_list, uncase=True)
        context, answer_start_list = context_matcher.concate_and_return_answer_index(
            answer, match_type='left')

        if len(answer_start_list) > 1:
            error += 1

        if len(answer_start_list) == 0 and answer != 'yes' and answer != 'no':
            print("Error")
            print("Query:", query)
            print("Answer:", answer)
            print("Sp fact:", shuffled_supporting_fact_list)
            print("Context:", context)

            context_matcher = ContextAnswerMatcher(context_token_list,
                                                   uncase=True)
            context, answer_start_list = context_matcher.concate_and_return_answer_index(
                answer)

        # print(sentid2sent_token_dict)

        # for title, number in supporting_facts:
        #     print(title, number)
        total += 1

    print(error, total)