Example #1
def toy_init_results():
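    # Retrieve the top-50 TF-IDF ranked documents for every HotpotQA fullwiki
    # dev question and save the results to a JSONL file.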
    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print(len(dev_fullwiki_list))

    # Load rindex file
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(abs_rindexdb.score_db['default-tf-idf'],
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # Load rindex finished

    saved_items = []
    for item in tqdm(dev_fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = get_top_ranked_tf_idf_doc(question, abs_rindexdb, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list

        saved_items.append(saved_tfidf_item)

    common.save_jsonl(saved_items, config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
Example #2
def sanity_check():
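    # Sanity check: load the prebuilt abstract reverse index and its TF-IDF
    # scores, then run document retrieval for a single sample question.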
    # Run pre_compute_abs_if_idf_scores() first to generate the score file
    # loaded below.
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT /
                                "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(
        abs_rindexdb.score_db['default-tf-idf'], config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    # Alternatively, precompute the scores here and save them for reuse:
    # abs_rindexdb.pre_compute_scores()
    # save_to_file(abs_rindexdb.score_db['default-tf-idf'],
    #              config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

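    # A sample HotpotQA fullwiki question; its gold page is 'Animorphs'.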
    query = "What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?"
    tokens = [t.text for t in nlp(query)]
    query_ngrams = get_ngrams(tokens,
                              None,
                              3,
                              filter_fn=partial(filter_ngram, mode='any'),
                              included_tags=None)

    # print(query_ngram)
    candidate_pages_set = set()
    valid_terms = []
    for q_ngram in query_ngrams:
        candidate_pages = abs_rindexdb.inverted_index.get_containing_document(
            q_ngram)
        if candidate_pages is not None:
            valid_terms.append(q_ngram)
            candidate_pages_set |= candidate_pages

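    # Spot-check that the gold page 'Animorphs' is among the candidates.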
    print('Animorphs' in candidate_pages_set)
    print(abs_rindexdb.get_relevant_document(['Animorphs'], valid_terms))
    doc_list = abs_rindexdb.get_relevant_document(candidate_pages_set,
                                                  valid_terms,
                                                  top_k=100)

    # print(candidate_pages_set)
    print(query_ngrams)
    print(len(candidate_pages_set))
    print(doc_list)
Example #3
def pre_compute_abs_if_idf_scores():
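    # Precompute TF-IDF scores for the abstract reverse index and save them
    # to disk so later runs can load them instead of recomputing.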
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT /
                                "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()

    abs_rindexdb.pre_compute_scores()
    save_to_file(
        abs_rindexdb.score_db['default-tf-idf'], config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
Example #4
def compute_abs_score(db_path,
                      score_path="scored_db/default-tf-idf.score.txt",
                      with_int_type=False,
                      memory_efficient=False,
                      iteratively=False):
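    # Load an IndexDB from db_path, build its Nt table, then precompute
    # TF-IDF scores either fully in memory or iteratively streamed to disk.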
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(db_path,
                                with_int_type,
                                memory_saving=memory_efficient)
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()

    score_file = Path(db_path) / score_path
    score_file.parent.mkdir(parents=True, exist_ok=True)

    if not iteratively:
        abs_rindexdb.pre_compute_scores()
        save_to_file(abs_rindexdb.score_db['default-tf-idf'],
                     score_file,
                     memory_efficient=memory_efficient)
    else:
        abs_rindexdb.pre_compute_scores_iteratively(score_file)
def whole_wiki_pages_title_raw_indexing_article_level(limited_terms=True):
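    # Build an article-level reverse index: unigrams from each article's
    # title and body paragraphs are mapped to the article title.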
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_a_level_limited_gram_rindexdb"

    if limited_terms:
        limited_terms_set = load_wiki_abstract_terms(
            config.PRO_ROOT / "data/processed/wiki_abs_3gram_terms.txt")
    else:
        limited_terms_set = []

    limited_terms_set = set(limited_terms_set)

    count = 0

    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
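        # Each row holds a JSON record with the article title, tokenized
        # paragraphs ('clean_text'), and their POS tags ('poss').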
        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []

        title_ngram = None

        assert len(article_clean_text) == len(article_poss)

        # article_term_list = []
        # article_poss_list = []
        article_ngram = []

        for p_i, (paragraph_text, paragraph_poss) in enumerate(
                zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []
            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:  # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue  # Terms in the title are not counted again in the article terms.
                else:  # p_i != 0
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list,
                                         title_poss_list,
                                         1,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)
                continue

            paragraph_ngram = get_ngrams(paragraph_term_list,
                                         paragraph_poss_list,
                                         1,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)
            if len(paragraph_ngram) == 0:
                continue

            article_ngram.extend(paragraph_ngram)

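            # Stop indexing after the first 80 body paragraphs of an article.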
            if p_i >= 80:
                break

        added_terms_num = 0

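        # Keep a term if it is in the whitelist of abstract terms, or
        # unconditionally if it is a unigram (contains no space).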
        for added_term in (title_ngram or []) + article_ngram:  # guard: title_ngram may be None for empty articles
            if added_term in limited_terms_set:
                wiki_p_level_indexdb.inverted_index.add(
                    added_term, article_title)
                added_terms_num += 1
            elif ' ' not in added_term:
                wiki_p_level_indexdb.inverted_index.add(
                    added_term, article_title)
                added_terms_num += 1

        wiki_p_level_indexdb.document_length_table.add(article_title,
                                                       added_terms_num)

        count += 1

        # if count >= 5000:
        #     break

    wiki_p_level_indexdb.save_to_file(file_name)
def whole_wiki_pages_title_raw_indexing_paragraph_level_unigram_size_limited_memory_saving(
):
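    # Build a paragraph-level unigram reverse index over Wikipedia; terms and
    # paragraph keys are hashed to ints so the index fits in memory.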
    key_separator = '/'
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_p_level_unigram_rindexdb"

    count = 0

    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []

        title_ngram = None

        assert len(article_clean_text) == len(article_poss)

        for p_i, (paragraph_text, paragraph_poss) in enumerate(
                zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []
            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:  # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue  # Terms in the title are not counted again in the paragraph terms.
                else:  # p_i != 0
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list,
                                         title_poss_list,
                                         1,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)

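            # Index at most the first 100 paragraphs of each article.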
            if p_i >= 100:
                break

            paragraph_ngram = get_ngrams(paragraph_term_list,
                                         paragraph_poss_list,
                                         1,
                                         filter_fn=partial(filter_ngram,
                                                           mode='any'),
                                         included_tags=POS_INCLUDED)

            if len(paragraph_ngram) == 0:
                continue

            added_terms_num = 0

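            # Key each paragraph as "<article_title>/<paragraph_index>".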
            paragraph_key = key_separator.join((article_title, str(p_i)))

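            # Both the term and the paragraph key are hashed to ints to keep
            # the in-memory index small (memory-saving mode stores ints).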
            for added_term in title_ngram + paragraph_ngram:
                wiki_p_level_indexdb.inverted_index.add(
                    hash(added_term), hash(paragraph_key))
                added_terms_num += 1

            wiki_p_level_indexdb.document_length_table.add(
                hash(paragraph_key), added_terms_num)

            count += 1

        # if count >= 1000:
        #     break

    wiki_p_level_indexdb.save_to_file(file_name, memory_saving=True)
def whole_wiki_pages_title_raw_indexing():
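    # Build the title+abstract reverse index: up-to-3-grams from each
    # article's title and abstract paragraph are mapped to the article title.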
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    title_abs_raw_indexdb = IndexDB()
    abs_file_name = config.PDATA_ROOT / "reverse_indexing/abs_rindexdb"

    with SqliteDict(str(config.WHOLE_WIKI_DB),
                    flag='r',
                    encode=json.dumps,
                    decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
            valid_page = True
            item = json.loads(value)
            # print(item)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])

            if abs_index == -1:
                valid_page = False
                # This page has no valid abstract; only its title terms
                # will be indexed below.

            article_term_list = []
            article_poss_list = []

            title_term_list = []
            title_poss_list = []

            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)

            for p_i, (paragraph_text, paragraph_poss) in enumerate(
                    zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text,
                                                paragraph_poss):
                    if p_i == 0:  # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue  # Terms in the title are not counted again in the abstract and article terms.
                    else:
                        if p_i == abs_index:  # If the terms are in abstract
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)

                        article_term_list.extend(sent_text)
                        article_poss_list.extend(sent_poss)

            # print("Title:", title_term_list, title_poss_list)

            title_ngram = get_ngrams(title_term_list,
                                     title_poss_list,
                                     3,
                                     filter_fn=partial(filter_ngram,
                                                       mode='any'),
                                     included_tags=POS_INCLUDED)

            abs_ngram = get_ngrams(abstract_term_list,
                                   abstract_poss_list,
                                   3,
                                   filter_fn=partial(filter_ngram, mode='any'),
                                   included_tags=POS_INCLUDED)

            # print(article_title)
            # print(title_ngram)
            # print(abs_ngram)

            added_terms_num = 0
            for added_term in title_ngram + abs_ngram:
                title_abs_raw_indexdb.inverted_index.add(
                    added_term, article_title)
                added_terms_num += 1

            title_abs_raw_indexdb.document_length_table.add(
                article_title, added_terms_num)
            # break

        title_abs_raw_indexdb.save_to_file(abs_file_name)