Code example #1
def generate_partial_subindex_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_wiki_pages = filter_documents(all_articles)
    if args.debug:
        filtered_wiki_pages = filtered_wiki_pages[:42]

    subindex = {}
    for _, raw_document in filtered_wiki_pages.iterrows():
        page_id = raw_document['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_document['text'])
        words_counter = Counter(filtered_tokens)
        # Invert word -> doc and add raw and relative term count
        for term, raw_count in words_counter.items():
            tf = raw_count if args.variant == 'raw_count' else raw_count / len(filtered_tokens)
            idf = words_with_idf.loc[term]['idf']
            tfidf = tf * idf
            subindex.setdefault(term, {'docs': []})['docs'].append(
                (page_id, raw_count, tfidf))

    print('Finished processing batch #{}'.format(batch_id))
    return subindex
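
The function above only builds a partial subindex for a single batch; the merge step that combines the batches is not included in this excerpt. As a minimal sketch of how the per-batch results could be folded into one inverted index (merge_partial_subindexes is a hypothetical helper, not taken from the source):

def merge_partial_subindexes(partial_subindexes: list) -> dict:
    # Hypothetical helper: combines the per-batch dicts produced by
    # generate_partial_subindex_for_batch into a single inverted index.
    full_index = {}
    for partial in partial_subindexes:
        for term, entry in partial.items():
            full_index.setdefault(term, {'docs': []})['docs'].extend(entry['docs'])
    return full_index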
Code example #2
def process_count_batch(batch_id: int) -> Counter:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    combined_tokens = []
    for raw_article in article_texts:
        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        combined_tokens.extend(filtered_tokens)
    return get_word_counts(combined_tokens)
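
process_count_batch covers a single batch file, so the per-batch Counters presumably still need to be summed into corpus-wide word counts. A minimal sketch of such a driver, assuming the batches are numbered 0..batch_count-1 (the parallel setup shown here is an assumption, not taken from the source):

from collections import Counter
from multiprocessing import Pool

def count_words_across_batches(batch_count: int) -> Counter:
    # Run process_count_batch for every batch in parallel and sum the Counters.
    with Pool() as pool:
        batch_counters = pool.map(process_count_batch, range(batch_count))
    return sum(batch_counters, Counter())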
Code example #3
def generate_document_length_mapping_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_articles = filter_documents(all_articles)
    if args.debug:
        filtered_articles = filtered_articles.head(n=3)

    partial_document_length_mappings = {}
    for _, raw_doc_row in filtered_articles.iterrows():
        page_id = raw_doc_row['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_doc_row['text'])
        partial_document_length_mappings[page_id] = len(filtered_tokens)

    return partial_document_length_mappings
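
The per-batch length mappings would then be merged into one dict, from which per-document lengths (needed by the query-likelihood scoring in code example #4) and corpus statistics such as the average document length can be looked up. Both helpers below are hypothetical sketches, not part of the source:

def merge_document_length_mappings(partial_mappings: list) -> dict:
    # Combine the per-batch {page_id: token_count} dicts into a single mapping.
    document_lengths = {}
    for partial in partial_mappings:
        document_lengths.update(partial)
    return document_lengths

def get_average_document_length(document_lengths: dict) -> float:
    return sum(document_lengths.values()) / len(document_lengths)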
Code example #4
def retrieve_documents_for_claim(claim: str, claim_id: int):
    print(
        colored('Retrieving documents for claim [{}]: "{}"'.format(
            claim_id, claim),
                attrs=['bold']))
    preprocessed_claim = preprocess_claim_text(claim)
    claim_terms = process_normalise_tokenise_filter(preprocessed_claim)

    # only consider docs that appear in the index for at least one claim term
    doc_candidates = get_candidate_documents_for_claim(claim_terms,
                                                       mode='raw_count')

    scoring_function = get_query_likelihood_score_no_smoothing
    if args.smoothing == 'laplace':
        scoring_function = get_query_likelihood_score_laplace_smoothing
    if args.smoothing == 'laplace_lindstone':
        scoring_function = get_query_likelihood_score_laplace_lindstone_smoothing
    if args.smoothing == 'jelinek_mercer':
        scoring_function = get_query_likelihood_score_jelinek_mercer_smoothing
    if args.smoothing == 'dirichlet':
        scoring_function = get_query_likelihood_score_dirichlet_smoothing

    # query likelihood scores for each claim-doc combination
    docs_with_query_likelihood_scores = [
        scoring_function(claim_terms, doc_with_terms)
        for doc_with_terms in doc_candidates.items()
    ]

    # zero scores make the ranking effectively random, so optionally drop them
    # and return fewer (or no) results instead
    if args.remove_zero_likelihood:
        docs_with_query_likelihood_scores = list(
            filter(lambda x: x[1] != 0, docs_with_query_likelihood_scores))

    # sort by query likelihood and limit to top results
    docs_with_query_likelihood_scores.sort(key=itemgetter(1), reverse=True)
    result_docs = docs_with_query_likelihood_scores[:DOCS_TO_RETRIEVE_PER_CLAIM]

    result_directory = '{}{}/'.format(RETRIEVED_PROBABILISTIC_DIRECTORY,
                                      args.smoothing or 'no_smoothing')
    display_or_store_result(claim, claim_id, result_docs, result_directory,
                            args.print)
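
The smoothing-specific scoring functions themselves are not shown in this excerpt. As a rough illustration only, a query-likelihood score with Laplace (add-one) smoothing could look like the sketch below; it assumes doc_with_terms is a (page_id, {term: raw_count}) pair and takes the document length and vocabulary size as extra parameters, which the real functions presumably obtain elsewhere:

import math

def query_likelihood_laplace_sketch(claim_terms: list, doc_with_terms: tuple,
                                    document_length: int, vocabulary_size: int) -> tuple:
    # Sketch only: log P(claim | doc) under an add-one smoothed unigram model.
    page_id, term_counts = doc_with_terms
    log_likelihood = 0.0
    for term in claim_terms:
        term_count = term_counts.get(term, 0)
        log_likelihood += math.log((term_count + 1) / (document_length + vocabulary_size))
    return page_id, log_likelihood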
Code example #5
def retrieve_documents_for_claim(claim: str, claim_id: int):
    print(colored('Retrieving documents for claim [{}]: "{}"'.format(claim_id, claim), attrs=['bold']))
    preprocessed_claim = preprocess_claim_text(claim)
    claim_terms = process_normalise_tokenise_filter(preprocessed_claim)
    claim_vector = get_tfidf_vector_for_claim(claim_terms)
    claim_norm = get_tfidf_vector_norm(claim_terms, args.variant)

    # only consider docs that appear in the index for at least one claim term
    doc_candidates = get_candidate_documents_for_claim(claim_terms)

    # similarity scores for each claim-doc combination
    docs_with_similarity_scores = [
        scoring_function(claim_terms, claim_vector, claim_norm, doc_with_terms)
        for doc_with_terms in doc_candidates.items()
    ]

    # sort by similarity and limit to top results
    docs_with_similarity_scores.sort(key=itemgetter(1), reverse=True)
    result_docs = docs_with_similarity_scores[:DOCS_TO_RETRIEVE_PER_CLAIM]

    display_or_store_result(claim, claim_id, result_docs, RETRIEVED_TFIDF_DIRECTORY, args.print)
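
Here scoring_function is assumed to be selected at module level (analogous to the smoothing selection in code example #4) and is not shown. A hedged sketch of a cosine-similarity scorer matching the call signature above, assuming claim_vector maps term -> tf-idf weight and doc_with_terms is a (page_id, {term: tfidf_weight}) pair:

import math

def get_cosine_similarity_sketch(claim_terms: list, claim_vector: dict,
                                 claim_norm: float, doc_with_terms: tuple) -> tuple:
    # Sketch only: cosine similarity between the claim and document tf-idf vectors.
    # claim_terms is kept only to mirror the call signature used above.
    page_id, doc_vector = doc_with_terms
    dot_product = sum(claim_vector.get(term, 0.0) * weight
                      for term, weight in doc_vector.items())
    doc_norm = math.sqrt(sum(weight ** 2 for weight in doc_vector.values()))
    if claim_norm == 0 or doc_norm == 0:
        return page_id, 0.0
    return page_id, dot_product / (claim_norm * doc_norm)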
Code example #6
def process_generate_df_batch(batch_id: int) -> Counter:
    colour = TERM_COLOURS[batch_id % len(TERM_COLOURS)]
    print(
        colored('Start processing batch #{}'.format(batch_id),
                colour,
                attrs=['bold']))

    start_time = time.time()

    batch_file_path = '{}wiki-{:03}.jsonl'.format(DATA_WIKI_PATH, batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    # document frequency per word: number of documents in this batch that
    # contain the word at least once (later used to derive IDF values)
    accumulated_batch_dfs = Counter()

    for index, raw_article in enumerate(article_texts):
        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        # use set to prevent multiple occurrences of word in doc
        words_set = set(filtered_tokens)

        if index % 5000 == 0:
            print(
                colored(
                    'Processing document [{} / {}] of batch #{}...'.format(
                        index, len(article_texts), batch_id), colour))

        # count for included words will be one
        words_in_doc = Counter(words_set)
        accumulated_batch_dfs += words_in_doc

    print(
        colored('Finished processing batch #{} after {:.2f} seconds'.format(
            batch_id,
            time.time() - start_time),
                colour,
                attrs=['bold']))
    return accumulated_batch_dfs
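
The per-batch document frequencies still need to be converted into IDF values; that step is not part of this excerpt. A hypothetical sketch, assuming the common idf = log(N / df) formula and the words_with_idf DataFrame shape consumed in code example #1:

import math
from collections import Counter
import pandas as pd

def document_frequencies_to_idf(batch_document_frequencies: list,
                                total_document_count: int) -> pd.DataFrame:
    # Sum the per-batch Counters and derive an 'idf' column indexed by term.
    document_frequencies = sum(batch_document_frequencies, Counter())
    idf_values = {term: math.log(total_document_count / df)
                  for term, df in document_frequencies.items()}
    return pd.DataFrame.from_dict(idf_values, orient='index', columns=['idf'])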
Code example #7
def preprocess_doc_title(page_id: str) -> list:
    # page ids are article titles with underscores in place of spaces
    doc_title_preprocessed = add_padding_around_punctuation(page_id).replace('_', ' ')
    return process_normalise_tokenise_filter(doc_title_preprocessed)