def generate_partial_subindex_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_wiki_pages = filter_documents(all_articles)
    if args.debug:
        filtered_wiki_pages = filtered_wiki_pages[:42]

    subindex = {}
    for _, document in filtered_wiki_pages.iterrows():
        page_id = document['id']
        filtered_tokens = process_normalise_tokenise_filter(document['text'])
        words_counter = Counter(filtered_tokens)
        # Invert word -> doc and add the raw and relative term counts
        for term, raw_count in words_counter.items():
            tf = raw_count if args.variant == 'raw_count' else raw_count / len(
                filtered_tokens)
            idf = words_with_idf.loc[term]['idf']
            tfidf = tf * idf
            subindex.setdefault(term, {'docs': []})['docs'].append(
                (page_id, raw_count, tfidf))

    print('Finished processing batch #{}'.format(batch_id))
    return subindex
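
# Illustrative sketch (not part of the original module): each partial subindex maps
# a term to {'docs': [(page_id, raw_count, tfidf), ...]}, so batch results can be
# merged by concatenating the per-term posting lists. The helper name and the use
# of multiprocessing below are assumptions for illustration only.
def merge_partial_subindexes(partial_subindexes: list) -> dict:
    merged = {}
    for partial in partial_subindexes:
        for term, entry in partial.items():
            merged.setdefault(term, {'docs': []})['docs'].extend(entry['docs'])
    return merged
# e.g. (num_batches is hypothetical):
#   with multiprocessing.Pool() as pool:
#       index = merge_partial_subindexes(
#           pool.map(generate_partial_subindex_for_batch, range(num_batches)))
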
def process_count_batch(batch_id: int) -> Counter:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    combined_tokens = []
    for raw_article in article_texts:
        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        combined_tokens.extend(filtered_tokens)
    return get_word_counts(combined_tokens)
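
# Illustrative sketch (assumption, not from the original code): the per-batch
# Counter objects returned above can be reduced into one collection-wide word
# count by summation; batch_ids is a hypothetical iterable of batch numbers.
def combine_batch_counts(batch_ids) -> Counter:
    total_counts = Counter()
    for batch_id in batch_ids:
        total_counts += process_count_batch(batch_id)
    return total_counts
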
Example #3
def generate_batch_mappings(batch_id: int):
    print('Processing batch #{}...'.format(batch_id))
    partial_result = {}

    batch_file_path = get_wiki_batch_path(batch_id)
    batch_df = read_jsonl_and_map_to_df(batch_file_path, ['id'])
    for line_index, row in batch_df.iterrows():
        page_id = row['id']
        # account for some special cases, like u'Beyonce\u0301' != 'Beyoncé'
        page_id = unicodedata.normalize('NFC', page_id)
        partial_result[page_id] = (batch_id, line_index)

    return partial_result
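
# Illustrative sketch (assumption): the per-batch {page_id: (batch_id, line_index)}
# dictionaries are disjoint across batches, so a global lookup table can be built
# with dict.update(); num_batches is a hypothetical total number of wiki batches.
def merge_batch_mappings(num_batches: int) -> dict:
    page_id_to_location = {}
    for batch_id in range(num_batches):
        page_id_to_location.update(generate_batch_mappings(batch_id))
    return page_id_to_location
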
Example #4
def generate_document_length_mapping_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_articles = filter_documents(all_articles)
    # filtered_articles.set_index('id', drop=False)
    if args.debug:
        filtered_articles = filtered_articles.head(n=3)

    partial_document_length_mappings = {}
    for _, doc_row in filtered_articles.iterrows():
        page_id = doc_row['id']
        filtered_tokens = process_normalise_tokenise_filter(doc_row['text'])
        partial_document_length_mappings[page_id] = len(filtered_tokens)

    return partial_document_length_mappings
def process_generate_df_batch(id: int) -> Counter:
    colour = TERM_COLOURS[id % len(TERM_COLOURS)]
    print(
        colored('Start processing batch #{}'.format(id),
                colour,
                attrs=['bold']))

    start_time = time.time()

    batch_file_path = '{}wiki-{:03}.jsonl'.format(DATA_WIKI_PATH, id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    accumulated_batch_idfs = Counter()

    for index, raw_article in enumerate(article_texts):
        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        # use set to prevent multiple occurrences of word in doc
        words_set = set(filtered_tokens)

        if index % 5000 == 0:
            print(
                colored(
                    'Processing document [{} / {}] of batch #{}...'.format(
                        index, len(article_texts), id), colour))

        # each distinct word in the set contributes a document-frequency count of one
        words_in_doc = Counter(words_set)
        accumulated_batch_idfs += words_in_doc

    print(
        colored('Finished processing batch #{} after {:.2f} seconds'.format(
            id,
            time.time() - start_time),
                colour,
                attrs=['bold']))
    return accumulated_batch_idfs


def get_words_with_idf(words_with_df: list) -> list:
    result = []
    for word_count in words_with_df:
        word = word_count[0]
        df = word_count[1]
        idf = math.log10(COLLECTION_DOCUMENTS_NUMBER / df)
        result.append((word, idf))
    return result
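
# Worked example for the formula above, with illustrative numbers that are not
# taken from the actual collection: a word appearing in 1,000 of 1,000,000
# documents gets idf = log10(1_000_000 / 1_000) = 3.0, so rarer words weigh more.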


def export_result(result: list):
    write_list_to_jsonl(GENERATED_IDF_PATH, result)


if __name__ == '__main__':
    start_time = time.time()

    words_with_df = generate_df_all()
    print(
        colored('Counted frequencies of {:,} words'.format(len(words_with_df)),
                attrs=['bold']))

    words_with_idf = get_words_with_idf(words_with_df)
    print('Added inverse document frequencies')

    print('Top 10 extract: {}'.format(words_with_idf[0:10]))
    print('Finished processing after {:.2f} seconds'.format(time.time() -
                                                            start_time))
    export_result(words_with_idf)

    # Vocabulary size should match the word frequency counts from task #1
    vocabulary = read_jsonl_and_map_to_df(GENERATED_COUNTS_PATH)[0]
    assert len(vocabulary) == len(words_with_idf)
Example #7
def count_documents_batch(batch_id: int) -> int:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    return len(filtered_articles)
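
# Illustrative sketch (assumption): summing the per-batch document counts gives the
# collection size that appears as COLLECTION_DOCUMENTS_NUMBER in the IDF formula;
# num_batches is a hypothetical total number of wiki batches.
def count_all_documents(num_batches: int) -> int:
    return sum(count_documents_batch(batch_id) for batch_id in range(num_batches))
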
import math

import matplotlib.pyplot as plt
from scipy import stats

from dataaccess.files_constants import GENERATED_COUNTS_PATH
from dataaccess.files_io import read_jsonl_and_map_to_df
from util.plots import show_plot_and_save_figure, prepare_seaborn_plots

if __name__ == '__main__':
    counts = read_jsonl_and_map_to_df(GENERATED_COUNTS_PATH)

    x_ranks = range(1, len(counts[1]) + 1)
    y_counts = list(counts[1])
    x_ranks_log = [math.log10(rank) for rank in x_ranks]
    y_counts_log = [math.log10(count) for count in y_counts]

    slope, intercept, r_value, p_value, std_err = stats.linregress(
        x_ranks_log, y_counts_log)
    r_squared = r_value**2
    print('slope: {}; intercept: {}; r-squared: {}, p: {}'.format(
        slope, intercept, r_squared, p_value))

    prepare_seaborn_plots()
    plt.plot(x_ranks_log, y_counts_log, 'o')
    plt.plot(x_ranks_log, [intercept + slope * rank for rank in x_ranks_log],
             'red')

    plt.xlabel('log(rank)')
    plt.ylabel('log(frequency)')
    plt.figtext(0.2, 0.45, 'R$^2$ = {:.5f}'.format(r_squared))
import pandas as pd

from dataaccess.files_constants import DATA_TRAINING_PATH, DATA_DEV_LABELED_PATH, DATA_TEST_UNLABELED_PATH
from dataaccess.files_io import read_jsonl_and_map_to_df
from documentretrieval.data_constants import CLAIMS_COLUMNS_LABELED, CLAIMS_COLUMNS_UNLABELED

claims_training = read_jsonl_and_map_to_df(
    DATA_TRAINING_PATH, CLAIMS_COLUMNS_LABELED).set_index('id', drop=False)
claims_dev = read_jsonl_and_map_to_df(
    DATA_DEV_LABELED_PATH, CLAIMS_COLUMNS_LABELED).set_index('id', drop=False)
claims_test = read_jsonl_and_map_to_df(
    DATA_TEST_UNLABELED_PATH, CLAIMS_COLUMNS_UNLABELED).set_index('id',
                                                                  drop=False)


def get_claim(id: int, dataset: str = 'train') -> str:
    return get_corresponding_dataset(dataset).loc[id]['claim']


def get_claim_row(id: int, dataset: str = 'train') -> pd.Series:
    return get_corresponding_dataset(dataset).loc[id]


def get_all_claims(dataset: str = 'train') -> pd.DataFrame:
    return get_corresponding_dataset(dataset)


def claim_is_verifiable(claim_id: int, dataset: str = 'train') -> bool:
    return get_corresponding_dataset(
        dataset).loc[claim_id]['verifiable'] == 'VERIFIABLE'
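
# Assumed helper, not shown in this excerpt: the accessors above rely on a
# get_corresponding_dataset() function that selects one of the three DataFrames
# loaded at the top of this module. A minimal sketch, assuming 'train'/'dev'/'test'
# as the dataset keys:
def get_corresponding_dataset(dataset: str = 'train') -> pd.DataFrame:
    if dataset == 'train':
        return claims_training
    elif dataset == 'dev':
        return claims_dev
    else:
        return claims_test
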
from dataaccess.files_constants import GENERATED_DOCUMENT_LENGTH_MAPPING
from dataaccess.files_io import read_jsonl_and_map_to_df

doc_length_mapping = read_jsonl_and_map_to_df(GENERATED_DOCUMENT_LENGTH_MAPPING, ['page_id', 'length']).set_index('page_id', drop=False)

def get_length_of_doc(page_id: str) -> int:
    return doc_length_mapping.loc[page_id]['length']
Example #11
from dataaccess.files_constants import GENERATED_DOCUMENT_NORMS_MAPPING
from dataaccess.files_io import read_jsonl_and_map_to_df

docs_norms = read_jsonl_and_map_to_df(GENERATED_DOCUMENT_NORMS_MAPPING,
                                      ['doc', 'norm']).set_index('doc',
                                                                 drop=False)


def get_norm_for_doc_text(page_id: str) -> float:
    return docs_norms.loc[page_id]['norm']
import pandas as pd

from dataaccess.files_constants import GENERATED_COUNTS_PATH
from documentretrieval.data_constants import COLLECTION_TOTAL_WORDS
from dataaccess.files_io import read_jsonl_and_map_to_df

terms_with_occurrences = read_jsonl_and_map_to_df(GENERATED_COUNTS_PATH, ['term', 'occurrences']).set_index('term', drop=False)


def get_collection_occurrences_of_term(term: str) -> int:
    return terms_with_occurrences.loc[term]['occurrences']


def get_collection_probability_for_term(term: str) -> float:
    occurrences = get_collection_occurrences_of_term(term)
    return occurrences / COLLECTION_TOTAL_WORDS


def get_terms_with_occurrences_mapping() -> pd.DataFrame:
    return terms_with_occurrences
Example #13
from dataaccess.files_constants import GENERATED_IDF_PATH
from dataaccess.files_io import read_jsonl_and_map_to_df

words_with_idf = read_jsonl_and_map_to_df(
    GENERATED_IDF_PATH, ['word', 'idf']).set_index('word', drop=False)


def get_idf_for_term(term: str) -> float:
    try:
        return words_with_idf.loc[term]['idf']
    except KeyError:
        # this can happen for tokens from doc titles, as the IDF values are only generated for doc text
        return 0.0
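
# Illustrative usage (assumption, mirroring the tf * idf computation earlier in
# this listing): combine a relative term frequency from a tokenised document with
# the looked-up IDF to obtain a TF-IDF weight. The helper name is hypothetical.
def get_tfidf_for_term_in_doc(term: str, filtered_tokens: list) -> float:
    tf = filtered_tokens.count(term) / len(filtered_tokens)
    return tf * get_idf_for_term(term)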