def generate_partial_subindex_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_wiki_pages = filter_documents(all_articles)
    if args.debug:
        filtered_wiki_pages = filtered_wiki_pages[:42]

    subindex = {}
    for _, raw_document in filtered_wiki_pages.iterrows():
        page_id = raw_document['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_document['text'])
        words_counter = Counter(filtered_tokens)
        # Invert word -> doc and record the raw count and TF-IDF weight
        for term, raw_count in words_counter.items():
            tf = (raw_count if args.variant == 'raw_count'
                  else raw_count / len(filtered_tokens))
            idf = words_with_idf.loc[term]['idf']
            tfidf = tf * idf
            subindex.setdefault(term, {'docs': []})['docs'].append(
                (page_id, raw_count, tfidf))

    print('Finished processing batch #{}'.format(batch_id))
    return subindex
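
The returned subindex maps each term to a partial posting list of (page_id, raw_count, tfidf) tuples. A minimal sketch of how the per-batch subindexes could be merged into one inverted index; the helper name and the batch iteration are assumptions for illustration, not part of the original code:

def merge_subindexes(partial_subindexes) -> dict:
    # Concatenate the per-batch posting lists term by term
    full_index = {}
    for subindex in partial_subindexes:
        for term, entry in subindex.items():
            full_index.setdefault(term, {'docs': []})['docs'].extend(entry['docs'])
    return full_index

# e.g.: full_index = merge_subindexes(
#     generate_partial_subindex_for_batch(b) for b in range(num_batches))
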
def process_count_batch(batch_id: int) -> Counter:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    combined_tokens = []
    for raw_article in article_texts:
        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        combined_tokens.extend(filtered_tokens)
    return get_word_counts(combined_tokens)
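
Each batch yields word counts as a collections.Counter, and counters add element-wise, so the corpus-wide counts can be accumulated directly. A small sketch, where num_batches is a placeholder and not defined in the original code:

from collections import Counter

corpus_counts = Counter()
for batch_id in range(num_batches):
    corpus_counts.update(process_count_batch(batch_id))
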
Example 3
def generate_batch_mappings(batch_id: int):
    print('Processing batch #{}...'.format(batch_id))
    partial_result = {}

    batch_file_path = get_wiki_batch_path(batch_id)
    batch_df = read_jsonl_and_map_to_df(batch_file_path, ['id'])
    for line_index, row in batch_df.iterrows():
        page_id = row['id']
        # account for some special cases, like u'Beyonce\u0301' != 'Beyoncé'
        page_id = unicodedata.normalize('NFC', page_id)
        partial_result[page_id] = (batch_id, line_index)

    return partial_result
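
Per batch, this returns a dict from normalised page id to its (batch_id, line_index) location. Collecting all batches into one lookup dict could look like the sketch below; the multiprocessing.Pool driver and variable names are assumptions:

from multiprocessing import Pool

page_id_to_location = {}
with Pool() as pool:
    for partial in pool.imap_unordered(generate_batch_mappings, range(num_batches)):
        page_id_to_location.update(partial)
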
Example 4
def generate_document_length_mapping_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_articles = filter_documents(all_articles)
    if args.debug:
        filtered_articles = filtered_articles.head(n=3)

    partial_document_length_mappings = {}
    for _, raw_doc_row in filtered_articles.iterrows():
        page_id = raw_doc_row['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_doc_row['text'])
        partial_document_length_mappings[page_id] = len(filtered_tokens)

    return partial_document_length_mappings
Example 5
def retrieve_wiki_page(page_id: str) -> WikiDocument:
    page_id = page_id.strip()
    # account for some special cases, like u'Beyonce\u0301' != 'Beyoncé'
    page_id = unicodedata.normalize('NFC', page_id)

    # Find correct batch file and read only relevant line
    batch_id, line = wiki_page_mapping.loc[page_id].values
    wiki_batch_path = get_wiki_batch_path(batch_id)

    with open(wiki_batch_path) as fp:
        for i, json_line in enumerate(fp):
            if i == line:
                return WikiDocument(json_line)

    # If this code runs, a mapping error occurred
    print(colored('Error: Line {} not found in wiki batch {}'.format(line, batch_id), 'red'))
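
retrieve_wiki_page relies on a global wiki_page_mapping whose .loc[page_id] yields the (batch_id, line) pair produced by generate_batch_mappings above. One way that mapping could be built and used, sketched under the assumption that the merged page_id_to_location dict from the earlier example is available:

import pandas as pd

# Index by page id so that wiki_page_mapping.loc[page_id].values -> [batch_id, line]
wiki_page_mapping = pd.DataFrame.from_dict(
    page_id_to_location, orient='index', columns=['batch_id', 'line'])

document = retrieve_wiki_page('Beyoncé')  # the page id here is only an example
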
Example 6
def count_documents_batch(batch_id: int) -> int:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    return len(filtered_articles)