def generate_partial_subindex_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_wiki_pages = filter_documents(all_articles)
    if args.debug:
        filtered_wiki_pages = filtered_wiki_pages[:42]

    subindex = {}
    for _, raw_document in filtered_wiki_pages.iterrows():
        page_id = raw_document['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_document['text'])
        words_counter = Counter(filtered_tokens)

        # Invert word -> doc and add raw and relative term count
        for term, raw_count in words_counter.items():
            tf = raw_count if args.variant == 'raw_count' else raw_count / len(filtered_tokens)
            idf = words_with_idf.loc[term]['idf']
            tfidf = tf * idf
            subindex.setdefault(term, {'docs': []})['docs'].append((page_id, raw_count, tfidf))

    print('Finished processing batch #{}'.format(batch_id))
    return subindex

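# Hedged sketch (not part of the original module): the partial subindexes
# returned per batch can be merged into a single inverted index, e.g. after
# fanning the batches out over a multiprocessing.Pool. The merge only relies
# on the {'docs': [(page_id, raw_count, tfidf), ...]} shape built above; the
# function name is an illustrative assumption.
def merge_partial_subindexes(partial_subindexes: list) -> dict:
    merged_index = {}
    for partial in partial_subindexes:
        for term, entry in partial.items():
            merged_index.setdefault(term, {'docs': []})['docs'].extend(entry['docs'])
    return merged_index
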
def process_count_batch(batch_id: int) -> Counter:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    combined_tokens = []
    for raw_article in article_texts:
        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        combined_tokens.extend(filtered_tokens)
    return get_word_counts(combined_tokens)

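# Hedged sketch (assumed driver code, not from this file): the per-batch
# Counters returned above can simply be summed into corpus-wide term counts.
# The use of multiprocessing.Pool and the helper name are illustrative
# assumptions about how the batches are fanned out.
from multiprocessing import Pool

def count_words_across_batches(batch_ids: list) -> Counter:
    with Pool() as pool:
        batch_counters = pool.map(process_count_batch, batch_ids)
    total_counts = Counter()
    for batch_counter in batch_counters:
        total_counts += batch_counter
    return total_counts
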
def generate_document_length_mapping_for_batch(batch_id: int) -> dict:
    batch_file_path = get_wiki_batch_path(batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['id', 'text'])
    filtered_articles = filter_documents(all_articles)
    if args.debug:
        filtered_articles = filtered_articles.head(n=3)

    partial_document_length_mappings = {}
    for _, raw_doc_row in filtered_articles.iterrows():
        page_id = raw_doc_row['id']
        filtered_tokens = process_normalise_tokenise_filter(raw_doc_row['text'])
        partial_document_length_mappings[page_id] = len(filtered_tokens)
    return partial_document_length_mappings

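# Hedged sketch (assumed driver code): the per-batch length mappings can be
# merged with dict.update, which is safe as long as page ids are unique
# across batches. The merged mapping, and the average document length derived
# from it, are what length-sensitive smoothing variants typically consume.
def merge_document_length_mappings(partial_mappings: list) -> dict:
    document_lengths = {}
    for partial_mapping in partial_mappings:
        document_lengths.update(partial_mapping)
    return document_lengths

# e.g.: average_doc_length = sum(lengths.values()) / len(lengths)
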
def retrieve_documents_for_claim(claim: str, claim_id: int):
    print(colored('Retrieving documents for claim [{}]: "{}"'.format(claim_id, claim),
                  attrs=['bold']))
    preprocessed_claim = preprocess_claim_text(claim)
    claim_terms = process_normalise_tokenise_filter(preprocessed_claim)

    # only consider docs that appear in the index for at least one claim term
    doc_candidates = get_candidate_documents_for_claim(claim_terms, mode='raw_count')

    scoring_functions = {
        'laplace': get_query_likelihood_score_laplace_smoothing,
        'laplace_lindstone': get_query_likelihood_score_laplace_lindstone_smoothing,
        'jelinek_mercer': get_query_likelihood_score_jelinek_mercer_smoothing,
        'dirichlet': get_query_likelihood_score_dirichlet_smoothing,
    }
    scoring_function = scoring_functions.get(args.smoothing,
                                             get_query_likelihood_score_no_smoothing)

    # query likelihood scores for each claim-doc combination
    docs_with_query_likelihood_scores = [
        scoring_function(claim_terms, doc_with_terms)
        for doc_with_terms in doc_candidates.items()
    ]

    # if all documents score zero, retrieval becomes arbitrary, so optionally
    # drop zero-likelihood documents and show no results instead
    if args.remove_zero_likelihood:
        docs_with_query_likelihood_scores = [
            doc for doc in docs_with_query_likelihood_scores if doc[1] != 0
        ]

    # sort by query likelihood and limit to top results
    docs_with_query_likelihood_scores.sort(key=itemgetter(1), reverse=True)
    result_docs = docs_with_query_likelihood_scores[:DOCS_TO_RETRIEVE_PER_CLAIM]

    result_directory = '{}{}/'.format(RETRIEVED_PROBABILISTIC_DIRECTORY,
                                      args.smoothing or 'no_smoothing')
    display_or_store_result(claim, claim_id, result_docs, result_directory, args.print)

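# Hedged sketch of the Laplace-smoothed variant selected above, shown for
# illustration only. It assumes doc_with_terms is a (page_id, {term: raw_count})
# pair and that document_lengths and vocabulary_size exist at module level
# (both are assumptions, not taken from this file). The formula is the
# standard add-one unigram query likelihood:
#   P(q | d) = prod_t (tf(t, d) + 1) / (|d| + |V|)
def get_query_likelihood_score_laplace_smoothing_sketch(claim_terms, doc_with_terms):
    page_id, term_counts = doc_with_terms
    doc_length = document_lengths[page_id]  # assumed module-level mapping
    score = 1.0
    for term in claim_terms:
        score *= (term_counts.get(term, 0) + 1) / (doc_length + vocabulary_size)
    return page_id, score
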
def retrieve_documents_for_claim(claim: str, claim_id: int):
    print(colored('Retrieving documents for claim [{}]: "{}"'.format(claim_id, claim),
                  attrs=['bold']))
    preprocessed_claim = preprocess_claim_text(claim)
    claim_terms = process_normalise_tokenise_filter(preprocessed_claim)
    claim_vector = get_tfidf_vector_for_claim(claim_terms)
    claim_norm = get_tfidf_vector_norm(claim_terms, args.variant)

    # only consider docs that appear in the index for at least one claim term
    doc_candidates = get_candidate_documents_for_claim(claim_terms)

    # similarity scores for each claim-doc combination
    docs_with_similarity_scores = [
        scoring_function(claim_terms, claim_vector, claim_norm, doc_with_terms)
        for doc_with_terms in doc_candidates.items()
    ]

    # sort by similarity and limit to top results
    docs_with_similarity_scores.sort(key=itemgetter(1), reverse=True)
    result_docs = docs_with_similarity_scores[:DOCS_TO_RETRIEVE_PER_CLAIM]
    display_or_store_result(claim, claim_id, result_docs, RETRIEVED_TFIDF_DIRECTORY, args.print)

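# Hedged sketch of a cosine-similarity scoring_function compatible with the
# call above. It assumes doc_with_terms is a (page_id, {term: tfidf}) pair,
# that claim_vector maps claim terms to tf-idf weights, and that per-document
# vector norms are available via a document_norms mapping (an assumption for
# illustration, not taken from this file).
def get_cosine_similarity_score_sketch(claim_terms, claim_vector, claim_norm, doc_with_terms):
    page_id, doc_vector = doc_with_terms
    dot_product = sum(claim_vector.get(term, 0.0) * doc_vector.get(term, 0.0)
                      for term in claim_terms)
    doc_norm = document_norms[page_id]  # assumed module-level mapping
    if claim_norm == 0 or doc_norm == 0:
        return page_id, 0.0
    return page_id, dot_product / (claim_norm * doc_norm)
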
def process_generate_df_batch(batch_id: int) -> Counter:
    colour = TERM_COLOURS[batch_id % len(TERM_COLOURS)]
    print(colored('Start processing batch #{}'.format(batch_id), colour, attrs=['bold']))
    start_time = time.time()

    batch_file_path = '{}wiki-{:03}.jsonl'.format(DATA_WIKI_PATH, batch_id)
    all_articles = read_jsonl_and_map_to_df(batch_file_path, ['text'])
    filtered_articles = filter_documents(all_articles)
    article_texts = reduce_document_to_text_column(filtered_articles)

    accumulated_batch_document_frequencies = Counter()
    for index, raw_article in enumerate(article_texts):
        if index % 5000 == 0:
            print(colored('Processing document [{} / {}] of batch #{}...'.format(
                index, len(article_texts), batch_id), colour))

        filtered_tokens = process_normalise_tokenise_filter(raw_article)
        # use a set so each word counts at most once per document
        words_in_doc = Counter(set(filtered_tokens))
        accumulated_batch_document_frequencies += words_in_doc

    print(colored('Finished processing batch #{} after {:.2f} seconds'.format(
        batch_id, time.time() - start_time), colour, attrs=['bold']))
    return accumulated_batch_document_frequencies

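# Hedged sketch (assumed downstream step): the accumulated document
# frequencies can be converted into the words_with_idf lookup consulted in
# generate_partial_subindex_for_batch, using the standard idf(t) = log(N / df(t)).
# The total document count N comes from elsewhere in the pipeline; the helper
# name is an illustrative assumption.
import math
import pandas as pd

def document_frequencies_to_idf(document_frequencies: Counter,
                                total_docs: int) -> pd.DataFrame:
    rows = [(term, math.log(total_docs / df))
            for term, df in document_frequencies.items()]
    return pd.DataFrame(rows, columns=['word', 'idf']).set_index('word')
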
def preprocess_doc_title(page_id: str) -> list:
    doc_title_preprocessed = add_padding_around_punctuation(page_id).replace('_', ' ')
    return process_normalise_tokenise_filter(doc_title_preprocessed)

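# Hedged usage example: FEVER-style page ids use underscores and punctuation,
# e.g. (illustrative value, not from this module):
#   preprocess_doc_title('Alexander_Graham_Bell')
#   -> ['alexander', 'graham', 'bell']  (exact output depends on the filter rules)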