def search(search_phrase, invert_ref) -> list:
    document_results = list()
    for token in ctoken.tokenize(search_phrase):
        token_dict = defaultdict(dict)
        for docID in invert_ref[token].keys():
            token_dict[token][docID] = {
                "bold_bool": invert_ref[token][docID]["bold_bool"],
                "count": len(invert_ref[token][docID]["positions"]),
                # "tfidf": gen_tfidf(word=token, docID=docID, invert_ref=invert_ref),
            }
        document_results.append(token_dict)
    return rank(document_results)
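# Hypothetical usage sketch (not part of the original source). The shape of
# `invert_ref` is an assumption inferred from the lookups in search() above:
# token -> docID -> {"bold_bool": bool, "positions": [int, ...]}.
def _example_search_usage():
    example_invert_ref = {
        "anteater": {
            "0": {"bold_bool": True, "positions": [4, 17, 42]},
            "3": {"bold_bool": False, "positions": [8]},
        },
    }
    return search("anteater", example_invert_ref)  # ranked via rank()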
def make_fragment_index():
    """
    Creates a fragmented index in the form of a few .txt files plus a json for termIDs
    and a json for docIDs.

    :return:
        Creates docid.map, a json file that is a list of urls where the index is the docID.
        Creates termID.map, a json file with a list of terms, where the termID is the index.
        Creates a few 'bucket' txt files, where the filename refers to the last added termID.
        Each bucket file contains alphabetical rows of "term~{document:[positions]}".
        These are partial datasets that must be merged in the next step.
    """
    if Bard:
        print("================================================")
        print(f"LIBRARY SIZE: {utils.get_size_directory(DATA_DIR) / 1000000}mb")
        print(f"LIBRARY SIZE: {FILE_COUNT} files")
        print(f"INDEX BUFFER SIZE: {INDEX_BUFFER / 1000000}mb")
        print(f"BUFFER FRACTION: {INDEX_BUFFER / utils.get_size_directory(DATA_DIR)}")
        print("================================================")

    print("<--------------MAKING FRAGMENTED INDEX-----------------> 1/2")
    document_store = list()
    inverted_index_bucket = defaultdict(
        lambda: defaultdict(list))  # {token:{docID:[positions]}}
    supplemental_info = defaultdict(
        lambda: defaultdict(list))  # {docID:{links:[], bolds:[]}}
    docID_hashes = defaultdict(int)
    tokenIDs = set()
    docID = 0
    doc_count = utils.count_docs(DATA_DIR)

    # BARD IN!
    if Bard:
        Bard.start("WALKING DATA")

    for subdir, dirs, files in os.walk(DATA_DIR):
        for file in files:
            if ".json" in file:
                # If bucket is at-size, dump bucket
                if sys.getsizeof(inverted_index_bucket) > INDEX_BUFFER:
                    print("\n<---------WRITING BUCKET--------->")
                    write_bucket(inverted_index_bucket, INDEX_DIR, docID)
                    inverted_index_bucket = defaultdict(
                        lambda: defaultdict(list))  # {token:{docID:[positions]}}

                # Open url-database file
                with open(os.path.join(subdir, file), "r") as f:
                    jsonfile = json.load(f)

                document_store.append(jsonfile["url"])
                soup = BeautifulSoup(jsonfile["content"], features="lxml")
                tokens = ctoken.tokenize(soup.text)

                # GENERATE TOKEN HASH
                if not USE_SIMHASH:
                    docID_hashes[docID] = hashlib.md5(
                        soup.text.encode('utf-8')).hexdigest()

                # EXTRACT BOLDS
                for bold in soup.find_all(
                        ["b", "strong", "h1", "h2", "h3", "title"]):
                    supplemental_info[docID][BOLDS_KEY].extend(
                        ctoken.tokenize(bold.text))

                # EXTRACT LINKS
                # [(link, anchor_text)]
                links = list()
                for link in BeautifulSoup(
                        jsonfile["content"],
                        features="lxml",
                        parse_only=SoupStrainer("a")).find_all("a"):
                    try:
                        if link.has_attr("href") and validators.url(link["href"]):
                            links.append((link["href"], ctoken.tokenize(link.text)))
                    except KeyError:
                        pass
                supplemental_info[docID][LINKS_KEY] = [link[0] for link in links]

                # TOKENIZE!
                position = 0

                # INDEX ANCHOR TEXT
                for link in links:
                    if link[1]:
                        for token in link[1]:
                            inverted_index_bucket[token][str(docID)].append(position)
                            position += 1
                            tokenIDs.add(token)

                # add rest of tokens
                for token in tokens:
                    inverted_index_bucket[token][str(docID)].append(position)
                    position += 1
                    tokenIDs.add(token)

                # BARD BARDING!
                percent_progress = "{:.5f}".format(docID / doc_count)
                percent_progress_human = "{:.5f}".format((docID / doc_count) * 100)
                if Bard:  # and percent_progress % 0.000001 == 0:
                    log = f"{percent_progress_human}% bucket size:{sys.getsizeof(inverted_index_bucket) / 1000000}mb " \
                          f"{subdir}/{file} tokens:{len(tokens)} bolds:{len(supplemental_info[docID][BOLDS_KEY])} " \
                          f"links:{len(links)}"
                    Bard.update(log, replace=True)

                # INCREMENTING DOCID COUNTER
                docID += 1

    print("<---------WRITING FINAL BUCKET--------->")
    write_bucket(inverted_index_bucket, INDEX_DIR, docID)
    inverted_index_bucket.clear()

    print("<---WRITING DOCUMENT STORE------->")
    with open(docID_store_file_filename, "w") as f:
        json.dump(document_store, f)

    print("<-----WRITING TERM ID STORE------>")
    sorted_tokenIDs = sorted(list(tokenIDs))
    with open(termID_map_filename, "w") as f:
        json.dump(sorted_tokenIDs, f)

    print("<-----WRITING SUPPLEMENTAL LINK/BOLD STORE------>")
    with open(supplemental_info_filename, "w") as f:
        json.dump(supplemental_info, f)

    print("<-----WRITING HASH STORE------>")
    with open(docID_hash_filename, "w") as f:
        json.dump(docID_hashes, f)

    print("<-----WRITING INVERTED DOCID MAP------>")
    with open(docID_store_file_filename, "r") as f:
        inverted_docID_map = page_duplicate_util.gen_inverted_docID_map(json.load(f))
    with open(invert_docID_filename, "w") as f:
        json.dump(inverted_docID_map, f)

    print("<-----WRITING INVERTED BOLDS MAP------>")
    utils.make_invert_bolds_term_docID(supplemental_info_filename,
                                       inverted_bolds_filename)

    print("<-----WRITING PAGERANK STORE------>")
    utils.make_pagerank_lib(supplemental_info_filename,
                            docID_store_file_filename, pagerank_filename)

    # BARD OUT!
    if Bard:
        Bard.end()
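# Hypothetical helper (not part of the original source): a sketch of how one row
# of a bucket file could be parsed back, assuming the "term~{docID: [positions]}"
# row format described in the docstring above, with Python-style single quotes in
# the posting dict (the search path later swaps quotes before calling orjson).
def _parse_bucket_row_sketch(row: str):
    import orjson  # local import so the sketch stays self-contained
    term, _, posting_str = row.rstrip("\n").partition("~")
    posting = orjson.loads(posting_str.replace("'", "\""))  # {docID: [positions]}
    return term, posting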
print("Loading maps of meaning...") # add maps back to memory with open(token_seek_map_filename, "r") as f: from_file_token_map = json.load(f) with open(docID_store_file_filename, "r") as f: docID_store = json.load(f) with open(docID_hash_filename, "r") as f: docID_hash = json.load(f) with open(supplemental_info_filename, "r") as f: bolds_links_store = json.load(f) with open(corpus_token_frequency_filename, "r") as f: corpus_token_frequency = json.load(f) with open(invert_docID_filename, "r") as f: invert_docID_map = json.load(f) stopwords = [ctoken.tokenize(i)[0] for i in stopwords.words('english')] sorted_corpus = [(term, count) for term, count in sorted( corpus_token_frequency.items(), key=lambda x: x[1], reverse=True) if term not in stopwords] duplicate_docIDs = page_duplicate_util.find_duplicates(docID_hash) docID_links = { docID_store[int(docID)]: bolds_links_store[docID]["links"] for docID in bolds_links_store.keys() if bolds_links_store[docID]["links"] } sorted_pagerank = sorted(cpagerank.pagerank(docID_links, invert_docID_map).items(), key=lambda x: x[1], reverse=True) PRINT_TERMS = 10
def search_multiple(search_queries_ref, token_map_ref, docid_store_ref,
                    findex_dict, duplicate_docIDS, bold_links_map,
                    sorted_pagerank_ref, inverted_bolds):
    ret_results = defaultdict(tuple)
    for query_phrase in search_queries_ref:
        time_search_start = time.perf_counter()
        term_docID_positions = defaultdict(
            lambda: defaultdict(list))  # {term:{docID:[positions]}}
        union = set()  # {docID}

        # QUERY
        tokenized_query = ctoken.tokenize(
            query_phrase, trim_stopwords=False)  # [query_term]
        unique_query_term_freqs = defaultdict(int)  # {term:freq}

        # QUERY TFIDF
        query_weighted_tfidf_dict = defaultdict(float)  # {term:float}
        query_normalized_weighted_tfidf_dict = defaultdict(float)  # {term:float}
        term_idf = defaultdict(float)  # {term:weighted_idf}

        # DOCUMENT TFIDF
        document_normalized_weighted_tf_dict = defaultdict(
            lambda: defaultdict(float))  # {docID:{term:float}}
        final_doc_scores = defaultdict(float)  # {docID:float}

        all_stopwords = False

        # FREQ'ING QUERY
        # if the query is longer than three tokens, stop words are removed
        if len(tokenized_query) > 3:
            tokenized_query = [
                token for token in tokenized_query
                if token not in set(stopwords.words('english'))
            ]
        unique_keys = set(tokenized_query)
        for toke in tokenized_query:
            unique_query_term_freqs[toke] += 1

        # ACCESSING DOCIDs AND POSITIONS
        # accounts for posting information stored in multiple files
        for toke in unique_keys:
            # posting list(s)
            try:
                for file, seek in token_map_ref[toke].items():
                    f = findex_dict[file]
                    for seek_pos in seek:
                        f.seek(seek_pos)  # send cursor to position
                        posting = orjson.loads(
                            f.readline().replace("'", "\""))  # {docID:[positions]}
                        for docID in posting.keys():
                            # trim off duplicate pages
                            if (docID not in duplicate_docIDS
                                    and docID not in term_docID_positions[toke]):
                                term_docID_positions[toke][docID].extend(posting[docID])
                                union.add(docID)
            except KeyError:
                # remove terms that do not exist in the corpus
                del unique_query_term_freqs[toke]
                continue

        file_time2 = time.perf_counter()
        # print("filetime:", (file_time2 - file_time1) * 1000)

        # QUERY TFIDF VECTOR
        for term in unique_query_term_freqs.keys():
            # weighted term frequency == 1 + log(term frequency in query)
            weighted_tf = 1 + math.log(unique_query_term_freqs[term])
            # weighted idf == log(number of documents / number of documents containing term)
            weighted_idf = math.log(
                len(docid_store_ref) / len(term_docID_positions[term]))
            query_tfidf = weighted_tf * weighted_idf
            query_weighted_tfidf_dict[term] = query_tfidf
            term_idf[term] = weighted_idf
        query_n_term = normalization_term(
            query_weighted_tfidf_dict.values())  # length of the query vector
        for term in query_weighted_tfidf_dict.keys():
            query_normalized_weighted_tfidf_dict[term] = \
                query_weighted_tfidf_dict[term] / query_n_term

        # BOLDS EXTRACTION
        bolds_docID = set()
        for term in [
                query for query in query_weighted_tfidf_dict.keys()
                if query not in set(stopwords.words('english'))
        ]:
            try:
                bolds_docID.update(inverted_bolds[term])
            except (IndexError, KeyError):
                pass

        # DOCUMENT PRUNING--------------------------------------------------
        # boolean AND documents
        # boolean AND + bolds
        # boolean AND + bolds + iter(union each set of docIDs for term in reverse idf order)
        # still none? Nuclear option -> union with all docs containing any term
        datasets = []
        MINIMUM_ITERS = 1000
        MAXIMUM_ITERS = 2000
        try:
            searchdocs = set.intersection(*[
                set(term_docID_positions[term].keys())
                for term in term_docID_positions.keys()
            ])
            datasets.append(f"boolean AND{len(searchdocs)}")
        except TypeError:
            searchdocs = set()

        predicted_iterations = len(searchdocs) * len(
            query_normalized_weighted_tfidf_dict.keys())

        while predicted_iterations < MINIMUM_ITERS:
            if all_stopwords:
                break
            counter = 0
            for term, idf in sorted(term_idf.items(),
                                    key=lambda x: x[1],
                                    reverse=True):
                searchdocs = searchdocs.union(term_docID_positions[term].keys())
                datasets.append(f"union-termdocs{counter}-{term}-{len(searchdocs)}")
                counter += 1
                predicted_iterations = len(searchdocs) * len(
                    query_normalized_weighted_tfidf_dict.keys())
                if predicted_iterations > MINIMUM_ITERS:
                    break
            if predicted_iterations > MINIMUM_ITERS:
                break
            searchdocs = searchdocs.union(bolds_docID)
            datasets.append(f"union-bolds{len(searchdocs)}")
            predicted_iterations = len(searchdocs) * len(
                query_normalized_weighted_tfidf_dict.keys())
            if predicted_iterations > MINIMUM_ITERS:
                break
            searchdocs = searchdocs.union(union)
            datasets.append(f"union-union{len(searchdocs)}")
            predicted_iterations = len(searchdocs) * len(
                query_normalized_weighted_tfidf_dict.keys())
            break

        while predicted_iterations > MAXIMUM_ITERS:
            # do some filtering, that's a lot of results
            searchdocs = searchdocs.intersection(bolds_docID)
            datasets.append(f"intersect-bolds{len(searchdocs)}")
            predicted_iterations = len(searchdocs) * len(
                query_normalized_weighted_tfidf_dict.keys())
            if predicted_iterations < MAXIMUM_ITERS:
                break
            searchdocs = searchdocs.intersection(union)
            datasets.append(f"intersect-union{len(searchdocs)}")
            break

        datasets.append(
            f"iterations:{len(searchdocs) * len(query_normalized_weighted_tfidf_dict.keys())}"
        )

        # DOCUMENT TF-IDF SCORING ---------------------------------
        # COMPUTE DOCUMENT TF'S
        # term frequency in document, list of score values() for all terms
        for docID in searchdocs:
            for term in unique_query_term_freqs.keys():
                # weighted document tf == 1 + log(term frequency in document)
                # document_normalized_weighted_tf_dict[docID].values() == "document vector" or term_scores
                if len(term_docID_positions[term][docID]) == 0:
                    document_normalized_weighted_tf_dict[docID][term] = \
                        1 + math.log(1)
                else:
                    document_normalized_weighted_tf_dict[docID][term] = \
                        1 + math.log(len(term_docID_positions[term][docID]))

            # NORMALIZE DOCUMENT TF
            # the normalization term is computed once per document, before scaling,
            # mirroring the query-vector normalization above
            n_term = normalization_term(
                document_normalized_weighted_tf_dict[docID].values())
            for term in unique_query_term_freqs.keys():
                document_normalized_weighted_tf_dict[docID][term] = \
                    document_normalized_weighted_tf_dict[docID][term] / n_term

        # COMPUTE FINAL TERM-DOCUMENT RELEVANCE SCORE-----------------------
        # sum(for all terms: query_term_score * document_normalized_weighted_tf)
        non_tfidf_weighting_factor = 1
        for docID in document_normalized_weighted_tf_dict.keys():
            sum_list = list()
            for term in document_normalized_weighted_tf_dict[docID].keys():
                doc_score = query_normalized_weighted_tfidf_dict[term] * \
                    document_normalized_weighted_tf_dict[docID][term]
                if doc_score == 0:
                    # it's an irrelevant document
                    continue
                if len(document_normalized_weighted_tf_dict[docID].keys()) == 1:
                    # the query was a single word
                    non_tfidf_weighting_factor = 10
                if BOLDS_WEIGHTING and docID in bolds_docID:
                    doc_score += 0.0001 * non_tfidf_weighting_factor
                    # print("we struck BOLD!")
                if PAGERANK_WEIGHTING:
                    try:
                        page_rank = sorted_pagerank_ref[int(docID)][1]
                        if page_rank > 0.001:
                            doc_score += page_rank * non_tfidf_weighting_factor
                    except IndexError:
                        pass
                sum_list.append(doc_score)
            final_doc_scores[docid_store_ref[int(docID)]] = sum(sum_list)

        # SORT AND SLICE RESULTS ----------------------------
        search_results = sorted(final_doc_scores.items(),
                                key=lambda x: x[1],
                                reverse=True)[0:SEARCH_RESULTS]
        time_taken = time.perf_counter() - time_search_start
        # print(tokenized_query, "results", "\n", len(final_doc_scores), "datasets", datasets)
        if union:
            ret_results[query_phrase] = (search_results, time_taken)
        else:
            ret_results[query_phrase] = ([], time_taken)

    return ret_results
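# Minimal worked sketch of the scoring scheme used in search_multiple() (not part
# of the original source). Assumptions: normalization_term() behaves like the
# Euclidean length sqrt(sum(x^2)) of a weight vector, and the query/document
# weights follow the 1 + log(tf) and log(N / df) formulas visible above.
def _tfidf_scoring_sketch():
    import math
    N = 1000                                   # assumed corpus size
    query_tf = {"machine": 1, "learning": 1}   # term frequencies in the query
    df = {"machine": 50, "learning": 20}       # documents containing each term
    doc_tf = {"machine": 3, "learning": 7}     # term frequencies in one document

    # query vector: (1 + log tf) * log(N / df), then length-normalize
    q = {t: (1 + math.log(f)) * math.log(N / df[t]) for t, f in query_tf.items()}
    q_len = math.sqrt(sum(w * w for w in q.values()))
    q = {t: w / q_len for t, w in q.items()}

    # document vector: 1 + log tf, then length-normalize
    d = {t: 1 + math.log(f) for t, f in doc_tf.items()}
    d_len = math.sqrt(sum(w * w for w in d.values()))
    d = {t: w / d_len for t, w in d.items()}

    # relevance score: dot product of the two normalized vectors
    return sum(q[t] * d[t] for t in q)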