def reset(self):
    global cache_dictionary
    global final_dictionary
    global documents_dictionary
    cache_dictionary = None
    final_dictionary = None
    documents_dictionary = None
    Indexer.reset()
    Writer.reset()
    Stemmer.reset()
    Reader.reset()
def handle_files(file_list_ref, documents_dictionary_ref):
    terms_dictionary = {}
    if stem_mode:
        # Take each document's text from the list, then parse and stem it
        for value in file_list_ref:
            doc_id = value[0]
            file_name = value[2]
            after_stemming = Stemmer.stemWithCache(Parser.start(value[1]))
            # Update the document parameters and merge this document's terms
            # into the dictionary for the whole part
            __update_and_merge_dictionaries(doc_id, file_name, terms_dictionary,
                                            documents_dictionary_ref, after_stemming)
    else:
        # Take each document's text from the list and only parse it
        for value in file_list_ref:
            doc_id = value[0]
            file_name = value[2]
            after_parse = Parser.start(value[1])
            # Update the document parameters and merge this document's terms
            # into the dictionary for the whole part
            __update_and_merge_dictionaries(doc_id, file_name, terms_dictionary,
                                            documents_dictionary_ref, after_parse)
    # Create a new temp posting file for this part
    Indexer.create_temp_posting_file(terms_dictionary)
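# A minimal, self-contained sketch (not the original __update_and_merge_dictionaries)
# of how per-document term counts could be merged into a part-level dictionary.
# The posting layout {term: {doc_id: freq}} is an assumption for illustration only.
def _merge_doc_terms_sketch(doc_id, doc_terms, part_terms):
    for term, freq in doc_terms.items():
        postings = part_terms.setdefault(term, {})
        postings[doc_id] = postings.get(doc_id, 0) + freq
    return part_terms

# Example: two documents merged into one part-level dictionary.
# _part = {}
# _merge_doc_terms_sketch("d1", {"vote": 2, "party": 1}, _part)
# _merge_doc_terms_sketch("d2", {"vote": 1}, _part)
# _part == {"vote": {"d1": 2, "d2": 1}, "party": {"d1": 1}}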
def results_relevant_documents_to_one_query(self, query):
    print("results_relevant_documents_to_one_query")
    if self.final_dic:
        dictionary_parser = Parse.parse_text(query)
        dictionary_stemm = Stemmer.stemming(dictionary_parser, self.stemm_mode)
        final_dictionary_query = {}
        for term, details in dictionary_stemm.items():
            freq = details[0]
            final_dictionary_query[term] = freq
        result_rank = self.ranker.rank(self.stemm_mode, self.all_document,
                                       self.final_dic, self.path_folder,
                                       final_dictionary_query, self.semantic_mode)
    else:
        return []
    print("finish")
    self.ranker.reset_rank()
    return list(result_rank.keys())
def __parse_stem(self, text):
    if self.stem_mode:
        # Stem mode is on: parse and then stem the text
        after_dictionary = Stemmer.stemWithCache(Parser.start(text))
    else:
        # Stem mode is off: only parse the text
        after_dictionary = Parser.start(text)
    return after_dictionary
def results_relevant_documents(self, queries_dictionary):
    print("results_relevant_documents")
    results = []
    if self.final_dic:
        for number, value in queries_dictionary.items():
            # Dictionary of all terms in this query
            query_dictionary_description = {}
            query_title = value[0]
            query_description = value[1]
            query_narrative = value[2]
            # Title
            parse_title = Parse.parse_text(query_title)
            stemm_title = Stemmer.stemming(parse_title, self.stemm_mode)
            # Narrative: break it into sentences and remove the irrelevant ones
            query_narrative = Parse.parse_query_narrative(query_narrative)
            # Description + narrative
            query_description_narrative = query_narrative + ' ' + query_description
            parse_description_narrative = Parse.parse_text(query_description_narrative)
            stemm_description_narrative = Stemmer.stemming(parse_description_narrative,
                                                           self.stemm_mode)
            # Normalize the number of occurrences of each term
            for term, details in stemm_description_narrative.items():
                freq = details[0]
                if term in self.final_dic:
                    idf = self.final_dic[term][1]
                    # New frequency, weighted with the idf
                    query_dictionary_description[term] = (self.weight_idf * idf) + (self.weight_df * freq)
            number_of_term_in_query = len(query_dictionary_description)
            normalized_number_of_results = number_of_term_in_query / self.denominator
            normalized_number_of_results = int(normalized_number_of_results)
            # Dictionary with the normalized_number_of_results most common terms
            query_dictionary_description_most_common = dict(
                Counter(query_dictionary_description).most_common(normalized_number_of_results))
            query_dictionary_description.clear()
            # Build the final query dictionary from the title terms
            final_dictionary_query = {}
            for term, details in stemm_title.items():
                freq = details[0]
                final_dictionary_query[term] = freq
            # Merge query_dictionary_description_most_common into final_dictionary_query
            for term, val in query_dictionary_description_most_common.items():
                if term in final_dictionary_query:
                    final_dictionary_query[term] = final_dictionary_query[term] + val
                else:
                    final_dictionary_query[term] = val
            # Send the number of occurrences of each word in a document to the ranker.
            # The ranker returns a dictionary of the form [term] = [(d1, tf1), (d2, tf2), ...]
            result_rank = self.ranker.rank(self.stemm_mode, self.all_document,
                                           self.final_dic, self.path_folder,
                                           final_dictionary_query, self.semantic_mode)
            # [query1, {term1: [(d1, tf1), (d2, tf2), ...], term2: [(d1, tf1), (d2, tf2), ...]}]
            results.append((number, list(result_rank.keys())))
    print("finish")
    self.ranker.reset_rank()
    return results
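# A minimal, self-contained sketch of the query-term weighting above, with
# weight_idf, weight_df and denominator values assumed for illustration: each
# description/narrative term is scored weight_idf * idf + weight_df * tf, and only
# the top len(scored) / denominator terms are kept via Counter.most_common.
from collections import Counter

def _weight_query_terms_sketch(term_freqs, idf_lookup,
                               weight_idf=0.7, weight_df=0.3, denominator=3):
    scored = {term: weight_idf * idf_lookup[term] + weight_df * freq
              for term, freq in term_freqs.items() if term in idf_lookup}
    keep = int(len(scored) / denominator)
    return dict(Counter(scored).most_common(keep))

# Example:
# _weight_query_terms_sketch({"tax": 3, "law": 1, "dog": 2},
#                            {"tax": 2.1, "law": 0.4, "dog": 1.5})
# keeps only the single highest-scored term, since int(3 / 3) == 1.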
def stem_vocabulary(self):
    stemmer = Stemmer()
    for key in self.vocabulary:
        stemmer._stem(key)
    stemmer.save_stemmed_vocabulary(self.path)
def run(self):
    global cache_dictionary
    global final_dictionary
    global documents_dictionary
    start_time = time.time()
    cache_dictionary = {}
    final_dictionary = {}
    documents_dictionary = {}
    # Create a list with all of the directory paths in the corpus.
    # Pop the first entry to remove the corpus root path itself.
    sub_dirs = [x[0] for x in os.walk(corpus_path)]
    sub_dirs.pop(0)
    files_list = []  # Collects the documents of the current part
    file_index = 1  # Index of the current file
    iterate_over_parts = 1  # Index of the current part
    next_part = int(fileNum / parts) * iterate_over_parts  # Last file index of the first part
    if thread_mode == 'on':
        # Use a thread pool with the number of threads taken from the config file
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads)
        for subdir in sub_dirs:
            textList = Reader.separate(subdir)
            files_list.extend(textList)
            if file_index == next_part:
                executor.submit(handle_files, files_list, documents_dictionary)
                files_list = []  # Clear the files list
                if not iterate_over_parts + 1 == parts:
                    iterate_over_parts += 1
                    # Update the last file index of the next part
                    next_part = (int(fileNum / parts) * iterate_over_parts)
            if file_index == fileNum:  # The last index of the last part
                executor.submit(handle_files, files_list, documents_dictionary)
                break  # Stop so we do not iterate past the end of the corpus
            file_index += 1
        # Shut down the thread pool, waiting until all threads finish
        executor.shutdown(wait=True)
    else:
        for subdir in sub_dirs:
            textList = Reader.separate(subdir)
            files_list.extend(textList)
            if file_index == next_part:
                handle_files(files_list, documents_dictionary)
                files_list = []  # Clear the files list
                if not iterate_over_parts + 1 == parts:
                    iterate_over_parts += 1
                    # Update the last file index of the next part
                    next_part = (int(fileNum / parts) * iterate_over_parts)
            if file_index == fileNum:  # The last index of the last part
                handle_files(files_list, documents_dictionary)
                break  # Stop so we do not iterate past the end of the corpus
            file_index += 1
    sub_dirs = None
    files_list = None
    Stemmer.clean_cache()
    # Merge the temp files and remove them
    final_dictionary, cache_dictionary, posting_file_size = Indexer.merge_files(documents_dictionary)
    end_time = time.time()
    total_time = end_time - start_time
    # Stemmer.write_cache()
    print("Number of documents: " + str(len(documents_dictionary)))
    print("Number of terms: " + str(len(final_dictionary)))
    print("Time: " + str("{:.2f}".format(total_time)) + " seconds")
    print("Time: " + str("{:.2f}".format(total_time / 60)) + " minutes")
    final_dictionary_file_size = sys.getsizeof(final_dictionary)
    cache_file_size = sys.getsizeof(cache_dictionary)
    print("Posting file size: " + str(posting_file_size) + " Bytes")
    print("Dictionary file size: " + str(final_dictionary_file_size) + " Bytes")
    print("Cache file size: " + str(cache_file_size) + " Bytes")
    Writer.remove_temp_file()
    # Announce to the GUI that indexing has concluded.
    global stem_mode
    self.view.finished_indexing(str(len(documents_dictionary)), str(final_dictionary_file_size),
                                str(cache_file_size), str(int(total_time)),
                                str(len(final_dictionary)), str(posting_file_size), stem_mode)
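# A minimal, self-contained sketch of the batching arithmetic used in run() above:
# the corpus of fileNum files is split into `parts` batches, and a batch is flushed
# to handle_files whenever file_index reaches the next boundary
# int(fileNum / parts) * iterate_over_parts, with the remainder flushed at fileNum.
def _part_boundaries_sketch(file_num, parts):
    boundaries = [int(file_num / parts) * i for i in range(1, parts)]
    boundaries.append(file_num)  # The last batch absorbs the remainder
    return boundaries

# Example: 10 files split into 3 parts are flushed after files 3, 6 and 10.
# _part_boundaries_sketch(10, 3) == [3, 6, 10]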
def find(self, query, stem_mode):
    total_value = 0
    for x in range(0, 5):  # This value is used for the weights
        total_value += x
    sum_of_df = 0
    wiki_wiki = wikipediaapi.Wikipedia('en')  # The language in which we search for the term
    page_py = wiki_wiki.page(query)  # Look up the query page
    query_dictionary = {}  # The dictionary we will return to the user
    if page_py.exists():
        line = page_py.summary  # Collect the summary of the wiki page
        if len(line) < 300:
            # If wiki did not return a specific term, fall back to the page sections
            line = print_sections(page_py.sections)
        if stem_mode:
            stop_set = {'disambigu'}  # Popular words we want to avoid
            query_after = Stemmer.stemWithCache(Parser.start(query))
            terms_dictionary = Stemmer.stemWithCache(Parser.start(line))
        else:
            stop_set = {'Disambiguation'}  # Popular words we want to avoid
            query_after = Parser.start(query)
            terms_dictionary = Parser.start(line)
        concept = {}
        links = page_py.links  # Collect the links from the wiki page
        for title in sorted(links.keys()):
            if stem_mode:
                term = Stemmer.stemWithCache(Parser.start(links[title].title))
            else:
                term = Parser.start(links[title].title)
            # For each term in the summary dictionary, check the values
            for t, value in term.items():
                if links[title].ns == 0 and t in terms_dictionary and \
                        t not in query_after and t not in stop_set:
                    if t not in concept:
                        concept[t] = value
                    else:
                        concept[t] += value  # Add the value (the df) to the dictionary
        # Keep only the most common query results
        query_dictionary = dict(Counter(concept).most_common(number_of_results))
        for term, value in query_dictionary.items():
            sum_of_df += value
        for term, value in query_dictionary.items():
            positive_value = int(total_value * value / sum_of_df) + 1
            if positive_value == 0:
                positive_value = 1
            query_dictionary[term] = positive_value
        if len(query_after) != 0:
            query = list(query_after.keys())[0]
        else:
            print("Invalid query")
        query_dictionary[query] = number_of_results
    return query_dictionary
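# A minimal, self-contained sketch of the weighting at the end of find() above:
# each expansion term's df is rescaled to int(total_value * df / sum_of_df) + 1,
# where total_value is sum(range(5)) == 10, so terms holding a larger share of the
# total df receive proportionally larger integer weights (always at least 1).
def _rescale_expansion_terms_sketch(term_dfs, total_value=10):
    sum_of_df = sum(term_dfs.values())
    return {term: int(total_value * df / sum_of_df) + 1
            for term, df in term_dfs.items()}

# Example:
# _rescale_expansion_terms_sketch({"election": 6, "ballot": 3, "poll": 1})
# == {"election": 7, "ballot": 4, "poll": 2}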
def merge_all_posting(stemming_mode, posting_id, number_doc_in_corpus, the_final_terms_dictionary,
                      cach_dictionary, all_city, max_doc_city):
    # check_uppercase()
    path_folder_posting, path_folder_abc_posting, stemming_mode, city_path = init_path(stemming_mode)
    print("merge_all_posting")
    finish = False
    number_of_line_in_abc_posting = {}
    all_final_posting_path = create_final_posting(path_folder_abc_posting,
                                                  number_of_line_in_abc_posting, city_path)
    term_first_line_postings = {}
    freq_sum_doc_first_line_postings = {}
    the_open_posting_file = {}
    stemm_dictionary_values = []
    if stemming_mode == 'yes':
        stemm_dictionary = Stemmer.get_dictionary()  # All stemmed terms
        stemm_dictionary_values = Stemmer.get_dictionary_value()
    elif stemming_mode == 'no':
        stemm_dictionary = Stemmer.get_dictionary_without_stemming()  # All terms, unstemmed
    cach_dictionary.clear()
    terms_to_updated = {}  # The terms here are in lowercase letters
    close_file = {}
    # Save the first line of each temp posting file
    for index_file_of_posting in range(1, posting_id + 1):
        file_path = path_folder_posting + "\\TempPostings" + str(index_file_of_posting) + '.txt'
        curr_posting_file = open(file_path, "r")
        the_open_posting_file[index_file_of_posting] = curr_posting_file
        close_file[index_file_of_posting] = False
        find_first_line(curr_posting_file, index_file_of_posting, term_first_line_postings,
                        freq_sum_doc_first_line_postings, close_file)
    while not finish:
        # min_temp_posting = min(term_first_line_postings.keys(), key=(lambda index_post: term_first_line_postings[index_post]))
        min_term = min(term_first_line_postings.values())
        all_posting_file_with_equal_term = []
        list_doc = {}
        sum_tf = 0
        df = 0
        for index, term in term_first_line_postings.items():
            if min_term == term:
                all_posting_file_with_equal_term.append(index)
                sum_tf = sum_tf + int((freq_sum_doc_first_line_postings[index])[0])
                df = df + int((freq_sum_doc_first_line_postings[index])[1])
                list_doc.update((freq_sum_doc_first_line_postings[index])[2])
        # Handle capitalization
        if min_term[0].isupper():  # e.g. "Party" / "The"
            lowercase_term = min_term.lower()  # "party" / "the"
            if lowercase_term in stemm_dictionary:
                if stemming_mode == 'yes':
                    lowercase_term_after_stemm = stemm_dictionary[lowercase_term]  # "parti" / "the"
                else:
                    lowercase_term_after_stemm = lowercase_term
                if lowercase_term_after_stemm in terms_to_updated:
                    sum_tf = sum_tf + terms_to_updated[lowercase_term_after_stemm][0]
                    list_doc.update(terms_to_updated[lowercase_term_after_stemm][1])
                    terms_to_updated[lowercase_term_after_stemm] = (sum_tf, list_doc)
                else:
                    terms_to_updated[lowercase_term_after_stemm] = (sum_tf, list_doc)
            elif stemming_mode == 'yes' and lowercase_term in stemm_dictionary_values:
                if lowercase_term in terms_to_updated:
                    sum_tf = sum_tf + terms_to_updated[lowercase_term][0]
                    list_doc.update(terms_to_updated[lowercase_term][1])
                    terms_to_updated[lowercase_term] = (sum_tf, list_doc)
                else:
                    terms_to_updated[lowercase_term] = (sum_tf, list_doc)
            else:
                cach_dictionary[min_term] = sum_tf
                calculations_and_income_to_final_dictionary(
                    list_doc, sum_tf, df, number_doc_in_corpus, min_term,
                    all_final_posting_path, number_of_line_in_abc_posting,
                    the_final_terms_dictionary, all_city, max_doc_city)
        else:
            if min_term in terms_to_updated:  # e.g. "parti" / "the"
                sum_tf = sum_tf + terms_to_updated[min_term][0]
                cach_dictionary[min_term] = sum_tf
                list_doc.update(terms_to_updated[min_term][1])
                # print("final posting: " + min_term)
                calculations_and_income_to_final_dictionary(
                    list_doc, sum_tf, df, number_doc_in_corpus, min_term,
                    all_final_posting_path, number_of_line_in_abc_posting,
                    the_final_terms_dictionary, all_city, max_doc_city)
            else:
                # print("final posting: " + min_term)
                cach_dictionary[min_term] = sum_tf
                calculations_and_income_to_final_dictionary(
                    list_doc, sum_tf, df, number_doc_in_corpus, min_term,
                    all_final_posting_path, number_of_line_in_abc_posting,
                    the_final_terms_dictionary, all_city, max_doc_city)
        # Advance every temp posting file that contributed the merged term
        for i in all_posting_file_with_equal_term:
            find_first_line(the_open_posting_file[i], i, term_first_line_postings,
                            freq_sum_doc_first_line_postings, close_file)
        finish = check_if_finish(close_file)
    # Out of the while loop
    close_all_files(all_final_posting_path)
    Stemmer.reset()
    reset_temp_posting()
    return sum_numbers
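# A minimal, self-contained sketch of the k-way merge driving merge_all_posting above:
# each sorted temp posting is read one head entry at a time, the minimum head term is
# merged across every file that currently holds it, and only those files advance.
# Here postings are plain in-memory lists of (term, tf) pairs instead of open files.
def _k_way_merge_sketch(sorted_postings):
    heads = {i: 0 for i in range(len(sorted_postings))}  # Cursor per posting list
    merged = {}
    while heads:
        # Smallest term among the current heads
        min_term = min(sorted_postings[i][pos][0] for i, pos in heads.items())
        for i in [j for j, pos in heads.items() if sorted_postings[j][pos][0] == min_term]:
            term, tf = sorted_postings[i][heads[i]]
            merged[term] = merged.get(term, 0) + tf
            heads[i] += 1  # Advance this posting; drop it when exhausted
            if heads[i] == len(sorted_postings[i]):
                del heads[i]
    return merged

# Example: two sorted temp postings merged into one dictionary of summed tf values.
# _k_way_merge_sketch([[("apple", 2), ("car", 1)], [("car", 3), ("dog", 5)]])
# == {"apple": 2, "car": 4, "dog": 5}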