def main():
    ''' The main loop for the program '''
    config = ConfigClass()
    se = search_engine_best.SearchEngine(config=config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    # parquet_file_path = r.get_all_path_of_parquet()[0][0] + r.get_all_path_of_parquet()[0][1]
    # se.build_index_from_parquet(parquet_file_path)
    se.load_index('idx_bench')
    g = GUI()
    # s.load_existing_index()  # load if exists, otherwise return empty list
    while True:
        event, values = g.window.read()
        if event is None:
            break
        if event == '_SEARCH_':
            g.clear()
            query = values['TERM']
            start = datetime.now()
            relevant, tweets_id = se.search(query)
            end = datetime.now()
            total_time = (end - start).total_seconds()
            # print the results to the output element
            index = 0
            for tweet_id in tweets_id:
                if index < 25:
                    print("%s. tweet id: %s" % (index + 1, tweet_id))
                index += 1
            print()
            print("About %s tweets (%s seconds)" % (relevant, total_time))
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Parses and indexes every document in the corpus, then saves the inverted index to disk.
    :return:
    """
    number_of_documents = 0
    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)

    # Iterate over every document in every file
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of documents in the system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)

    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
def test_add_new_doc(self):
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    documents_list = r.read_file(file_name='sample3.parquet')
    # text1 = '@ampalombo I was going to my grandsons baseball games and the dumb F****s made a mask mandatory, are you kidding me'
    # add_new_doc expects a parsed document, so parse one from the sample file before indexing it
    parsed_document = p.parse_doc(documents_list[0])
    assert parsed_document is not None
    indexer.add_new_doc(parsed_document)
    text = 'i wad born in 2019'
def write_content_for_tweet_id():
    corpus_path = "C:\\Users\\ASUS\\Desktop\\Data"
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    names = r.get_files_names_in_dir()
    with open("text.csv", "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for name in names:
            documents_list = r.read_file_by_name(file_name=str(name))
            for doc in documents_list:
                # tweet_ids is expected to be a module-level collection of the ids whose text we want
                if doc[0] in tweet_ids:
                    writer.writerow([doc[0], doc[2]])
def run_engine(corpus_path='', output_path='.', stemming=False):
    """
    Entry point for corpus parsing and indexing
    :param corpus_path: path to the corpus directory
    :param output_path: directory where the index files are written
    :param stemming: boolean that says if stemming should be applied
    :return: total number of tweets parsed
    """
    config = ConfigClass(corpus_path, stemming, output_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    tweets_parsed = parse_wrapper(r, p, config)
    return tweets_parsed
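# The parse_wrapper helper used by run_engine above is not shown in this snippet. The following
# is only a rough sketch of what it might do, reusing reader/parser methods that appear elsewhere
# in this file (get_files_names_in_dir, read_file_by_name, parse_doc); the real helper's
# signature and behavior may differ.
def parse_wrapper(reader, parser, config):
    """Hypothetical helper: parse every document in the corpus and return how many were parsed."""
    total_parsed = 0
    for file_name in reader.get_files_names_in_dir():
        for document in reader.read_file_by_name(file_name=str(file_name)):
            parsed_document = parser.parse_doc(document)
            if parsed_document:  # skip documents the parser rejected (e.g. retweets)
                total_parsed += 1
    return total_parsed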
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve, word2vec):
    """
    Parses and indexes the whole corpus, flushing the posting buffer to disk every 500,000 documents.
    :return: number of posting-file writes
    """
    # print("start: ", time.asctime(time.localtime(time.time())))
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # documents_list = r.read_file(file_name='covid19_07-30.snappy.parquet')
    # TODO - handle all ~50 files (can be done with multiprocessing.pool.ThreadPool)

    # Iterate over every document in every file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # skip retweets (RT)
                continue
            number_of_documents += 1
            indexer.add_new_doc(parsed_document, num_of_writes)  # index the document data
            counter += 1
            if counter >= 500000:
                write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
                counter = 0
                # print("finish parser & index number: ", num_of_writes, " At: ", time.asctime(time.localtime(time.time())))
                num_of_writes += 1

    # print('Finished parsing and indexing. Starting to export files')
    write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
    # print("finish parser & index: ", time.asctime(time.localtime(time.time())))
    indexer.inverted_idx = {key: val for key, val in indexer.inverted_idx.items() if val != 1}
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # print("finish save index: ", time.asctime(time.localtime(time.time())))
    return num_of_writes
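# write_and_clean_buffer is called above but not defined in this snippet. Below is a guessed,
# minimal version for illustration only: it assumes the indexer keeps an in-memory postingDict
# (as other snippets in this file do) and that utils.save_obj can persist it; the actual helper
# in the project may look quite different.
def write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path):
    """Hypothetical helper: flush the in-memory posting buffer to disk and clear it."""
    prefix = 'stem_' if stemming else ''
    utils.save_obj(indexer.postingDict, os.path.join(output_path, '%spostings_%s' % (prefix, num_of_writes)))
    indexer.postingDict.clear()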
def run_engine(corpus_path_, output_path_, stemming_):
    """
    Parses the corpus in 200,000-document batches, writes partial posting files, then merges them
    into the final inverted index.
    :return:
    """
    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    pathes = r.get_all_path_of_parquet()
    length_of_array = len(pathes)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
    for i in range(0, length_of_array):
        # each entry in pathes is a (directory, file name) pair
        documents_list = r.get_documents(pathes[i][0], pathes[i][1])
        for doc, j in zip(documents_list, range(len(documents_list))):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            if number_of_documents % 200000 == 0:
                for parsed in parsed_doc_list:
                    indexer.add_new_doc(parsed)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list.clear()
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                for parsed in parsed_doc_list:
                    indexer.add_new_doc(parsed)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list.clear()
    indexer.merge_posting_file()
    indexer.merge_two_last_posting_file()
    indexer.split_posting_file_and_create_inverted_index()
    indexer.write_inverted_index_to_txt_file()
    number_of_documents = 0
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Parses and indexes the corpus, merges the posting files, and pickles the index and document
    dictionaries to disk.
    :return:
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming
    corpus_list = r.read_corpus()
    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx], read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            # flag the very last document so the indexer can finalize its posting files
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
        documents_list = []

    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)

    with open("docs_dict_and_extras", "wb") as pickle_out:
        pickle.dump(indexer.docs_dict, pickle_out)

    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")

    with open("docs_dict_and_extras", "ab") as pickle_out:
        pickle.dump(number_of_documents, pickle_out)
        pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
        pickle.dump(indexer.dump_path, pickle_out)
def run_engine(corpus_path=None, output_path=None, stemming=False, lemma=False, queries=None,
               num_docs_to_retrieve=None):
    """
    Parses and indexes every parquet file found under corpus_path, then frees the used memory.
    :return:
    """
    global config, number_of_documents
    number_of_documents = 0
    config = ConfigClass()
    config.corpusPath = corpus_path
    config.set_output_path(output_path)
    config.toStem = stemming
    config.toLemm = lemma
    # start from a clean output directory
    if os.path.exists(config.get_output_path()):
        shutil.rmtree(config.get_output_path())
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem, config.toLemm)
    indexer = Indexer(config)

    # collect every parquet file under the corpus directory
    documents_list = []
    for root, dirs, files in os.walk(corpus_path):
        r.set_corpus_path(root)
        for file in files:
            if file.endswith(".parquet"):
                documents_list += r.read_file(file)

    # Iterate over every document in the corpus
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)

    # Finished parsing and indexing all files - clean all the used memory
    documents_list.clear()
    indexer.cleanup(number_of_documents)
def run_engine():
    """
    :return:
    """
    number_of_documents = 0
    timer = True
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    # p = Parse(with_stemmer=True)
    indexer = Indexer(config)

    data_dir = 'Data' + os.sep + 'Data'
    npy_dirs = [root for root, dirs, files in os.walk(data_dir)]
    for dir_path in npy_dirs:
        files = [os.path.join(dir_path, fname) for fname in os.listdir(dir_path)
                 if fname.endswith('.parquet')]
        for file in files:
            tweets = r.read_file(file_name=file)

            # parse the tweets in parallel and report the timing
            start_time = time.perf_counter()
            with multiprocessing.Pool(12) as pool:
                documents_list = pool.map(p.parse_doc, tweets)
            end_time = time.perf_counter()
            avg_time_per_tweet = (end_time - start_time) / len(tweets)
            print(f'Parsed {len(tweets)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds, '
                  f'average per tweet: {avg_time_per_tweet:0.8f} seconds')

            # index the parsed documents and report the timing
            start_time = time.perf_counter()
            for parsed_document in documents_list:
                indexer.add_new_doc(parsed_document)
            end_time = time.perf_counter()
            print(f'Indexing {len(documents_list)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds')

    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
def run_engine():
    """
    :return:
    """
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    documents_list = r.read_file(file_name='sample3.parquet')

    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)

    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        if not config:
            self._config = ConfigClass()
        else:
            self._config = config
        self._parser = Parse()
        self._indexer = Indexer(self._config)
        self._model = None
        self._reader = ReadFile(self._config.get__corpusPath())

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads a parquet file and passes it to the parser, then the indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.check_pending_list()
        self._indexer.calculate_and_add_idf()
        self._indexer.calculate_sigma_Wij()
        self._indexer.calculate_avg_doc_len()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and a list of
            tweet_ids where the first element is the most relevant and the last is
            the least relevant result.
        """
        query_as_tuple = self._parser.parse_sentence(query)
        query_as_list = query_as_tuple[0] + query_as_tuple[1]
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query_as_list, k)
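# A minimal usage sketch for the SearchEngine class above. The parquet path and the query string
# are placeholders, and building the index in-process is only one possible flow;
# load_index('idx_bench') could be used instead to reuse a previously saved index.
if __name__ == '__main__':
    engine = SearchEngine(config=ConfigClass())
    engine.build_index_from_parquet('sample3.parquet')  # or: engine.load_index('idx_bench')
    n_relevant, tweet_ids = engine.search('covid vaccine')
    print("About %s tweets" % n_relevant)
    for rank, tweet_id in enumerate(tweet_ids[:10], start=1):
        print("%s. tweet id: %s" % (rank, tweet_id))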
def run_engine(stemming='n'):
    """
    :return:
    """
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    indexer.stemming = stemming
    entries = os.listdir('Data')
    start = time.time()
    print(start)
    playsound('Ding.mp3')
    number_of_documents = 0
    i = 1
    # for entire in entries:
    #     documents_list = r.read_file('Data/' + entire)
    #     documents_list_length = len(documents_list)
    #     # Iterate over every document in the file
    #     for idx, document in enumerate(documents_list):
    #         parsed_document = p.parse_doc(document)
    #         indexer.add_new_doc(parsed_document, documents_list_length)
    #     # break

    # documents_list = r.read_file(file_name='sample3.parquet')
    # doc_len = len(documents_list)
    # # Iterate over every document in the file
    # for idx, document in enumerate(documents_list):
    #     # parse the document
    #     parsed_document = p.parse_doc(document)
    #     number_of_documents += 1
    #     # index the document data
    #     indexer.add_new_doc(parsed_document, doc_len)
    # utils.save_obj(indexer.postingDict, "posting")
    # indexer.postingDict = None
    # utils.save_obj(indexer.tweet_dict, "tweet_dict")
    # indexer.tweet_dict = None
    # documents = os.listdir('posting_files')
    # for doc in documents:
    #     indexer.read_and_add_to_temp_dict('posting_files/' + doc)
    #
    # playsound('Ding.mp3')
    # middle = time.time()
    # print('middle: ', (middle - start) / 60, " minutes")
    # print('Finished parsing and indexing. Starting to export files')
    #
    # utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # utils.save_obj(indexer.reversed_inverted_index, "reversed_inverted_index")
    # indexer.inverted_idx = None
    # indexer.reversed_inverted_index = None

    r = Ranker()
    # for doc in documents:
    playsound('Ding.mp3')
    r.create_global_method()
    end = time.time()
    print(end)
    print('end: ', (end - start) / 60, " minutes")