def run_engine(corpus_path='', output_path='', stemming=False):
    """
    Builds the inverted index over the corpus: walks corpus_path for parquet files,
    parses every document in a process pool, indexes it, and saves the term and
    document dictionaries to output_path.
    :return:
    """
    # Create PostingFile directory if it doesn't exist
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=corpus_path)
    p = Parse(stemming)
    indexer = Indexer(config, output_path)

    # Get all parquet files from the corpus path
    parquets = []
    for root, dirs, files in os.walk(corpus_path):
        for name in files:
            if name.endswith((".parquet", ".htm")):
                parquets.append((root, name))

    for index in range(len(parquets)):
        r.corpus_path = parquets[index][0]
        documents_list = r.read_file(file_name=parquets[index][1])
        # Parse the documents of this file with a pool of worker processes
        with Pool(CPUCOUNT) as _p:
            for parsed_doc in _p.imap_unordered(p.parse_doc, documents_list):
                number_of_documents += 1
                indexer.add_new_doc(parsed_doc)
            _p.close()
            _p.join()
        p.entities.clear()

    indexer.finish_index()
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")
    indexer.document_dict.clear()
    indexer.term_dict.clear()

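# Usage sketch (assumption, not part of the original snippet): because this variant parses
# documents with a multiprocessing Pool, the call should be guarded by __main__ so that
# spawn-based platforms (Windows, macOS) do not re-import and re-run the module in every
# worker process. The example paths below are placeholders.
if __name__ == '__main__':
    run_engine(corpus_path='Data', output_path='PostingFiles', stemming=False)
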
def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None): """ This function build the inverted index over the corpus. send each tweet to parsing and indexing. if the stemming is True the parsing will use the stemmer on the tokens. :param glove_dict: Glove file including all word vectors :param corpus_path: root folder containing the raw tweet files :param output_path for the inverted index, posting files and tweets dictionary :param stemming if True use stemmer on terms """ config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2, output_path=output_path) r = ReadFile(corpus_path=config.get_corpusPath()) p = Parse(stemming) indexer = Indexer(config) all_files_paths = glob.glob(config.get_corpusPath() + "\\*\\*.snappy.parquet") all_files_names = [file_name[file_name.find("\\") + 1:] for file_name in all_files_paths] start_time = time.time() file_counter = 0 for file_name in all_files_names: file_start_time = time.time() # print("start file :", file_counter) documents_list = [document for document in r.read_file(file_name=file_name)] # Iterate over every document in the file for idx, document in enumerate(documents_list): parsed_document = p.parse_doc(document) indexer.add_new_doc(parsed_document, glove_dict) # print("end file number ", file_counter, " in: ", time.time() - file_start_time) file_counter += 1 total_time = time.time() - start_time indexer.finish_indexing()
def run_engine(corpus_path, output_path, stemming=False):
    """
    :param corpus_path: path for parquet files
    :param output_path: path to write pickle files
    :param stemming: boolean to use stemming or not
    :return:
    """
    ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    indexer = Indexer(output_path, stemming)

    if corpus_path.endswith('parquet'):
        documents_list = r.read_file(corpus_path)
        parseAndIndexDocuments(documents_list, p, indexer)
    else:
        documents_list = r.read_dir()
        while documents_list:
            parseAndIndexDocuments(documents_list, p, indexer)
            documents_list = r.read_dir()

    documents_list.clear()
    indexer.merge_posting_files()
    lda = LDA(output_path, indexer.dictdoc, stemming)
    lda.build_ldaModel()

def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Builds the inverted index over the corpus, writes posting leftovers to disk
    and exports the inverted index.
    :return:
    """
    number_of_documents = 0
    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)

    # Iterate over every file in the corpus
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of documents in the system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)

    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")

def run_engine(config):
    """
    Builds the inverted index over the corpus described by config and saves the
    inverted index, remaining posting files and document count to disk.
    :return:
    """
    number_of_documents = 0
    output_path = config.savedFileMainFolder
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    m_Indexer = Indexer(output_path)

    parquetPaths = []
    for (dirPath, dirNames, fileNames) in os.walk(config.get__corpusPath()):
        for fileName in fileNames:
            parquetPaths.append(dirPath + '\\' + fileName)

    for i in range(len(parquetPaths)):
        parquetPaths[i] = parquetPaths[i][parquetPaths[i].find('\\') + 1:]
        if ".DS_Store" in parquetPaths[i]:
            continue
        parquet = r.read_file(file_name=parquetPaths[i])
        for document in parquet:
            number_of_documents += 1
            parsed_document = p.parse_doc(document)
            # index the document data
            m_Indexer.add_new_doc(parsed_document)

    # if there are more postings to flush, do it
    if len(m_Indexer.postingDictionary) > 0:
        utils.save_obj(m_Indexer.postingDictionary, m_Indexer.postingsPath + '/' + str(m_Indexer.pkl_key))

    # Clear single terms and entities, then write the updated inverted index to disk
    clearSingleEntities(m_Indexer.inverted_idx, p, output_path, m_Indexer.num_of_docs_in_corpus)
    utils.save_obj(m_Indexer.inverted_idx, output_path + '/inverted_idx')
    m_Indexer.inverted_idx.clear()
    utils.save_obj(number_of_documents, output_path + '/PostingFiles/num_of_docs_in_corpus')

def test_add_new_doc(self):
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    documents_list = r.read_file(file_name='sample3.parquet')
    # text1 = '@ampalombo I was going to my grandsons baseball games and the dumb F****s made a mask mandatory, are you kidding me'
    # add_new_doc expects a parsed document, so parse the first document from the sample file
    parsed_document = p.parse_doc(documents_list[0])
    assert indexer.add_new_doc(parsed_document)
    text = 'i wad born in 2019'

def test_reader():
    global num_test_failed, results_summary
    num_test_failed = 0
    r = ReadFile(corpus_path)
    correct_answers = [x['len'] for x in reader_inputs]
    student_answers = [len(r.read_file(x['file'])) for x in reader_inputs]
    test_part(correct_answers, student_answers, error_str="read")
    if num_test_failed == 0:
        results_summary.append('All Reader tests passed')

def run_engine(config, indexer):
    """
    Builds the index over the benchmark data: parses each tweet, indexes it,
    applies the capital-letters correction and saves the index.
    :return:
    """
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config)

    doc = r.read_file('benchmark_data_train.snappy.parquet')
    for document in doc:
        parsed_document = p.parse_doc(document)
        indexer.add_new_doc(parsed_document)
        number_of_documents += 1

    capital_letters = p.caps_dict
    indexer.change_inverted_by_caps(capital_letters)
    indexer.save_index('idx_bench')

def run_engine(config):
    """
    Builds the inverted index over the corpus, saves posting files and document
    dictionaries, and stores corpus statistics (document count and average document length).
    :return:
    """
    number_of_documents = 0
    sum_of_doc_lengths = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config, glove_dict)  # glove_dict is expected to be loaded at module level
    # documents_list = r.read_file(file_name=config.get__corpusPath())
    parquet_documents_list = r.read_folder(config.get__corpusPath())

    for parquet_file in parquet_documents_list:
        documents_list = r.read_file(file_name=parquet_file)
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            # index the document data
            indexer.add_new_doc(parsed_document)

    # save the last posting file after the indexer has finished adding documents
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()
    indexer.delete_dict_after_saving()

    # merge posting files
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())

    details = {'number_of_documents': number_of_documents,
               'avg_length_per_doc': sum_of_doc_lengths / number_of_documents}
    utils.save_dict(details, 'details', config.get_out_path())

def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Builds the inverted index over the corpus, merges posting files and saves the
    inverted index, spelling dictionary and document dictionary to disk.
    :return:
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming

    corpus_list = r.read_corpus()
    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx], read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
        documents_list = []

    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)

    pickle_out = open("docs_dict_and_extras", "wb")
    pickle.dump(indexer.docs_dict, pickle_out)
    pickle_out.close()

    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")

    pickle_out = open("docs_dict_and_extras", "ab")
    pickle.dump(number_of_documents, pickle_out)
    pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
    pickle.dump(indexer.dump_path, pickle_out)
    pickle_out.close()

def build_index_from_parquet(self, fn):
    """
    Reads a parquet file and passes it to the parser, then the indexer.
    Input:
        fn - path to parquet file
    Output:
        No output, just modifies the internal _indexer object.
    """
    r = ReadFile()
    df = r.read_file(fn)
    documents_list = df
    # Iterate over every document in the file
    number_of_documents = 0
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = self._parser.parse_doc(document)
        number_of_documents += 1
        # index the document data
        self._indexer.add_new_doc(parsed_document)
    print('Finished parsing and indexing.')

def run_engine(corpus_path=None, output_path=None, stemming=False, lemma=False, queries=None,
               num_docs_to_retrieve=None):
    """
    Builds the inverted index over the corpus: reads all parquet files under corpus_path,
    parses and indexes every document, then cleans up the used memory.
    :return:
    """
    global config, number_of_documents
    number_of_documents = 0
    config = ConfigClass()
    config.corpusPath = corpus_path
    config.set_output_path(output_path)
    config.toStem = stemming
    config.toLemm = lemma

    if os.path.exists(config.get_output_path()):
        shutil.rmtree(config.get_output_path())

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem, config.toLemm)
    indexer = Indexer(config)

    documents_list = []
    for root, dirs, files in os.walk(corpus_path):
        r.set_corpus_path(root)
        for file in files:
            if file.endswith(".parquet"):
                documents_list += r.read_file(file)

    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)

    # Finished parsing and indexing all files - free the used memory
    documents_list.clear()
    indexer.cleanup(number_of_documents)

def build_index_from_parquet(self, fn):
    """
    Reads a parquet file and passes it to the parser, then the indexer.
    Input:
        fn - path to parquet file
    Output:
        No output, just modifies the internal _indexer object.
    """
    number_of_documents = 0
    r = ReadFile(corpus_path=self._config.get__corpusPath())
    doc = r.read_file(fn)
    for document in doc:
        parsed_document = self._parser.parse_doc(document)
        self._indexer.add_new_doc(parsed_document)
        number_of_documents += 1
    capital_letters = self._parser.caps_dict
    self._indexer.change_inverted_by_caps(capital_letters)
    self._indexer.save_index('idx_bench')
    print('Finished parsing and indexing.')

def run_engine():
    """
    Builds the inverted index over the corpus: parses each parquet file with a process
    pool, indexes the documents and exports the inverted index and posting files.
    :return:
    """
    number_of_documents = 0
    timer = True
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    # p = Parse(with_stemmer=True)
    indexer = Indexer(config)

    data_dir = 'Data' + os.sep + 'Data'
    npy_dirs = [root for root, dirs, files in os.walk(data_dir)]
    for dir_path in npy_dirs:
        files = [os.path.join(dir_path, fname) for fname in os.listdir(dir_path)
                 if fname.endswith('.parquet')]
        for file in files:
            tweets = r.read_file(file_name=file)

            start_time = time.perf_counter()
            # use a context manager so the worker pool is closed after parsing each file
            with multiprocessing.Pool(12) as pool:
                documents_list = pool.map(p.parse_doc, tweets)
            end_time = time.perf_counter()
            avg_time_per_tweet = (end_time - start_time) / len(tweets)
            print(f'Parsed {len(tweets)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds, '
                  f'average per tweet: {avg_time_per_tweet:0.8f} seconds')

            start_time = time.perf_counter()
            for parsed_document in documents_list:
                indexer.add_new_doc(parsed_document)
            end_time = time.perf_counter()
            print(f'Indexing {len(documents_list)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds')

    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")

def build_index_from_parquet(self, fn):
    """
    Reads a parquet file and passes it to the parser, then the indexer.
    Input:
        fn - path to parquet file
    Output:
        No output, just modifies the internal _indexer object.
    """
    rd = ReadFile(fn)
    documents_list = rd.read_file()
    # Iterate over every document in the file
    number_of_documents = 0
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = self._parser.parse_doc(document)
        number_of_documents += 1
        # index the document data
        self._indexer.add_new_doc(parsed_document)
    self._indexer.thresh_hold = 100000
    self._indexer.thresh_hold_handler()
    self._indexer.save_index("inverted_idx")

def run_engine():
    """
    Builds the inverted index over a sample parquet file and exports the inverted
    index and posting file.
    :return:
    """
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    documents_list = r.read_file(file_name='sample3.parquet')
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)

    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")

def run_engine():
    """
    Builds the inverted index over the corpus one folder of tweets at a time, saves the
    inverted index and tf-idf dictionary, and returns the LDA model.
    :return: the indexer's LDA model
    """
    number_of_documents = 0
    corpus_path = config.get__corpusPath()
    r = ReadFile(corpus_path)
    indexer = Indexer(config)
    p = Parse(config)

    # reading per folder
    r.create_files_name_list()
    files_list = []  # every index contains all tweets per folder
    for file_name in r.dates_list:
        tweets_per_date = r.read_file(file_name)
        files_list.append(tweets_per_date)
    # print("files_list", len(files_list))

    num_of_tweets = 0
    for folder_list in files_list:
        num_of_tweets += len(folder_list)
    # print("num_of_tweets", num_of_tweets)

    # disabled alternative: threaded reading per folder
    """#reading per folder
    r.create_files_name_list()
    threads = []
    for file_name in r.dates_list:
        t = threading.Thread(target=r.read_file(file_name))
        threads.append(t)
        t.start()
    print("files_list", r.files_list)"""

    # disabled alternative: one process per folder
    """counter = 1
    procs = []
    # Iterate over every folder in the DATA
    for folder_list in files_list:
        proc = Process(target=test, args=(folder_list, counter, indexer, number_of_documents,))
        procs.append(proc)
        proc.start()
    # complete the processes
    for proc in procs:
        proc.join()
    print('Finished parsing and indexing. Starting to export files')"""

    counter = 1
    # Iterate over every folder in the DATA
    for folder_list in files_list:
        # print(counter)
        # print(datetime.now())
        # Iterate over every tweet in the folder
        for idx, tweet in enumerate(folder_list):
            # parse the tweet
            parsed_document = p.parse_doc(tweet)
            number_of_documents += 1
            # index the tweet data
            indexer.add_new_doc(parsed_document, num_of_tweets)
        # print("number of tweets", number_of_documents)
        # print(datetime.now())
        counter += 1
    # print('Finished parsing and indexing. Starting to export files')

    # disabled alternative: read only one folder
    """#read only one folder
    documents_list = r.read_file(file_name='')
    num_indexed = len(documents_list)
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document, num_indexed)
    #print('Finished parsing and indexing. Starting to export files')"""

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.tf_idf_dict, "tf_idf_dict")
    return indexer.get__lda__()

# r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-15-2020\covid19_07-15.snappy.parquet", # r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-16-2020\covid19_07-16.snappy.parquet", # r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-18-2020\covid19_07-18.snappy.parquet", # r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-20-2020\covid19_07-20.snappy.parquet", # r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-04-2020\covid19_08-04.snappy.parquet", # r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-27-2020\covid19_07-27.snappy.parquet", # ] files_to_process = [ r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-07-2020\covid19_08-07.snappy.parquet", r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-06-2020\covid19_08-06.snappy.parquet", r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-05-2020\covid19_08-05.snappy.parquet", ] for file in files_to_process: documents_list += reader.read_file(file) with open(preprocessed_file, "a+") as f: for idx, document in enumerate(documents_list): # parse the document parsed_document = parser.parse_doc(document) doc = "" for i, word in enumerate(parsed_document.term_doc_dictionary): if word == "#" or "#_" in word: continue if i == len(parsed_document.term_doc_dictionary) - 1: doc = doc.replace('\n', "") doc += "\n" break
last_tweet_num = 0
if start_from_backup:
    with open("backup_file.pkl", 'rb') as f:
        ls = pickle.load(f)
    # restore annotation progress from the loaded backup list
    tweet_annotation_dict = ls[0]
    last_tweet_num = ls[1]

for dir in os.listdir(data_path):
    dir_path = os.path.join(data_path, dir)
    if not os.path.isdir(dir_path):
        continue
    for file in os.listdir(dir_path):
        file_path = os.path.join(dir, file)
        if file[-7:] != "parquet":
            continue
        print(f'Reading {file}...')
        file_as_list = rdr.read_file(file_path)
        for tweet in file_as_list:
            if int(tweet[0]) in tweets_queries_dict:
                if start_from_backup:
                    tweet_num += 1
                    if tweet_num == last_tweet_num:
                        start_from_backup = False
                    break
                print(f"Query: {queries[tweets_queries_dict[int(tweet[0])]-1]}")
                print(f"Tweet {tweet_num}/{len(tweets_queries_dict)}:\n{tweet}")
                annotation = ""
                while annotation != 0 and annotation != 1:
                    try:
                        annotation = int(

class SearchEngine:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        """
        Init the engine with the relevant model - Thesaurus_Searcher.
        :param config:
        """
        self._config = config
        try:
            self._reader = ReadFile(corpus_path=config.get__corpusPath())
        except:
            self._reader = ReadFile("")
        self._parser = Parse()
        self._parser.STEMMER = config.toStem
        self._indexer = Indexer(config)
        self._model = Thesaurus_Searcher(self._indexer)
        self.last_parquet = False

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads a parquet file and passes it to the parser, then the indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        documents_list = self._reader.read_file(fn)
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            if self.last_parquet and idx == len(documents_list) - 1:
                self._indexer.last_doc = True
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and a list of
            tweet_ids where the first element is the most relevant and the last is
            the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query, 1500)

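# Usage sketch (assumption, not part of the original class): the expected call order for the
# SearchEngine above is to construct it with a config, build the index from a parquet file,
# then issue a query. The ConfigClass() construction details and the 'sample3.parquet' path
# are placeholders for illustration.
if __name__ == '__main__':
    engine = SearchEngine(ConfigClass())
    engine.build_index_from_parquet('sample3.parquet')
    n_relevant, tweet_ids = engine.search('covid vaccine')
    print(n_relevant, tweet_ids[:10])
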
def run_engine(corpus_path, output_path, stemming=False):
    """
    Builds the retrieval model: preprocesses, parses and indexes the corpus.
    :return: a tuple of number_of_documents in the corpus and average_document_length
    """
    number_of_documents = 0
    total_document_length = 0
    reader = ReadFile(corpus_path)
    parser = Parse()
    indexer = Indexer(output_path)

    # read all parquet data files
    files = glob(corpus_path + "/**/*.parquet", recursive=True)

    # Read, parse and index documents in batches. Posting files are divided by the English alphabet.
    # A batch is defined as all the documents in a single parquet file.
    # Each batch is first written as many sub-batches indicated by an index and later merged into one coherent batch.
    batch_index = 0
    file_index = 0
    while file_index < len(files):
        # batch two files at a time to reduce the disk seek time penalty
        first_file = files[file_index]
        first_documents_list = reader.read_file(first_file)
        if file_index + 1 < len(files):
            second_file = files[file_index + 1]
            second_documents_list = reader.read_file(second_file)
            documents_list = first_documents_list + second_documents_list
        else:
            # only one file is left for the last batch
            documents_list = first_documents_list
        file_index += 2

        # parse every document in the batch
        parsed_file = set()
        for document_as_list in documents_list:
            parsed_document = parser.parse_doc(document_as_list, stemming)
            parsed_file.add(parsed_document)
            total_document_length += parsed_document.doc_length
            number_of_documents += 1

        # index parsed documents
        indexer.index_batch(parsed_file, str(batch_index))
        batch_index += 1

    # calculate average document length
    average_document_length = float(total_document_length) / number_of_documents

    # after indexing all non-entity terms in the corpus, index legal entities
    indexer.index_entities()
    # save index dictionary to disk
    utils.save_obj(indexer.inverted_idx, output_path + "inverted_idx")
    # after indexing the whole corpus, consolidate all partial posting files
    indexer.consolidate_postings()

    return number_of_documents, average_document_length

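# Usage sketch (assumption, not part of the original function): this variant returns corpus
# statistics, so a caller can keep them for ranking, e.g. the average document length used
# by BM25-style length normalization. The paths below are placeholders for illustration.
if __name__ == '__main__':
    n_docs, avg_len = run_engine(corpus_path='Data', output_path='posting/', stemming=False)
    print(f'Indexed {n_docs} documents, average document length {avg_len:.2f} terms')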