def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None):
    """
    Build the inverted index over the corpus.

    Every document of every ``*.snappy.parquet`` file found one folder below
    the corpus root is parsed and handed to the indexer.

    :param corpus_path: root folder containing the raw tweet files
    :param output_path: folder for the inverted index, posting files and tweets dictionary
    :param stemming: passed to the parser; if True the stemmer is used on terms
    :param glove_dict: Glove word vectors, forwarded unchanged to the indexer
    :return: None
    """
    config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2,
                         output_path=output_path)
    r = ReadFile(corpus_path=config.get_corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config)

    # The pattern uses Windows separators: <corpus>\<sub folder>\<name>.snappy.parquet
    all_files_paths = glob.glob(config.get_corpusPath() + "\\*\\*.snappy.parquet")
    # Keep only what follows the first backslash of each match.
    # NOTE(review): presumably this strips the corpus folder so the reader can
    # re-join it - confirm against ReadFile.read_file.
    all_files_names = [file_name[file_name.find("\\") + 1:] for file_name in all_files_paths]

    for file_name in all_files_names:
        # Iterate the reader's result directly; no intermediate copy is needed.
        for document in r.read_file(file_name=file_name):
            parsed_document = p.parse_doc(document)
            indexer.add_new_doc(parsed_document, glove_dict)

    indexer.finish_indexing()
def run_engine(config):
    """
    Parse and index every file found under the corpus folder and save the results.

    :param config: configuration object; ``savedFileMainFolder``, ``toStem`` and
        ``get__corpusPath()`` are read here
    :return: None
    """
    number_of_documents = 0
    output_path = config.savedFileMainFolder
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    m_Indexer = Indexer(output_path)

    # Gather the full path of every file below the corpus folder before reading any.
    discovered_paths = []
    for dir_path, _dir_names, file_names in os.walk(config.get__corpusPath()):
        discovered_paths.extend(dir_path + '\\' + file_name for file_name in file_names)

    for full_path in discovered_paths:
        # Keep only what follows the first backslash of the path.
        relative_path = full_path[full_path.find('\\') + 1:]
        if ".DS_Store" in relative_path:
            continue
        for document in r.read_file(file_name=relative_path):
            number_of_documents += 1
            parsed_document = p.parse_doc(document)
            # index the document data
            m_Indexer.add_new_doc(parsed_document)

    # if there's more postings to flush, do it.
    pending_postings = m_Indexer.postingDictionary
    if len(pending_postings) > 0:
        utils.save_obj(pending_postings, m_Indexer.postingsPath + '/' + str(m_Indexer.pkl_key))

    # Clear single terms and entities, then write the updated inverted index to disk.
    clearSingleEntities(m_Indexer.inverted_idx, p, output_path, m_Indexer.num_of_docs_in_corpus)
    utils.save_obj(m_Indexer.inverted_idx, output_path + '/inverted_idx')
    m_Indexer.inverted_idx.clear()
    utils.save_obj(number_of_documents, output_path + '/PostingFiles/num_of_docs_in_corpus')
def run_engine(config):
    """
    Parse and index every document returned by the reader, then merge and save the index.

    :param config: configuration object; ``toStem``, ``get__corpusPath()`` and
        ``get_out_path()`` are read here
    :return: None
    """
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config)

    for documents in r.read_all_parquet():
        for position in range(len(documents)):  # for every doc
            # parse the document
            parsed_document = p.parse_doc(documents[position])
            if parsed_document is not None:
                number_of_documents += 1
                # index the document data
                indexer.add_new_doc(parsed_document)

    indexer.save_postings()  # saves the remaining posting file.
    PostingsMerge(indexer).chunks_merging()
    utils.save_dict_as_pickle(indexer.inverted_idx, "inverted_idx", config.get_out_path())
def run_engine(corpus_path='', output_path='', stemming=False):
    """
    Parse the corpus files with a worker pool and index the parsed documents.

    :param corpus_path: folder that is walked for files ending in ".parquet" or ".htm"
    :param output_path: folder prefix for the saved "inverted_idx" and "doc_dictionary" objects
    :param stemming: passed to the parser
    :return: None
    """
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=corpus_path)
    p = Parse(stemming)
    indexer = Indexer(config, output_path)

    # Get all matching files from corpus path as (folder, file name) pairs
    parquets = []
    for root, _dirs, files in os.walk(corpus_path):
        parquets.extend((root, name) for name in files if name.endswith((".parquet", ".htm")))

    for folder, file_name in parquets:
        r.corpus_path = folder
        documents_list = r.read_file(file_name=file_name)
        # A fresh pool per file; documents are parsed in the workers and indexed here
        # in whatever order the workers finish.
        with Pool(CPUCOUNT) as workers:
            for parsed_doc in workers.imap_unordered(p.parse_doc, documents_list):
                number_of_documents += 1
                indexer.add_new_doc(parsed_doc)
            workers.close()
            workers.join()
        p.entities.clear()

    indexer.finish_index()
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")
    indexer.document_dict.clear()
    indexer.term_dict.clear()
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Parse and index every file listed by the reader, flush what is left and save the index.

    :param corpus_path: forwarded to ConfigClass
    :param output_path: forwarded to ConfigClass
    :param stemming: forwarded to ConfigClass and to the parser
    :param queries: not used by this function
    :param num_docs_to_retrieve: not used by this function
    :return: None
    """
    number_of_documents = 0

    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)

    # Start the clock once, before the first file: the elapsed times printed
    # below then cover the whole run, and the name is defined even when the
    # reader lists no files.
    start_time = time.time()

    # Iterate over every document in every file
    for file_path in r.filesPath:
        documents_list = r.read_file(file_path)
        for document in documents_list:
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of doc in system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)

    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve, word2vec):
    """
    Parse and index the corpus, writing the indexer buffer out every 500000 indexed documents.

    :param corpus_path: forwarded to ConfigClass
    :param output_path: forwarded to write_and_clean_buffer
    :param stemming: forwarded to the parser and to write_and_clean_buffer
    :param queries: not used by this function
    :param num_docs_to_retrieve: not used by this function
    :param word2vec: forwarded to the indexer
    :return: the value of the write counter after the final write
    """
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)

    docs_in_buffer = 0
    for name in r.get_files_names_in_dir():
        for document in r.read_file_by_name(file_name=str(name)):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1
            indexer.add_new_doc(parsed_document, num_of_writes)  # index the document data
            docs_in_buffer += 1
            if docs_in_buffer < 500000:
                continue
            # Buffer is full: write it out and start the next batch.
            write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
            docs_in_buffer = 0
            num_of_writes += 1

    # Write whatever is still buffered after the last document.
    write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)

    # Keep only the entries whose value is not 1.
    kept_entries = {}
    for key, val in indexer.inverted_idx.items():
        if val != 1:
            kept_entries[key] = val
    indexer.inverted_idx = kept_entries
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    return num_of_writes
def _flush_parsed_batch(indexer, parsed_docs, iteration):
    """Index every buffered document, write the batch's posting file and empty the buffer."""
    for parsed_doc in parsed_docs:
        indexer.add_new_doc(parsed_doc)
    indexer.write_posting_to_txt_file_lower_upper(iteration)
    parsed_docs.clear()


def run_engine(corpus_path_, output_path_, stemming_):
    """
    Parse the corpus, index it in batches of 200000 parsed documents and build the final index.

    Documents for which the parser returns None are skipped. Whatever is still
    buffered after the last file is flushed once, after the loops, so the tail
    of the corpus is indexed even when the last document is skipped or the last
    file is empty (the previous in-loop check missed both cases).

    :param corpus_path_: corpus folder, stored on the configuration object
    :param output_path_: output folder, stored on the configuration object
    :param stemming_: forwarded to ConfigClass
    :return: None
    """
    batch_size = 200000
    number_of_documents = 0

    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    pathes = r.get_all_path_of_parquet()
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = []

    for path_entry in pathes:
        # NOTE(review): element [0] is passed for both arguments; confirm that
        # the second one should not be path_entry[1].
        documents_list = r.get_documents(path_entry[0], path_entry[0])
        for doc in documents_list:
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            if number_of_documents % batch_size == 0:
                _flush_parsed_batch(indexer, parsed_doc_list, iteration)
                iteration += 1

    # Flush the final, partially filled batch.
    if parsed_doc_list:
        _flush_parsed_batch(indexer, parsed_doc_list, iteration)

    indexer.merge_posting_file()
    indexer.merge_two_last_posting_file()
    indexer.split_posting_file_and_create_inverted_index()
    indexer.write_inverted_index_to_txt_file()
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Parse and index the corpus, merge the posting files and persist the index and its extras.

    Files written in the working directory: 'spell_dict.json', the pickle file
    "docs_dict_and_extras" (documents dictionary first, then the document
    count, Parse.AMOUNT_OF_NUMBERS_IN_CORPUS and the indexer dump path
    appended after the merge) and the object saved as "inverted_index".

    :param corpus_path: forwarded to ConfigClass
    :param output_path: forwarded to ConfigClass
    :param stemming: forwarded to ConfigClass and stored on Parse.stemmer
    :param queries: not used by this function
    :param num_docs_to_retrieve: not used by this function
    :return: None
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming

    corpus_list = r.read_corpus()
    last_file_idx = len(corpus_list) - 1
    for idx, corpus_file in enumerate(corpus_list):
        documents_list = r.read_file(file_name=corpus_file, read_corpus=True)
        last_doc_idx = len(documents_list) - 1
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            # Tell the indexer when it receives the very last document of the corpus.
            if i == last_doc_idx and idx == last_file_idx:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
        documents_list = []  # drop the reference to the file's documents

    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)

    # Context managers close the pickle file even if dumping raises.
    with open("docs_dict_and_extras", "wb") as pickle_out:
        pickle.dump(indexer.docs_dict, pickle_out)

    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")

    # Append the extras after the documents dictionary written above.
    with open("docs_dict_and_extras", "ab") as pickle_out:
        pickle.dump(number_of_documents, pickle_out)
        pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
        pickle.dump(indexer.dump_path, pickle_out)
def run_engine(config):
    """
    Parse and index every file of the corpus folder, merge the postings and save the results.

    Saved through utils.save_dict into config.get_out_path(): "documents_dict",
    "inverted_idx" and 'details' (document count and average document length).

    :param config: configuration object; ``toStem``, ``get__corpusPath()`` and
        ``get_out_path()`` are read here
    :return: None
    """
    number_of_documents = 0
    sum_of_doc_lengths = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    # glove_dict is not defined in this function; it is read from the enclosing module.
    indexer = Indexer(config, glove_dict)

    for parquet_file in r.read_folder(config.get__corpusPath()):
        # Iterate over every document in the file
        for document in r.read_file(file_name=parquet_file):
            # parse the document
            parsed_document = p.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            # index the document data
            indexer.add_new_doc(parsed_document)

    # saves last posting file after indexer has done adding documents.
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()
    indexer.delete_dict_after_saving()

    # merges posting files.
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())

    # An empty corpus (or one where every document was skipped) must not
    # divide by zero; report an average length of 0 in that case.
    if number_of_documents:
        avg_length_per_doc = sum_of_doc_lengths / number_of_documents
    else:
        avg_length_per_doc = 0
    dits = {'number_of_documents': number_of_documents, "avg_length_per_doc": avg_length_per_doc}
    utils.save_dict(dits, 'details', config.get_out_path())
def parse_and_index(r, p, config, i):
    """
    This function goes through the entire journey of dealing with an input file.
    Reading it from disk, parsing it, indexing it and writing the temporary index files to disk.
    It is reading the ith file from the reader list of files.

    :param r: Reader class, list the files to read and deal with reading them
    :param p: Parse class, deals with parsing a document
    :param config: Config class, contains info about stemming and where to save files
    :param i: index of file to deal with from the entire list of files
    :return: number of tweets read in the specific file
    """
    number_of_documents = 0

    # obtain relevant tweets list
    documents_list = r.read_file_at_index(i)
    indexer = Indexer(config)

    # Iterate over every document in the file
    for document in documents_list:
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)

    # save all the temporary files from indexer in tmp directory
    saving_dir = config.get_save_files_dir() + "/tmp"
    # exist_ok avoids the check-then-create race when several tasks reach this
    # point at the same time and the directory appears in between.
    os.makedirs(saving_dir, exist_ok=True)

    # Every output name carries the file index so tasks do not overwrite each other.
    utils.save_obj(indexer.inverted_idx, saving_dir + "/inverted_idx_" + str(i))
    utils.save_obj(indexer.documentDict, saving_dir + "/document_dict_" + str(i))
    utils.save_obj(indexer.entities_idx, saving_dir + "/entities_idx_" + str(i))
    dump_postings(i, indexer.postingDict, saving_dir, "postingDict")
    dump_postings(i, indexer.entities_posting, saving_dir, "entitiesDict")

    return number_of_documents
def test_add_new_doc(self):
    """
    Build a reader, parser and indexer from the default configuration, read
    'sample3.parquet' and assert that ``indexer.add_new_doc()`` returns a truthy value.
    """
    # NOTE(review): add_new_doc is called here without a document, and the
    # parser, the loaded documents and the sample text are never used - confirm
    # this test is complete.
    config = ConfigClass()
    reader = ReadFile(corpus_path=config.get__corpusPath())
    parser = Parse()
    indexer = Indexer(config)
    sample_documents = reader.read_file(file_name='sample3.parquet')
    # text1 = '@ampalombo I was going to my grandsons baseball games and the dumb F****s made a mask mandatory, are you kidding me'
    add_result = indexer.add_new_doc()
    assert add_result
    sample_text = 'i wad born in 2019'
def run_engine(corpus_path=None, output_path=None, stemming=False, lemma=False, queries=None,
               num_docs_to_retrieve=None):
    """
    Rebuild the index from every ".parquet" file found under the corpus folder.

    The module-level names ``config`` and ``number_of_documents`` are rebound
    here. An existing output folder is deleted before indexing starts.

    :param corpus_path: folder walked for ".parquet" files
    :param output_path: stored on the configuration object
    :param stemming: stored on the configuration object as ``toStem``
    :param lemma: stored on the configuration object as ``toLemm``
    :param queries: not used by this function
    :param num_docs_to_retrieve: not used by this function
    :return: None
    """
    global config, number_of_documents
    number_of_documents = 0

    config = ConfigClass()
    config.corpusPath = corpus_path
    config.set_output_path(output_path)
    config.toStem = stemming
    config.toLemm = lemma

    # Start from a clean output folder.
    if os.path.exists(config.get_output_path()):
        shutil.rmtree(config.get_output_path())

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem, config.toLemm)
    indexer = Indexer(config)

    documents_list = []
    for root, _dirs, files in os.walk(corpus_path):
        r.set_corpus_path(root)
        parquet_names = [file_name for file_name in files if file_name.endswith(".parquet")]
        for file_name in parquet_names:
            documents_list.extend(r.read_file(file_name))
            # Iterate over every document in the file
            for document in documents_list:
                # parse the document
                parsed_document = p.parse_doc(document)
                number_of_documents += 1
                # index the document data
                indexer.add_new_doc(parsed_document)
            # The file is done; release its documents before reading the next one.
            documents_list.clear()

    # Finished parsing and indexing all files - need to clean all the used memory
    indexer.cleanup(number_of_documents)
def run_engine(corpus_path, stemming, output_path):
    """
    Parse and index the corpus, saving postings every 100000 documents, then build,
    merge and save the inverted index.

    :param corpus_path: forwarded to the reader
    :param stemming: forwarded to the parser; also selects the 'global_table_<stemming>' object
    :param output_path: folder holding the posting file; created when missing
    :return: None
    """
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    m = BinaryMemoryPosting(os.path.join(output_path, PostingFile))
    indexer = Indexer()
    max_posting_size = 100000

    # Remove the outputs of a previous run.
    if os.path.exists(os.path.join(output_path, PostingFile)):
        os.remove(os.path.join(output_path, PostingFile))
    if os.path.exists(InvertedIndexFile + '.pkl'):
        os.remove(InvertedIndexFile + '.pkl')
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    # Iterate over every document in every batch yielded by the reader
    idx = 0
    for documents_list in r:
        if len(documents_list) == 0:
            # An empty batch would make the per-document step below a division
            # by zero. Non-empty batches advance the bar by 1 in total, so an
            # empty one is advanced by the same amount.
            r.progressbar.update(1)
            continue
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = p.parse_doc(document)
            # index the document data
            indexer.add_new_doc(parsed_list, idx, document[0])
            idx += 1
            if idx % max_posting_size == 0:
                m.Save(p.word_dict)
            r.progressbar.update(step)
    r.progressbar.close()

    # Save what accumulated since the last periodic save.
    m.Save(p.word_dict)
    global_table = utils.load_obj(f'global_table_{stemming}')
    inv_index = indexer.CreatInvertedIndex(p.word_dict, idx, global_table)
    m.Merge(inv_index)
    utils.save_obj(inv_index, InvertedIndexFile)
def run_engine(config):
    """
    Parse and index the corpus into a stemming-specific output folder, merge the
    per-letter files and save the inverted index.

    :param config: configuration object; ``toStem``, ``savedFileMainFolder`` and
        ``corpusPath`` are read here
    :return: None
    """
    number_of_documents = 0

    # Output goes to <main folder>\WithStem or <main folder>\WithoutStem.
    sub_folder = "\\WithStem" if config.toStem else "\\WithoutStem"
    out = config.savedFileMainFolder + sub_folder
    if not os.path.exists(out):
        os.makedirs(out)
    out += '\\'

    r = ReadFile(config.corpusPath)
    p = Parse(config.toStem)
    indexer = Indexer(config, out)

    for documents_list in r:
        for document in documents_list:
            # parse the document
            parsed_document = p.parse_doc(document)
            number_of_documents += 1
            # NOTE(review): the running total over all batches is compared with
            # the size of the current batch minus one - confirm this is the
            # intended end-of-corpus test.
            end_of_corpus = bool(r.queue.empty()
                                 and number_of_documents == len(documents_list) - 1)
            # index the document data
            indexer.add_new_doc(parsed_document, end_of_corpus)

    # Merge the numbered files of every letter and delete each one once merged.
    for letter in indexer.ABC_dict:
        for file_number in range(1, (indexer.counter_dict_files[letter]) + 1):
            part_name = letter + str(file_number)
            indexer.merge_files(indexer.out, letter, part_name)
            os.remove(out + part_name + ".pkl")

    p.remove_uppercase_and_entities(indexer)
    indexer.sort_tweet_ids()
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
def run_engine():
    """
    Parse (in a 12-process pool) and index every '.parquet' file below Data/Data,
    printing timings, then save the inverted index and the posting dictionary.

    :return: None
    """
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    # p = Parse(with_stemmer=True)
    indexer = Indexer(config)

    data_dir = 'Data' + os.sep + 'Data'
    npy_dirs = [root for root, dirs, files in os.walk(data_dir)]
    for dir_path in npy_dirs:
        files = [
            os.path.join(dir_path, fname)
            for fname in os.listdir(dir_path)
            if fname.endswith('.parquet')
        ]
        for file in files:
            tweets = r.read_file(file_name=file)

            start_time = time.perf_counter()
            # The context manager shuts the worker processes down once the
            # file is parsed; previously a new pool was leaked for every file.
            with multiprocessing.Pool(12) as pool:
                documents_list = pool.map(p.parse_doc, tweets)
                end_time = time.perf_counter()
            # Guard the average against a file without tweets.
            avg_time_per_tweet = (end_time - start_time) / len(tweets) if len(tweets) else 0.0
            print(
                f'Parsed {len(tweets)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds, average per tweet: {avg_time_per_tweet:0.8f} seconds'
            )

            start_time = time.perf_counter()
            for parsed_document in documents_list:
                indexer.add_new_doc(parsed_document)
            end_time = time.perf_counter()
            print(
                f'Indexing {len(documents_list)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds'
            )

    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
def run_engine():
    """
    Parse and index the documents of 'sample3.parquet', then save the inverted
    index as "inverted_idx" and the posting dictionary as "posting".

    :return: None
    """
    number_of_documents = 0

    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    # Iterate over every document in the file
    for document in r.read_file(file_name='sample3.parquet'):
        # parse the document, count it, then index the document data
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        indexer.add_new_doc(parsed_document)

    print('Finished parsing and indexing. Starting to export files')
    for payload, target_name in ((indexer.inverted_idx, "inverted_idx"),
                                 (indexer.postingDict, "posting")):
        utils.save_obj(payload, target_name)
def run_engine(config):
    """
    Parse and index the whole corpus, sort/merge the index, compute idf values and
    save the average document length, the inverted index and the documents index.

    :param config: configuration object; passed to the parser and the indexer,
        ``get__corpusPath()`` and ``get_savedFileMainFolder()`` are read here
    :return: None
    """
    parser = Parse(config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    indexer = Indexer(config)

    number_of_files = 0
    for corpus_file in r.read_corpus():
        number_of_files += 1
        # Iterate over every document in the file
        for document in corpus_file:
            # parse the document and index it
            indexer.add_new_doc(parser.parse_doc(document))

    indexer.check_last()
    indexer.merge_sort_parallel(3)
    indexer.calculate_idf(parser.number_of_documents)

    # Totals are the ones accumulated on the parser.
    avg_doc_len = parser.total_len_docs / parser.number_of_documents

    utils.save_obj(avg_doc_len,
                   config.get_savedFileMainFolder() + "\\data")
    utils.save_obj(indexer.inverted_idx,
                   config.get_savedFileMainFolder() + "\\inverted_idx")
    utils.save_obj(indexer.docs_inverted,
                   config.get_savedFileMainFolder() + "\\docs_inverted")
class SearchEngine:
    """Search engine built from a parser, an indexer and a GlobalMethod model."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = GlobalMethod()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        for document in documents_list:
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            # index the document data; documents the parser rejects are skipped
            if parsed_document is None:
                continue
            self._indexer.add_new_doc(parsed_document)
        if len(self._indexer.inverted_idx) > 100000:
            self._indexer.sort_100K_inverted_index()
        self._indexer.add_idf_to_dictionary()
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        # The indexer receives the name without the ".pkl" extension. Only a
        # real suffix is removed: a name that merely contains ".pkl" somewhere
        # else must not lose its last four characters.
        if fn.endswith(".pkl"):
            fn = fn[:-4]
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        # self._model = KeyedVectors.load_word2vec_format('glove.twitter.27B.25d.txt.word2vec', binary=False)
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relavant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

# def main():
#     config = ConfigClass()
#     se = SearchEngine(config=config)
#     r = ReadFile(corpus_path=config.get__corpusPath())
#     # parquet_file_path =r.get_all_path_of_parquet()[0][0]+r.get_all_path_of_parquet()[0][1]
#     # se.build_index_from_parquet(parquet_file_path)
#     se.load_index('idx_bench')
#     query = "trump want to change the world"
#     num,list = se.search(query)
#     # for key in dictionary.keys():
#     #     print('tweet id: {}, score (unique common words with query): {}'.format(key[0], dictionary[key]))
class SearchEngine:
    """Search engine whose index is pruned by ``clean`` before idf values are computed."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        # self._parser = Parse()
        self._parser = Parse(self._config)
        self._indexer = Indexer(self._config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Afterwards the index is cleaned, idf values are calculated and the
        index is saved as "idx_bench.pkl".
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        rows = pd.read_parquet(fn, engine="pyarrow").values.tolist()
        number_of_documents = 0
        # Parse each row and hand the result straight to the indexer.
        for row in rows:
            parsed_row = self._parser.parse_doc(row)
            number_of_documents += 1
            self._indexer.add_new_doc(parsed_row)

        self.clean()
        self._indexer.calculate_idf(self._parser.number_of_documents)
        self._indexer.save_index("idx_bench.pkl")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relavant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        searcher.set_method_type('2')
        return searcher.search(query)

    def clean(self):
        """
        Prune the term index in place.

        First the top 0.08% of terms, ranked by the first element of their
        entry, are removed; then every remaining term whose first element is
        at most 1 is removed.
        """
        term_index = self._indexer.inverted_idx_term
        top_fraction = 0.0008
        how_many_top = round(top_fraction * len(term_index))

        ranked = sorted(term_index.items(), key=lambda item: item[1][0], reverse=True)
        for term, _entry in ranked[:how_many_top]:
            del term_index[term]

        # TODO - make statistics
        rare_terms = [term for term, entry in term_index.items() if entry[0] <= 1]
        for term in rare_terms:
            del term_index[term]
class SearchEngine:
    """Search engine that hands a thesaurus method object to the searcher."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self._method = thesaurus_method()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        The rebuilt index structures are saved together as 'idx_bench' and the
        indexer's references to them are then set to None.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        rows = pd.read_parquet(fn, engine="pyarrow").values.tolist()
        total_rows = len(rows)
        number_of_documents = 0
        # Every document is indexed together with the size of the whole file.
        for row in rows:
            parsed_row = self._parser.parse_doc(row)
            number_of_documents += 1
            self._indexer.add_new_doc(parsed_row, total_rows)

        # self._indexer.rebuild_postingDict()
        self._indexer.rebuild_inverted_index()

        indexer = self._indexer
        to_save = (indexer.inverted_idx, indexer.tweet_dict, indexer.reversed_inverted_index)
        utils.save_obj(to_save, 'idx_bench')

        # TODO: inverted_idx, tweet_dict,reversed_inverted_index, to_save change to None
        indexer.inverted_idx = None
        indexer.tweet_dict = None
        indexer.reversed_inverted_index = None
        to_save = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relavant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model, method=self._method)
        results = searcher.search(query)
        return results
class SearchEngine:
    """Search engine that expands the query with WordNet lemmas before searching."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        r = ReadFile()
        documents_list = r.read_file(fn)
        # Iterate over every document in the file
        for document in documents_list:
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # self._indexer.save_index('idx_bench.pkl')
        # self._indexer.save_index('inverted_idx.pkl')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.

        Before searching, the parsed query is extended with WordNet lemma
        names taken from synsets whose Wu-Palmer similarity to the term's
        first synset is above 0.8 (only the three best-scored names are
        added when more than three were collected).
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relavant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        query_as_list = self._parser.parse_sentence(query)

        add_to_query = {}
        for q in query_as_list:
            # Look the term up once instead of once per lemma.
            synsets = wordnet.synsets(q)
            if not synsets:
                continue
            first_synset = synsets[0]
            lowered = q.lower()
            for syn in synsets:
                # The score depends only on the synset, so it is computed once
                # per synset rather than once per lemma.
                score = first_synset.wup_similarity(syn)
                if score is None or score <= 0.8:
                    continue
                for lemma in syn.lemmas():
                    lemma_name = lemma.name()
                    if lemma_name == lowered:
                        continue
                    add_to_query[lemma_name] = score

        if len(add_to_query) > 3:
            # Keep the three best-scored expansions only.
            ranked = sorted(add_to_query.items(), key=lambda item: item[1], reverse=True)
            query_as_list.extend(name for name, _score in ranked[:3])
        else:
            query_as_list.extend(add_to_query)

        new_query = ' '.join(query_as_list)
        relevant_docs = searcher.search(new_query)
        return relevant_docs

    @property
    def indexer(self):
        """The indexer object held by this engine."""
        return self._indexer
class SearchEngine:
    """Search engine that hands a GloVe word-vector dictionary to its indexer and searcher."""

    GLOVE_PATH_SERVER = '../../../../glove.twitter.27B.25d.txt'
    # Raw string: the value is unchanged, but '\m' is no longer an invalid escape.
    GLOVE_PATH_LOCAL = r'.\model/model.txt'

    def __init__(self, config=None):
        # NOTE(review): config defaults to None but is dereferenced right away,
        # so a config object is effectively required.
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = self.initialize_glove_dict()
        self._indexer.set_glove_dict(self.model)

    def initialize_glove_dict(self):
        """
        Read GLOVE_PATH_LOCAL into a dict mapping each word to its vector.

        Each non-blank line is split on whitespace: the first field is the
        word, the remaining fields become a float32 numpy array.
        :return: dict of word -> numpy array
        """
        glove_dict = {}
        with open(self.GLOVE_PATH_LOCAL, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                values = line.split()
                if not values:
                    # Blank line (e.g. trailing newline): nothing to index.
                    continue
                glove_dict[values[0]] = np.asarray(values[1:], "float32")
        return glove_dict

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Documents for which the parser returns None are skipped. Afterwards
        the result of the indexer's fix_inverted_index() is pickled under the
        name 'idx_engine1' in the configured output path.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Passing the list itself lets tqdm know the total.
        for document in tqdm(documents_list):
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            self._indexer.add_new_doc(parsed_document)

        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine1',
                                self._config.get_out_path())
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        No-op here: the GloVe dictionary is loaded in __init__.
        """
        pass

    def load_index(self, fn):
        """Delegate to the indexer's load_index and return its result."""
        return self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of search results and a list built
            from element 1 of every result returned by the searcher.
        """
        # NOTE(review): the index is re-read from 'idx_engine1.pkl' on every
        # query; consider loading it once if that file cannot change.
        self._indexer.inverted_idx, self._indexer.document_dict = \
            self.load_index('idx_engine1.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        # NOTE(review): only element 0 of the parse result is searched -
        # confirm what parse_sentence returns.
        l_res = searcher.search(query_as_list[0])
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
class SearchEngine:
    """Baseline engine that prunes single-occurrence entries after indexing."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Documents parsed to an empty dict are skipped. After indexing,
        inverted-index entries whose value equals 1 and posting entries of
        length 1 are dropped.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        rows = pd.read_parquet(fn, engine="pyarrow").values.tolist()
        for row in rows:
            parsed = self._parser.parse_doc(row)
            if parsed == {}:  # RT
                continue
            self._indexer.add_new_doc(parsed)

        indexer = self._indexer
        # Keep only entries whose stored value is not 1.
        indexer.inverted_idx = dict(
            (term, value)
            for term, value in indexer.inverted_idx.items()
            if value != 1
        )
        # Keep only posting entries that do not have exactly one element.
        indexer.postingDict = dict(
            (term, postings)
            for term, postings in indexer.postingDict.items()
            if len(postings) != 1
        )
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This engine uses no model, so nothing is loaded.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index.
        Input:
            query - string.
        Output:
            Whatever Searcher.basic_search returns for the query.
        """
        return Searcher(
            self._parser, self._indexer, model=self._model
        ).basic_search(query)
class SearchEngine:
    """Search engine that also collects per-query evaluation metrics."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        # One entry is appended to each list per evaluated query.
        self.map_list = []
        self.prec5_list = []
        self.prec10_list = []
        self.prec50_list = []
        self.prec_total_list = []
        self.recall_list = []

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        print("\nNow Starting search engine 3")
        df = pd.read_parquet(fn, engine="pyarrow")
        for document in df.values.tolist():
            parsed_document = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_document)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This engine uses no model, so nothing is loaded.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index.
        Input:
            query - string.
        Output:
            Whatever Searcher.search returns for the query.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def run_engine_two(self, fn):
        """
        Build the index from `fn`, run every "information_need" query found in
        data\\queries_train.tsv and print the quality metrics of each one,
        followed by the average MAP.
        :param fn: path to the parquet file to index
        """
        self.build_index_from_parquet(fn)
        queries_path = "data\\queries_train.tsv"
        all_queries = SearchEngine.query_reader(
            queries_path)["information_need"]
        for i, q in enumerate(all_queries):
            print(q)
            k, docs = self.search(q)
            # Only the first 300 results are evaluated; query numbers are 1-based.
            self.check_engine_quality(i + 1, docs[:300])
            print()
        if self.map_list:
            print("Avg map is :", (sum(self.map_list) / len(self.map_list)))
        else:
            # No query was evaluated; avoid dividing by zero.
            print("Avg map is : undefined (no queries were evaluated)")

    @staticmethod
    def query_reader(queries_path):
        """Read a tab-separated queries file into a DataFrame."""
        return pd.read_csv(queries_path, sep="\t")

    def get_parser(self):
        """Return the parser owned by this engine."""
        return self._parser

    def check_engine_quality(self, query_num, list_of_docs):
        """
        Compute, store and print precision@5/10/50, precision, recall and MAP
        for one query against data\\benchmark_lbls_train.csv.

        Documents without a benchmark label for this query are removed from
        `list_of_docs` in place before the metrics are computed.
        :param query_num: query number as used in the benchmark 'query' column
        :param list_of_docs: ranked list of retrieved tweet ids (mutated)
        :return: no return. prints metrics of the query. precision, recall, map.
        """
        benchmark_path = "data\\benchmark_lbls_train.csv"
        df = pd.read_csv(benchmark_path)
        df_rec = df[df['query'] == query_num]
        df_prec = df_rec[df_rec['tweet'].isin(list_of_docs)]
        dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict()

        # Single pass: keep only labelled docs, preserving the ranking order.
        labelled_docs = []
        ranking = []
        for doc in list_of_docs:
            try:
                label = dict_for_data[int(doc)]
            except (KeyError, ValueError, TypeError):
                # Not labelled for this query, or not convertible to an int id.
                continue
            labelled_docs.append(doc)
            ranking.append(label)
        # Callers may rely on the in-place pruning of the original code.
        list_of_docs[:] = labelled_docs

        data_df = pd.DataFrame({
            'query': query_num,
            'tweet': list_of_docs,
            'y_true': ranking
        })
        # Number of benchmark rows labelled relevant for this query.
        recall_total = len(df_rec[df_rec['y_true'] == 1.0])

        prec5 = metrics.precision_at_n(data_df, query_num, 5)
        prec10 = metrics.precision_at_n(data_df, query_num, 10)
        prec50 = metrics.precision_at_n(data_df, query_num, 50)
        prec_total = metrics.precision(data_df, True, query_number=query_num)
        map_of_query = metrics.map(data_df)
        recall_val = metrics.recall_single(data_df, recall_total, query_num)

        self.map_list.append(map_of_query)
        self.prec5_list.append(prec5)
        self.prec10_list.append(prec10)
        self.prec50_list.append(prec50)
        self.prec_total_list.append(prec_total)
        self.recall_list.append(recall_val)

        print()
        print("precision at 5 of query", query_num, "is :", prec5)
        print("precision at 10 of query", query_num, "is :", prec10)
        print("precision at 50 of query", query_num, "is :", prec50)
        print("precision of query", query_num, "is :", prec_total)
        print("recall of query", query_num, "is :", recall_val)
        print("map of query", query_num, "is :", map_of_query)
class SearchEngine:
    """Search engine that stores TF-IDF term weights and per-document squared-weight sums."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        if self._config:
            # Make sure the stemming / lemmatization switches exist on the config.
            if not hasattr(self._config, 'toStem'):
                self._config.toStem = False
            if not hasattr(self._config, 'toLemm'):
                self._config.toLemm = False
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.corpus_size = 0

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Afterwards the index is saved, the corpus size is read from the
        indexer and the document weights are calculated.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        for document in df.values.tolist():
            parsed_document = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_document)

        # NOTE(review): the index is saved before calculate_doc_weight() adds
        # the term weights; confirm the saved copy is meant to lack them.
        self._indexer.save_index(
            self._config.get_output_path())  # Save the inverted_index to disk
        self.corpus_size = self._indexer.get_docs_count()
        self.calculate_doc_weight()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This engine uses no model, so nothing is loaded.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index.
        Input:
            query - string.
        Output:
            Whatever Searcher.search returns for the query.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def calculate_doc_weight(self):
        """
        Calculate the TF-IDF weight of every (term, document) pair.

        For each term, idf = log2(corpus_size / df). For each document in the
        term's posting list the weight is element 0 of the posting entry times
        the idf. The weight is appended to the posting entry and its square is
        added to element 0 of the document's entry in the docs index.

        The accumulated sums are rounded to 3 decimals once, after all terms
        were processed, so that small contributions are not rounded away one
        addition at a time.
        :return: None; the indexer's structures are updated in place.
        """
        inverted_index = self._indexer.inverted_idx
        docs_index = self._indexer.get_docs_index()
        touched_docs = set()

        for word in inverted_index:
            entry = inverted_index[word]
            # The idf depends only on the term, so compute it once per term.
            term_idf = math.log2(self.corpus_size / entry['df'])
            postings = entry["posting_list"]
            for doc_id in self._indexer.get_term_posting_list(word):
                term_weight = postings[doc_id][0] * term_idf
                postings[doc_id].append(term_weight)
                docs_index[doc_id][0] += math.pow(term_weight, 2)
                touched_docs.add(doc_id)

        for doc_id in touched_docs:
            docs_index[doc_id][0] = round(docs_index[doc_id][0], 3)
def run_engine():
    """
    Read the whole corpus, parse and index every tweet, then persist the results.

    All folders are read before indexing starts because the indexer is given
    the total number of tweets together with every document.
    :return: the result of indexer.get__lda__()
    """
    corpus_path = config.get__corpusPath()
    r = ReadFile(corpus_path)
    indexer = Indexer(config)
    p = Parse(config)

    # One list of tweets per name in the reader's dates_list.
    r.create_files_name_list()
    tweets_by_folder = [r.read_file(date_name) for date_name in r.dates_list]
    total_tweets = sum(len(folder) for folder in tweets_by_folder)

    for folder in tweets_by_folder:
        for tweet in folder:
            indexer.add_new_doc(p.parse_doc(tweet), total_tweets)

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.tf_idf_dict, "tf_idf_dict")
    return indexer.get__lda__()
class SearchEngine:
    """Engine that reports the indexed document count to the indexer after indexing."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Once every row is indexed, the number of processed rows is passed to
        the indexer's sum_terms_per_docs.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        rows = pd.read_parquet(fn, engine="pyarrow").values.tolist()
        for row in rows:
            self._indexer.add_new_doc(self._parser.parse_doc(row))
        # Every row is indexed, so the document count equals the row count.
        self._indexer.sum_terms_per_docs(len(rows))

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This engine uses no model, so nothing is loaded.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index.
        Input:
            query - string.
        Output:
            Whatever Searcher.search returns for the query.
        """
        return Searcher(
            self._parser, self._indexer, model=self._model
        ).search(query)
class SearchEngine:
    """Slot-based search engine that builds its Searcher once, in __init__."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    # '_config' was listed twice; each attribute needs only one slot.
    __slots__ = ['_config', '_indexer', '_parser', '_model', 'searcher',
                 '_run_config']

    def __init__(self, config=None, run_config=None):
        if not config:
            config = ConfigClass()
        if not run_config:
            run_config = RunConfigClass()
        self._run_config = run_config
        self._config = config
        self._parser = Parse(run_config)
        self._indexer = Indexer(run_config)
        self._model = None
        self.searcher = Searcher(self._parser, self._indexer, run_config,
                                 model=self._model)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        for document in df.values:
            parsed_list = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_list)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        A trailing '.pkl' extension is removed before the name is handed to
        the indexer.
        Input:
            fn - file name of pickled index.
        """
        # str.strip('.pkl') removes any of the characters '.', 'p', 'k', 'l'
        # from both ends (e.g. 'lookup' -> 'ooku'); cut only the suffix.
        suffix = '.pkl'
        if fn.endswith(suffix):
            fn = fn[:-len(suffix)]
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This engine uses no model, so nothing is loaded.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index.
        Input:
            query - string.
        Output:
            Whatever the searcher returns for search(query, None, {1}).
        """
        return self.searcher.search(query, None, {1})
class SearchEngine:
    """Engine that persists its index as one tuple and can replay the training queries."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        When all rows are indexed, the indexer's add_square_Wij() is called
        and the index state is saved as a 5-tuple under the name "index_4".
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        rows = pd.read_parquet(fn, engine="pyarrow").values.tolist()
        for row in rows:
            self._indexer.add_new_doc(self._parser.parse_doc(row))

        indexer = self._indexer
        indexer.add_square_Wij()
        # The order must match the positions read back in load_index.
        snapshot = (
            indexer.inverted_idx,
            indexer.postingDict,
            indexer.num_of_docs,
            indexer.avg_Size_doc,
            indexer.doc_info,
        )
        utils.save_obj(snapshot, "index_4")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        The loaded object is indexed by position and copied onto the indexer.
        Input:
            fn - file name of pickled index.
        """
        obj = utils.load_obj(fn)
        attribute_order = ("inverted_idx", "postingDict", "num_of_docs",
                           "avg_Size_doc", "doc_info")
        for position, attribute in enumerate(attribute_order):
            setattr(self._indexer, attribute, obj[position])

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This engine uses no model, so nothing is loaded.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=2000):
        """
        Executes a query over an existing index.
        Input:
            query - string.
            k - forwarded to Searcher.search as its second argument.
        Output:
            Whatever Searcher.search returns for (query, k).
        """
        return Searcher(self._parser, self._indexer).search(query, k)

    def main(self, output_path, stemming, query_to_check, num_docs_to_retrieve):
        """
        Index the training parquet file, then run the 'keywords' of every row
        of data/queries_train.tsv and print the returned tweet ids together
        with the time each query took.

        Returns early (after indexing) when `query_to_check` is neither a
        list nor a str.

        NOTE(review): whatever is derived from `query_to_check`, and the
        extended `output_path`, is never used afterwards - the queries always
        come from the TSV file. Confirm whether that is intended.
        """
        self.build_index_from_parquet("data/benchmark_data_train.snappy.parquet")

        if isinstance(query_to_check, str):
            if query_to_check.endswith(".txt"):
                try:
                    # The file is read but its content is not used later on.
                    with open(query_to_check, "r", encoding="utf-8") as query_file:
                        query_file.readlines()
                except FileNotFoundError as e:
                    print(e)
        elif not isinstance(query_to_check, list):
            return

        # Not used below; kept so the method behaves as before.
        output_path = output_path + ("/WithStem" if stemming else "/WithoutStem")

        queries = pd.read_csv(os.path.join('data', 'queries_train.tsv'), sep='\t')
        for _, row in queries.iterrows():
            q_id = row['query_id']  # read but otherwise unused
            q_keywords = row['keywords']
            start = time.time()
            # Element 1 of the search result is the ranked list of tweet ids.
            ranked_ids = self.search(q_keywords, num_docs_to_retrieve)[1]
            for doc_tuple in ranked_ids:
                print('tweet id: {}'.format(doc_tuple))
            print("time that toke to retrieve :" + str(time.time() - start))
class SearchEngine:
    """Engine that enriches the saved index dict with word2vec-related data."""

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        The index is saved as "idx.pkl", loaded back as "idx", extended with
        the flags of the enabled retrieval methods and the result of
        compute_Wi, and saved again as "idx".
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        rows = pd.read_parquet(fn, engine="pyarrow").values.tolist()

        # tweet id -> list of the keys of that tweet's term dictionary
        tweet_dic = {}
        for row in rows:
            parsed = self._parser.parse_doc(row)
            tweet_dic[parsed.tweet_id] = list(parsed.term_doc_dictionary.keys())
            self._indexer.add_new_doc(parsed)
        print('Finished parsing and indexing.')

        self._indexer.save_index("idx.pkl")  # TODO - we need submit this
        indexer_dic = utils.load_obj("idx")  # TODO - we need submit this

        # Hard-coded switches for the retrieval methods; only word2vec is on.
        use_local = False
        use_word2vec = True
        use_global = False
        use_wordnet = False
        use_spell_checker = False

        if use_local:
            indexer_dic["local"] = True
        if use_word2vec:
            indexer_dic["word2vec"] = True
            indexer_dic["tweet_dic"] = tweet_dic
        if use_wordnet:
            indexer_dic["wordnet"] = True
        if use_spell_checker:
            indexer_dic["spellChecker"] = True

        if use_global:
            docs_dic, Sij_dic = compute_Wi(indexer_dic, use_global)
            indexer_dic["docs"] = docs_dic
            indexer_dic["global"] = Sij_dic
        else:
            indexer_dic["docs"] = compute_Wi(indexer_dic)

        utils.save_obj(indexer_dic, "idx")  # TODO - we need submit this

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Load the binary word2vec file 'trained_Word2Vec' from `model_dir`
        into self._model and call set_download_model(False) on the config.

        NOTE(review): `model_dir` defaults to None, but os.path.join needs a
        real path, and a config object is required - confirm callers always
        supply both.
        """
        model_path = os.path.join(model_dir, 'trained_Word2Vec')
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            model_path,
            binary=True,
            encoding='utf-8',
            unicode_errors='ignore',
        )
        self._config.set_download_model(False)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index.
        Input:
            query - string.
        Output:
            Whatever Searcher.search returns for the query.
        """
        return Searcher(
            self._parser, self._indexer, model=self._model
        ).search(query)