def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None): """ This function build the inverted index over the corpus. send each tweet to parsing and indexing. if the stemming is True the parsing will use the stemmer on the tokens. :param glove_dict: Glove file including all word vectors :param corpus_path: root folder containing the raw tweet files :param output_path for the inverted index, posting files and tweets dictionary :param stemming if True use stemmer on terms """ config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2, output_path=output_path) r = ReadFile(corpus_path=config.get_corpusPath()) p = Parse(stemming) indexer = Indexer(config) all_files_paths = glob.glob(config.get_corpusPath() + "\\*\\*.snappy.parquet") all_files_names = [file_name[file_name.find("\\") + 1:] for file_name in all_files_paths] start_time = time.time() file_counter = 0 for file_name in all_files_names: file_start_time = time.time() # print("start file :", file_counter) documents_list = [document for document in r.read_file(file_name=file_name)] # Iterate over every document in the file for idx, document in enumerate(documents_list): parsed_document = p.parse_doc(document) indexer.add_new_doc(parsed_document, glove_dict) # print("end file number ", file_counter, " in: ", time.time() - file_start_time) file_counter += 1 total_time = time.time() - start_time indexer.finish_indexing()
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Builds the inverted index over the corpus and writes the leftover postings to disk.
    """
    number_of_documents = 0
    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)
    # Iterate over every document in the file
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of docs in the system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)
    print('--------------------------')
    print('Start writing leftovers to disk')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End waiting')
    print('Finished writing leftovers to disk')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Total time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
def run_engine(config):
    """
    Builds the inverted index over every parquet file found under the corpus path.
    """
    number_of_documents = 0
    output_path = config.savedFileMainFolder
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    m_Indexer = Indexer(output_path)
    parquetPaths = []
    for (dirPath, dirNames, fileNames) in os.walk(config.get__corpusPath()):
        for fileName in fileNames:
            parquetPaths.append(dirPath + '\\' + fileName)
    for i in range(len(parquetPaths)):
        parquetPaths[i] = parquetPaths[i][parquetPaths[i].find('\\') + 1:]
        if ".DS_Store" in parquetPaths[i]:
            continue
        parquet = r.read_file(file_name=parquetPaths[i])
        for document in parquet:
            number_of_documents += 1
            parsed_document = p.parse_doc(document)
            # index the document data
            m_Indexer.add_new_doc(parsed_document)
    # If there are more postings to flush, do it.
    if len(m_Indexer.postingDictionary) > 0:
        utils.save_obj(m_Indexer.postingDictionary,
                       m_Indexer.postingsPath + '/' + str(m_Indexer.pkl_key))
    # Clear single terms and entities, then write the updated inverted index to disk.
    clearSingleEntities(m_Indexer.inverted_idx, p, output_path, m_Indexer.num_of_docs_in_corpus)
    utils.save_obj(m_Indexer.inverted_idx, output_path + '/inverted_idx')
    m_Indexer.inverted_idx.clear()
    utils.save_obj(number_of_documents, output_path + '/PostingFiles/num_of_docs_in_corpus')
def run_engine(config):
    """
    :param config: engine configuration
    """
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config)
    parquet_list = r.read_all_parquet()
    for doc_list in parquet_list:
        # for i in tqdm(range(0, len(doc_list))):  # for every doc
        for i in range(0, len(doc_list)):  # for every doc
            # parse the document
            parsed_document = p.parse_doc(doc_list[i])
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
    # print('Finished parsing and indexing. Starting to export files')
    indexer.save_postings()  # saves the remaining posting file
    PostingsMerge(indexer).chunks_merging()
    utils.save_dict_as_pickle(indexer.inverted_idx, "inverted_idx", config.get_out_path())
def task(self, queue, document_list):
    parser = Parse()
    indexer = Indexer(self._config)
    for idx, document in enumerate(document_list):
        # parse the document
        parsed_document = parser.parse_doc(document)
        # index the document data
        indexer.add_new_doc(parsed_document)
    queue.put(indexer.get_inverted_index())
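
# A minimal sketch of how a worker like task() might be dispatched with
# multiprocessing, assuming the documents are pre-split into chunks and that
# the partial inverted indexes can be merged dict-wise; index_in_parallel and
# the merge step are illustrative assumptions, not part of the original code.
from multiprocessing import Process, Queue

def index_in_parallel(engine, chunks):
    queue = Queue()
    workers = [Process(target=engine.task, args=(queue, chunk)) for chunk in chunks]
    for w in workers:
        w.start()
    merged = {}
    for _ in workers:
        partial = queue.get()  # drain before join() to avoid a full-queue deadlock
        for term, postings in partial.items():
            merged.setdefault(term, []).extend(postings)
    for w in workers:
        w.join()
    return merged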
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve, word2vec):
    """
    Builds the inverted index, flushing the buffer to disk every 500,000 documents.
    """
    # print("start: ", time.asctime(time.localtime(time.time())))
    number_of_documents = 0
    num_of_writes = 1
    config = ConfigClass(corpus_path)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, word2vec)
    # documents_list = r.read_file(file_name='covid19_07-30.snappy.parquet')
    # TODO - handle all ~50 files (can be done with multiprocessing.pool.ThreadPool)
    # Iterate over every document in the file
    counter = 0
    names = r.get_files_names_in_dir()
    for name in names:
        documents_list = r.read_file_by_name(file_name=str(name))
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)  # parse the document
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1
            indexer.add_new_doc(parsed_document, num_of_writes)  # index the document data
            counter += 1
            if counter >= 500000:
                write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
                counter = 0
                # print("finish parser & index number: ", num_of_writes, " At: ",
                #       time.asctime(time.localtime(time.time())))
                num_of_writes += 1
    # print('Finished parsing and indexing. Starting to export files')
    write_and_clean_buffer(indexer, num_of_writes, stemming, config, output_path)
    # print("finish parser & index: ", time.asctime(time.localtime(time.time())))
    # drop terms that appear only once in the corpus
    indexer.inverted_idx = {key: val for key, val in indexer.inverted_idx.items() if val != 1}
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    # print("finish save index: ", time.asctime(time.localtime(time.time())))
    return num_of_writes
def test(folder_list, counter, indexer, number_of_documents):
    print(counter)
    cr = datetime.now()
    print(cr)
    p = Parse(config)  # config is expected to be a module-level global here
    # Iterate over every tweet in the folder
    for idx, tweet in enumerate(folder_list):
        # parse the tweet
        parsed_document = p.parse_doc(tweet)
        number_of_documents += 1
        # index the tweet data
        indexer.add_new_doc(parsed_document)
    print("number of tweets", number_of_documents)
    cn = datetime.now()
    print(cn)
    counter += 1
def run_engine(config, indexer):
    """
    Builds the index over the benchmark training file.
    """
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config)
    doc = r.read_file('benchmark_data_train.snappy.parquet')
    for document in doc:
        parsed_document = p.parse_doc(document)
        indexer.add_new_doc(parsed_document)
        number_of_documents += 1
    capital_letters = p.caps_dict
    indexer.change_inverted_by_caps(capital_letters)
    indexer.save_index('idx_bench')
def run_engine(corpus_path_, output_path_, stemming_):
    """
    Builds the index in batches of 200,000 documents, writing a posting file per
    batch and merging the posting files at the end.
    """
    number_of_documents = 0
    config = ConfigClass(corpuspath=corpus_path_, outputpath=output_path_, stemming=stemming_)
    config.corpusPath = corpus_path_
    config.savedFileMainFolder = output_path_
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    paths = r.get_all_path_of_parquet()
    length_of_array = len(paths)
    iteration = 0
    is_stemmer = config.toStem
    parsed_doc_list = list()
    for i in range(0, length_of_array):
        documents_list = r.get_documents(paths[i][0], paths[i][1])
        for doc, j in zip(documents_list, range(len(documents_list))):
            parsed_document = p.parse_doc(doc, stemmer=is_stemmer)
            if parsed_document is None:
                continue
            parsed_doc_list.append(parsed_document)
            number_of_documents += 1
            if number_of_documents % 200000 == 0:
                for parsed_doc in parsed_doc_list:
                    indexer.add_new_doc(parsed_doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                iteration += 1
                parsed_doc_list = list()
            elif j == len(documents_list) - 1 and i == length_of_array - 1:
                for parsed_doc in parsed_doc_list:
                    indexer.add_new_doc(parsed_doc)
                indexer.write_posting_to_txt_file_lower_upper(iteration)
                parsed_doc_list = list()
    indexer.merge_posting_file()
    indexer.merge_two_last_posting_file()
    indexer.split_posting_file_and_create_inverted_index()
    indexer.write_inverted_index_to_txt_file()
    number_of_documents = 0
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    Builds the index over the whole corpus and merges the posting files.
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming
    corpus_list = r.read_corpus()
    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx], read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
        documents_list = []
    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)
    with open("docs_dict_and_extras", "wb") as pickle_out:
        pickle.dump(indexer.docs_dict, pickle_out)
    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))
    utils.save_obj(indexer.inverted_idx, "inverted_index")
    with open("docs_dict_and_extras", "ab") as pickle_out:
        pickle.dump(number_of_documents, pickle_out)
        pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
        pickle.dump(indexer.dump_path, pickle_out)
def create_table(stemming, corpus):
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    mydb = myclient["mydatabase"]
    mycol = mydb["global"]
    mycol.drop()
    r = ReadFile(corpus)
    p = Parse(stemming)
    counter = 0
    for documents_list in r:
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = [t.text.lower() for t in p.parse_doc(document) if '$' not in t.text]
            # accumulate co-occurrence counts for every term pair in the document
            for word_1 in parsed_list:
                query = {'term': word_1}
                row = mycol.find_one(query)
                if not row:
                    mycol.insert_one({**query, 'terms': {}})
                    row = mycol.find_one(query)
                for word_2 in parsed_list:
                    if word_2 not in row['terms'].keys():
                        row['terms'][word_2] = 0
                    row['terms'][word_2] += 1
                try:
                    mycol.update_one(query, {"$set": {'terms': row['terms']}})
                except Exception:
                    print(row['terms'])
            r.progressbar.update(step)
            counter += 1
    global_table = {}
    for word_1 in mycol.find():
        top = []
        for word_2 in word_1['terms'].keys():
            # association score: co-occurrence count normalized by both self-counts
            s = word_1['terms'][word_2] / (
                word_1['terms'][word_1['term']]
                + mycol.find_one({'term': word_2})['terms'][word_2]
                - word_1['terms'][word_2])
            if len(top) < 10:
                top.append((word_2, s))
                top.sort(key=lambda score: score[1])
            elif s > top[0][1]:
                top[0] = (word_2, s)
                top.sort(key=lambda score: score[1])
        global_table[word_1['term']] = top
    utils.save_obj(global_table, f'global_table_{stemming}')
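
# The score above is the Jaccard-style association measure commonly used for
# global query expansion: s(w1, w2) = c12 / (c11 + c22 - c12), where c11 and
# c22 are each term's own occurrence counts and c12 the co-occurrence count.
# A quick hedged check with made-up counts:
#   c11 = 40, c22 = 25, c12 = 10  ->  s = 10 / (40 + 25 - 10) = 10 / 55 ≈ 0.18
# so terms that almost always appear together score close to 1, and terms that
# rarely co-occur score close to 0.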
def run_engine(config, glove_dict=None):
    """
    Builds the index and records corpus statistics (document count and average length).
    """
    number_of_documents = 0
    sum_of_doc_lengths = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config, glove_dict)
    # documents_list = r.read_file(file_name=config.get__corpusPath())
    parquet_documents_list = r.read_folder(config.get__corpusPath())
    for parquet_file in parquet_documents_list:
        documents_list = r.read_file(file_name=parquet_file)
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            # index the document data
            indexer.add_new_doc(parsed_document)
    # save the last posting file after the indexer has finished adding documents
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()
    indexer.delete_dict_after_saving()
    # merge posting files
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())
    details = {'number_of_documents': number_of_documents,
               'avg_length_per_doc': sum_of_doc_lengths / number_of_documents}
    utils.save_dict(details, 'details', config.get_out_path())
def run_engine(corpus_path=None, output_path=None, stemming=False, lemma=False,
               queries=None, num_docs_to_retrieve=None):
    """
    Builds the index over every parquet file found under corpus_path.
    """
    global config, number_of_documents
    number_of_documents = 0
    config = ConfigClass()
    config.corpusPath = corpus_path
    config.set_output_path(output_path)
    config.toStem = stemming
    config.toLemm = lemma
    if os.path.exists(config.get_output_path()):
        shutil.rmtree(config.get_output_path())
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem, config.toLemm)
    indexer = Indexer(config)
    documents_list = []
    for root, dirs, files in os.walk(corpus_path):
        r.set_corpus_path(root)
        for file in files:
            if file.endswith(".parquet"):
                documents_list += r.read_file(file)
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    documents_list.clear()  # finished parsing and indexing all files - free the used memory
    indexer.cleanup(number_of_documents)
def run_engine(config):
    """
    Builds the index, writing output under a WithStem/WithoutStem subfolder
    depending on the stemming configuration.
    """
    number_of_documents = 0
    if config.toStem:
        if not os.path.exists(config.savedFileMainFolder + "\\WithStem"):
            os.makedirs(config.savedFileMainFolder + "\\WithStem")
        out = config.savedFileMainFolder + "\\WithStem"
    else:
        if not os.path.exists(config.savedFileMainFolder + "\\WithoutStem"):
            os.makedirs(config.savedFileMainFolder + "\\WithoutStem")
        out = config.savedFileMainFolder + "\\WithoutStem"
    out += '\\'
    r = ReadFile(config.corpusPath)
    p = Parse(config.toStem)
    indexer = Indexer(config, out)
    end_of_corpus = False
    for documents_list in r:
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            number_of_documents += 1
            if r.queue.empty() and number_of_documents == len(documents_list) - 1:
                end_of_corpus = True
            # index the document data
            indexer.add_new_doc(parsed_document, end_of_corpus)
        if end_of_corpus:
            end_of_corpus = False
    for letter in indexer.ABC_dict:
        for idx in range(1, (indexer.counter_dict_files[letter]) + 1):
            indexer.merge_files(indexer.out, letter, letter + str(idx))
            os.remove(out + letter + str(idx) + ".pkl")
    p.remove_uppercase_and_entities(indexer)
    indexer.sort_tweet_ids()
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
def run_engine(corpus_path, stemming, output_path):
    """
    Builds the index, spilling the in-memory postings to disk every 100,000 documents.
    """
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    m = BinaryMemoryPosting(os.path.join(output_path, PostingFile))
    indexer = Indexer()
    max_posting_size = 100000
    if os.path.exists(os.path.join(output_path, PostingFile)):
        os.remove(os.path.join(output_path, PostingFile))
    if os.path.exists(InvertedIndexFile + '.pkl'):
        os.remove(InvertedIndexFile + '.pkl')
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    # Iterate over every document in the file
    idx = 0
    for documents_list in r:
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = p.parse_doc(document)
            # index the document data
            indexer.add_new_doc(parsed_list, idx, document[0])
            idx += 1
            if idx % max_posting_size == 0:
                m.Save(p.word_dict)
            r.progressbar.update(step)
    r.progressbar.close()
    m.Save(p.word_dict)
    global_table = utils.load_obj(f'global_table_{stemming}')
    inv_index = indexer.CreatInvertedIndex(p.word_dict, idx, global_table)
    m.Merge(inv_index)
    utils.save_obj(inv_index, InvertedIndexFile)
def run_engine(config):
    """
    Builds the index, then merges and sorts the posting files in parallel.
    """
    parser = Parse(config)
    r = ReadFile(corpus_path=config.get__corpusPath())
    indexer = Indexer(config)
    number_of_files = 0
    for i, file in enumerate(r.read_corpus()):
        # Iterate over every document in the file
        number_of_files += 1
        for idx, document in enumerate(file):
            # parse the document
            parsed_document = parser.parse_doc(document)
            indexer.add_new_doc(parsed_document)
    indexer.check_last()
    indexer.merge_sort_parallel(3)
    indexer.calculate_idf(parser.number_of_documents)
    avg_doc_len = parser.total_len_docs / parser.number_of_documents
    utils.save_obj(avg_doc_len, config.get_savedFileMainFolder() + "\\data")
    utils.save_obj(indexer.inverted_idx, config.get_savedFileMainFolder() + "\\inverted_idx")
    utils.save_obj(indexer.docs_inverted, config.get_savedFileMainFolder() + "\\docs_inverted")
def run_engine():
    """
    Builds the index over a single sample parquet file.
    """
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    documents_list = r.read_file(file_name='sample3.parquet')
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        # self._parser = Parse()
        self._parser = Parse(self._config)
        self._indexer = Indexer(self._config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self.clean()
        self._indexer.calculate_idf(self._parser.number_of_documents)
        self._indexer.save_index("idx_bench.pkl")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        searcher.set_method_type('2')
        return searcher.search(query)

    def clean(self):
        # remove the top p fraction of terms (by the first value stored per term)
        # and then every term that appears in at most one document
        p = 0.0008
        num_of_terms = round(p * len(self._indexer.inverted_idx_term))
        sorted_index = sorted(self._indexer.inverted_idx_term.items(),
                              key=lambda item: item[1][0], reverse=True)
        for i in range(num_of_terms):
            del self._indexer.inverted_idx_term[sorted_index[i][0]]
        for term in list(self._indexer.inverted_idx_term.keys()):  # TODO - make statistics
            if self._indexer.inverted_idx_term[term][0] <= 1:
                del self._indexer.inverted_idx_term[term]
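
# A minimal usage sketch for the engine class above; the parquet file name and
# query are placeholders, not values from the original code.
# engine = SearchEngine(config=ConfigClass())
# engine.build_index_from_parquet('benchmark_data_train.snappy.parquet')
# n_relevant, tweet_ids = engine.search('coronavirus vaccine')
# print(n_relevant, tweet_ids[:10])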
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self._method = thesaurus_method()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        doc_len = len(documents_list)
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document, doc_len)
        # print('Finished parsing and indexing.')
        # print('Finished merge, start rebuilding posting dict')
        # self._indexer.rebuild_postingDict()
        self._indexer.rebuild_inverted_index()
        # print('finished rebuilding inverted index')
        to_save = (self._indexer.inverted_idx, self._indexer.tweet_dict,
                   self._indexer.reversed_inverted_index)
        utils.save_obj(to_save, 'idx_bench')
        # release the in-memory copies now that they are persisted
        self._indexer.inverted_idx = None
        self._indexer.tweet_dict = None
        self._indexer.reversed_inverted_index = None
        to_save = None
        # print('Finished rebuilding inverted index and building reversed_inverted_index')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model, method=self._method)
        return searcher.search(query)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        r = ReadFile()
        df = r.read_file(fn)
        documents_list = df
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # self._indexer.save_index('idx_bench.pkl')
        # self._indexer.save_index('inverted_idx.pkl')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        query_as_list = self._parser.parse_sentence(query)
        # expand the query with WordNet synonyms whose Wu-Palmer similarity
        # to the original term exceeds 0.8
        add_to_query = {}
        for q in query_as_list:
            for syn in wordnet.synsets(q):
                for lemma in syn.lemmas():
                    if lemma.name() == q.lower():
                        continue
                    score = wordnet.synsets(q)[0].wup_similarity(syn)
                    if score is not None and score > 0.8:
                        add_to_query[lemma.name()] = score
        if len(add_to_query) > 3:
            # keep only the three highest-scoring synonyms
            add_to_query = sorted(add_to_query.items(), key=lambda item: item[1], reverse=True)
            query_as_list.extend([add_to_query[0][0], add_to_query[1][0], add_to_query[2][0]])
        else:
            query_as_list.extend(add_to_query)
        new_query = ' '.join(query_as_list)
        relevant_docs = searcher.search(new_query)
        return relevant_docs

    @property
    def indexer(self):
        return self._indexer
class SearchEngine:
    GLOVE_PATH_SERVER = '../../../../glove.twitter.27B.25d.txt'
    GLOVE_PATH_LOCAL = '.\model/model.txt'

    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = self.initialize_glove_dict()
        self._indexer.set_glove_dict(self.model)

    def initialize_glove_dict(self):
        glove_dict = {}
        with open(self.GLOVE_PATH_LOCAL, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                glove_dict[word] = vector
        return glove_dict

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine1', self._config.get_out_path())
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index('idx_engine1.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        l_res = searcher.search(query_as_list[0])
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # self._indexer.save_index("idx_bench.pkl")
        # indexer_dic = utils.load_obj("idx_bench")
        self._indexer.save_index("idx.pkl")  # TODO - we need to submit this
        indexer_dic = utils.load_obj("idx")  # TODO - we need to submit this

        localMethod = False
        globalMethod = False
        wordNet = True
        spellChecker = False

        if localMethod:
            indexer_dic["local"] = True
        if wordNet:
            indexer_dic["wordnet"] = True
        if spellChecker:
            indexer_dic["spellChecker"] = True

        if globalMethod:
            docs_dic, Sij_dic = compute_Wi(indexer_dic, globalMethod)
            indexer_dic["docs"] = docs_dic
            indexer_dic["global"] = Sij_dic
        else:
            docs_dic = compute_Wi(indexer_dic)
            indexer_dic["docs"] = docs_dic

        # utils.save_obj(indexer_dic, "idx_bench")
        utils.save_obj(indexer_dic, "idx")  # TODO - we need to submit this

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.map_list = []
        self.prec5_list = []
        self.prec10_list = []
        self.prec50_list = []
        self.prec_total_list = []
        self.recall_list = []

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        print("\nNow starting search engine 3")
        # total_time = datetime.now()
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        # print("len of inverted: ", len(self._indexer.inverted_idx))
        # print("len of posting: ", len(self._indexer.postingDict))
        # print("len of dataSet: ", len(self._indexer.benchDataSet))
        # end_time = datetime.now()
        # print('\n ------ Time To Retrieve: {}'.format(end_time - total_time), " ------\n")
        # print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def run_engine_two(self, fn):
        self.build_index_from_parquet(fn)
        queries_path = "data\\queries_train.tsv"
        all_queries = SearchEngine.query_reader(queries_path)["information_need"]
        for i, q in enumerate(all_queries):
            print(q)
            k, docs = self.search(q)
            # print(docs[:10])
            self.check_engine_quality(i + 1, docs[:300])
            print()
        print("Avg map is:", sum(self.map_list) / len(self.map_list))

    @staticmethod
    def query_reader(queries_path):
        data = pd.read_csv(queries_path, sep="\t")
        return data

    def get_parser(self):
        return self._parser

    def check_engine_quality(self, query_num, list_of_docs):
        """
        Prints quality metrics of the query: precision, precision@5/10/50, recall and MAP.
        :param query_num: index of the query in the benchmark file
        :param list_of_docs: ranked list of tweet ids returned by the engine
        """
        benchmark_path = "data\\benchmark_lbls_train.csv"
        df = pd.read_csv(benchmark_path)
        df_prec = df[df['query'] == query_num]
        df_prec = df_prec[df_prec['tweet'].isin(list_of_docs)]
        dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict()
        rmv_lst = []
        ranking = []
        # collect the benchmark label of every returned doc; drop unlabeled docs
        for doc in list_of_docs:
            try:
                ranking.append(dict_for_data[int(doc)])
            except (KeyError, ValueError):
                rmv_lst.append(doc)
        for d in rmv_lst:
            list_of_docs.remove(d)
        data_df = pd.DataFrame({'query': query_num, 'tweet': list_of_docs, 'y_true': ranking})
        df_rec = df[df['query'] == query_num]
        recall_total = len(df_rec[df_rec['y_true'] == 1.0])
        # print("total relevant docs found with tag 1:", len(data_df[data_df['y_true'] == 1.0]))
        # print("total non-relevant docs found with tag 0:", len(data_df[data_df['y_true'] == 0]))
        # print("found total of", len(df_prec), "tagged docs")
        # Calculate and print
        prec5 = metrics.precision_at_n(data_df, query_num, 5)
        prec10 = metrics.precision_at_n(data_df, query_num, 10)
        prec50 = metrics.precision_at_n(data_df, query_num, 50)
        prec_total = metrics.precision(data_df, True, query_number=query_num)
        map_of_query = metrics.map(data_df)
        recall_val = metrics.recall_single(data_df, recall_total, query_num)
        self.map_list.append(map_of_query)
        self.prec5_list.append(prec5)
        self.prec10_list.append(prec10)
        self.prec50_list.append(prec50)
        self.prec_total_list.append(prec_total)
        self.recall_list.append(recall_val)
        print()
        print("precision at 5 of query", query_num, "is:", prec5)
        print("precision at 10 of query", query_num, "is:", prec10)
        print("precision at 50 of query", query_num, "is:", prec50)
        print("precision of query", query_num, "is:", prec_total)
        print("recall of query", query_num, "is:", recall_val)
        print("map of query", query_num, "is:", map_of_query)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        if self._config:
            if not hasattr(self._config, 'toStem'):
                self._config.toStem = False
            if not hasattr(self._config, 'toLemm'):
                self._config.toLemm = False
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.corpus_size = 0

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.save_index(self._config.get_output_path())  # save the inverted index to disk
        self.corpus_size = self._indexer.get_docs_count()
        self.calculate_doc_weight()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def calculate_doc_weight(self):
        """
        Calculates the TF-IDF weight of every term in every document and
        accumulates each document's squared weight norm.
        """
        inverted_index = self._indexer.inverted_idx
        docs_index = self._indexer.get_docs_index()
        for word in inverted_index:
            for doc_id in self._indexer.get_term_posting_list(word):
                normalized_term_tf = inverted_index[word]["posting_list"][doc_id][0]
                term_df = inverted_index[word]['df']
                term_idf = math.log2(self.corpus_size / term_df)
                # calculate the doc's total weight
                term_weight = normalized_term_tf * term_idf
                inverted_index[word]["posting_list"][doc_id].append(term_weight)
                term_weight_squared = math.pow(term_weight, 2)
                docs_index[doc_id][0] += term_weight_squared
                docs_index[doc_id][0] = round(docs_index[doc_id][0], 3)
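
# A quick worked instance of the weighting above (all values are made up):
# with a corpus of 1,024 documents and a term whose df is 32,
# idf = log2(1024 / 32) = 5; if the normalized tf of that term in some document
# is 0.05, its weight is 0.05 * 5 = 0.25, and 0.25^2 = 0.0625 is what gets
# accumulated into that document's squared-norm entry in docs_index.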
# NOTE: this snippet begins mid-method, inside an Indexer.add_new_doc variant;
# the loop over terms and the start of the posting-list append are missing, so
# the opening two lines below are a reconstruction and posting_dict is a
# hypothetical name.
        try:
            self.posting_dict[term].append(
                (document.tweet_id, document_dictionary[term]))
            number_arr[0] += 1
        except Exception:
            print('INVERTED: problem with the following key {}'.format(term[0]))
    max_freq = max(document_dictionary.values())
    self.tmp_pos_doc[document.tweet_id] = document_dictionary
    self.num_in_pos_doc[0] += 1
    if self.num_in_pos_doc[0] >= self.avg_length:
        if 'doc' not in self.set_is_writting.keys():
            self.map_reduce_doc.write_dict(self.tmp_pos_doc)
            self.set_is_writting['doc'] = 1
        else:
            self.map_reduce_doc.wait_untill_finish()
            del self.set_is_writting['doc']
        self.num_in_pos_doc[0] = 0


if __name__ == '__main__':
    p = Parse(True)
    parsed_document = p.parse_doc([
        '1280914835979501568', 'Wed Jul 08 17:21:09 +0000 2020',
        '70% @loganxtalor: Y’all Towson took away my housing cause of COVID and I literally didn’t know where I was gonna go. I was in such a bind. I…',
        '{}', '[]',
        'Y’all Towson took away my housing cause of COVID and I literally didn’t know where I was gonna go. I was in such a… https://t.co/i8IdrIKp2B',
        '{"https://t.co/i8IdrIKp2B":"https://twitter.com/i/web/status/1280659984628490246"}',
        '[[116,139]]', None, None, None, None, None, None
    ])
    i = Indexer()
    i.add_new_doc(parsed_document)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = GlobalMethod()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            if parsed_document is None:
                continue
            self._indexer.add_new_doc(parsed_document)
            if len(self._indexer.inverted_idx) > 100000:
                self._indexer.sort_100K_inverted_index()
        self._indexer.add_idf_to_dictionary()
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        if ".pkl" in fn:
            fn = fn[:-4]
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        # self._model = KeyedVectors.load_word2vec_format('glove.twitter.27B.25d.txt.word2vec', binary=False)
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

# def main():
#     config = ConfigClass()
#     se = SearchEngine(config=config)
#     r = ReadFile(corpus_path=config.get__corpusPath())
#     parquet_file_path = r.get_all_path_of_parquet()[0][0] + r.get_all_path_of_parquet()[0][1]
#     se.build_index_from_parquet(parquet_file_path)
#     se.load_index('idx_bench')
#     query = "trump want to change the world"
#     num, list = se.search(query)
#     # for key in dictionary.keys():
#     #     print('tweet id: {}, score (unique common words with query): {}'.format(key[0], dictionary[key]))
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    __slots__ = ['_config', '_indexer', '_parser', '_model', 'searcher', '_run_config']

    def __init__(self, config=None, run_config=None):
        if not config:
            config = ConfigClass()
        if not run_config:
            run_config = RunConfigClass()
        self._run_config = run_config
        self._config = config
        self._parser = Parse(run_config)
        self._indexer = Indexer(run_config)
        self._model = None
        self.searcher = Searcher(self._parser, self._indexer, run_config, model=self._model)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        # Iterate over every document in the file
        for document in df.values:
            # parse the document
            parsed_list = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_list)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        # drop the '.pkl' suffix if present (str.strip would remove any of the
        # characters '.', 'p', 'k', 'l' from both ends, which is not what we want)
        if fn.endswith('.pkl'):
            fn = fn[:-len('.pkl')]
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        return self.searcher.search(query, None, {1})
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document == {}:  # RT
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        # drop terms that appear only once and postings with a single entry
        self._indexer.inverted_idx = {
            key: val for key, val in self._indexer.inverted_idx.items() if val != 1
        }
        self._indexer.postingDict = {
            key: val for key, val in self._indexer.postingDict.items() if len(val) != 1
        }
        print('Finished parsing and indexing.')
        # self._indexer.save_index('idx_bench')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        # convert the GloVe vectors to word2vec format, then load them with gensim
        filename = self._config.glove_twitter_27B_25d_path
        word2vec_output_file = 'glove.twitter.27B.25d.txt.word2vec'
        glove2word2vec(filename, word2vec_output_file)
        filename = word2vec_output_file
        self._model = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=False)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
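
# Once loaded this way, the gensim KeyedVectors model can back a simple
# embedding-based query expansion; a hedged sketch (the term, cutoff, and
# expansion policy below are illustrative, not from the original code):
# similar = engine._model.most_similar('virus', topn=3)
# expanded_query = query + ' ' + ' '.join(w for w, score in similar if score > 0.7)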
def run_engine():
    """
    Builds the index over the whole corpus, reading the tweets folder by folder.
    """
    number_of_documents = 0
    corpus_path = config.get__corpusPath()  # config is expected to be a module-level global here
    r = ReadFile(corpus_path)
    indexer = Indexer(config)
    p = Parse(config)

    # reading per folder
    r.create_files_name_list()
    files_list = []  # every index contains all tweets per folder
    for file_name in r.dates_list:
        tweets_per_date = r.read_file(file_name)
        files_list.append(tweets_per_date)
    # print("files_list", len(files_list))
    num_of_tweets = 0
    for folder_list in files_list:
        num_of_tweets += len(folder_list)
    # print("num_of_tweets", num_of_tweets)

    """# reading per folder with threads
    r.create_files_name_list()
    threads = []
    for file_name in r.dates_list:
        t = threading.Thread(target=r.read_file(file_name))
        threads.append(t)
        t.start()
    print("files_list", r.files_list)"""

    """counter = 1
    procs = []
    # Iterate over every folder in the DATA
    for folder_list in files_list:
        proc = Process(target=test, args=(folder_list, counter, indexer, number_of_documents,))
        procs.append(proc)
        proc.start()
    # complete the processes
    for proc in procs:
        proc.join()
    print('Finished parsing and indexing. Starting to export files')"""

    counter = 1
    # Iterate over every folder in the DATA
    for folder_list in files_list:
        # print(counter)
        # print(datetime.now())
        # Iterate over every tweet in the folder
        for idx, tweet in enumerate(folder_list):
            # parse the tweet
            parsed_document = p.parse_doc(tweet)
            number_of_documents += 1
            # index the tweet data
            indexer.add_new_doc(parsed_document, num_of_tweets)
        # print("number of tweets", number_of_documents)
        # print(datetime.now())
        counter += 1
    # print('Finished parsing and indexing. Starting to export files')

    """# read only one folder
    documents_list = r.read_file(file_name='')
    num_indexed = len(documents_list)
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document, num_indexed)
    # print('Finished parsing and indexing. Starting to export files')"""

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.tf_idf_dict, "tf_idf_dict")
    return indexer.get__lda__()
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        dict_of_methods = {'wordnet': True, 'spell_correction': False,
                           'thesaurus': False, 'word2vec': False, 'parser': False}
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        self._indexer.slice_uncommon_terms()
        self._indexer.calculate_wij_idf()
        self._indexer.set_dict_methods(dict_of_methods)
        # self._indexer.save_index(self._config.get_output_path() + 'inverted_idx')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)