import os
import csv
from timeit import default_timer as timer

import pandas as pd
from gensim.models import KeyedVectors

# Project-local modules. The module paths below are assumed from the usual
# layout of this assignment and may need adjusting to match your tree.
from configuration import ConfigClass
from reader import ReadFile
from parser_module import Parse, Parse_stem
from indexer import Indexer
from searcher import Searcher


# Baseline engine variant: load_precomputed_model() is a no-op, so ranking
# runs without a precomputed model (self._model stays None).
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        if config is None:
            config = ConfigClass()
        self._config = config
        if config.toStem:
            self._parser = Parse_stem()
        else:
            self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads a parquet file and passes it to the parser, then the indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        config = self._config
        indexer = self._indexer
        number_of_documents = 0
        if config.getoneFile():
            df = pd.read_parquet(fn, engine="pyarrow")
            documents_list = df.values.tolist()
            # Iterate over every document in the file
            for idx, document in enumerate(documents_list):
                # parse the document
                parsed_document = self._parser.parse_doc(document)
                number_of_documents += 1
                # index the document data
                self._indexer.add_new_doc(parsed_document)
            self._indexer.calculationSummerize()
        else:
            r = ReadFile(corpus_path=config.get__corpusPath())
            for root, dirs, files in os.walk(config.get__corpusPath(), topdown=True):
                for name in files:
                    ext = name.split('.')[-1]
                    if ext == 'parquet':
                        documents_list = r.read_folder(root, file_name=name)
                        # Iterate over every document in the file
                        for idx, document in enumerate(documents_list):
                            # parse the document
                            parsed_document = self._parser.parse_doc(document)
                            number_of_documents += 1
                            # index the document data
                            indexer.add_new_doc(parsed_document)
            # indexer.update_posting_files()  # use this function for big corpora
            # indexer.reset_cach()
        self._indexer.save_index('inverted_idx')
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass
        # self._model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def get_full_text(self, d_id):
        return self._indexer.documents_data[d_id][4]

    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        if self._indexer.inverted_idx is None:
            print("Cannot run a query without a loaded inverted index.")
            return
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
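
# --- Usage sketch ---
# A minimal example of driving the engine above, kept commented out in the
# style of the other inline examples in this file. It assumes a ConfigClass
# that exposes toStem, getoneFile() and get__corpusPath() (as used in
# build_index_from_parquet) and that the parquet file exists locally; the
# query string is purely illustrative.
#
# config = ConfigClass()
# engine = SearchEngine(config)
# engine.build_index_from_parquet('data/benchmark_data_train.snappy.parquet')
# engine.load_index('inverted_idx')
# n_relevant, tweet_ids = engine.search('example query')
# print(n_relevant, tweet_ids[:10])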

# A second variant of the engine: here load_precomputed_model() actually
# loads word2vec vectors, which are passed to the searcher at query time.
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        if config is None:
            config = ConfigClass()
        self._config = config
        if config.toStem:
            self._parser = Parse_stem()
        else:
            self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads a parquet file and passes it to the parser, then the indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        config = self._config
        indexer = self._indexer
        number_of_documents = 0
        if config.getoneFile():
            df = pd.read_parquet(fn, engine="pyarrow")
            documents_list = df.values.tolist()
            # Iterate over every document in the file
            for idx, document in enumerate(documents_list):
                # parse the document
                parsed_document = self._parser.parse_doc(document)
                number_of_documents += 1
                # index the document data
                self._indexer.add_new_doc(parsed_document)
            self._indexer.calculationSummerize()
        else:
            r = ReadFile(corpus_path=config.get__corpusPath())
            for root, dirs, files in os.walk(config.get__corpusPath(), topdown=True):
                for name in files:
                    ext = name.split('.')[-1]
                    if ext == 'parquet':
                        documents_list = r.read_folder(root, file_name=name)
                        # Iterate over every document in the file
                        for idx, document in enumerate(documents_list):
                            # parse the document
                            parsed_document = self._parser.parse_doc(document)
                            number_of_documents += 1
                            # index the document data
                            indexer.add_new_doc(parsed_document)
            # indexer.update_posting_files()
            # indexer.reset_cach()
        self._indexer.save_index('inverted_idx')
        print('Finished parsing and indexing.')

    # def get_full_text(self, d_id):
    #     return self._indexer.documents_data[d_id][4]

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        self._model = KeyedVectors.load_word2vec_format(
            'GoogleNews-vectors-negative300.bin', binary=True)
        # self._model = KeyedVectors.load_word2vec_format(self._config.google_news_vectors_negative300_path, binary=True)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        if self._indexer.inverted_idx is None:
            print("Cannot run a query without a loaded inverted index.")
            return
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def main(self, queries=None, num_docs_to_retrieve=None):
        config = self._config
        # config.set_corpusPath(corpus_path)
        # config.set_savedFileMainFolder(output_path)
        # config.set_toStem(stemming)
        self.load_precomputed_model()
        vectorModel = self._model
        start = timer()
        print("---- started parsing and indexing ----")
        self.build_index_from_parquet('data/benchmark_data_train.snappy.parquet')
        end = timer()
        print("Process ended.")
        # print(timedelta(seconds=end - start))
        if num_docs_to_retrieve is None:
            num_docs_to_retrieve = 2000
        inverted_index = self.load_index('inverted_idx')
        if queries is None:
            # Interactive mode for end users
            user_query = input("Please enter a query: ")
            user_num = int(input("How many tweets do you want to get (maximum)? "))
            res = self.search(user_query)
            print("Here are the links to the tweets related to your query:")
            for x in range(user_num):
                try:
                    tweeter_start_link = 'https://twitter.com/IsraelHayomHeb/status/'
                    tweet_id = res[1][x]
                    print(tweeter_start_link + tweet_id)
                except IndexError:
                    pass  # fewer results were found than requested
        else:
            # Batch mode: write the results of each query to a CSV file
            with open('queries_output.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(["query", "tweet"])
                if not isinstance(queries, list):
                    try:
                        with open(queries, "r", encoding='utf-8') as f:
                            # queries = f.read()
                            queries = f.readlines()
                    except Exception:
                        print("Failed to read the queries file.")
                        raise
                # Number the queries for the output file
                i = 0
                for query in queries:
                    print("query number " + str(i))
                    print(query)
                    i += 1
                    res = self.search(query)
                    for s in range(num_docs_to_retrieve):
                        try:
                            writer.writerow([i, res[1][s]])
                        except IndexError:
                            pass  # fewer results were found than requested