class SearchEngine:

    num_of_tweets = 0

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    def get_num_of_tweets(self):
        return self.num_of_tweets

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        self.num_of_tweets = len(documents_list)
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            parsed_document.num_of_tweets = self.num_of_tweets
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # TODO: check indexer saving
        utils.save_obj(self._indexer.inverted_idx, "inverted_idx")

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        inverted_idx = self._indexer.load_index(fn)
        return inverted_idx

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [w for w in original_query_list if w not in stop_words]
        # find multi-word terms and upper-case words
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # e.g. NBA
                if word.find("\n") != -1:
                    word = word[:-1]
                if word.find(".") != -1:
                    word = word[:-1]
                query_as_list.append(word)
            elif len(word) > 1 and re.search('[a-zA-Z]', word) and word[0].isupper():
                # word starts with an upper-case letter
                term = word
                # walk forward from the current position; using list.index()
                # here would return the first occurrence of a duplicate word
                index = counter + 1
                while index < len(original_query_list):
                    # collect the whole capitalized term
                    next_word = original_query_list[index]
                    if len(next_word) > 1 and re.search('[a-zA-Z]', next_word) and next_word[0].isupper():
                        term += " " + next_word[0] + next_word[1:].lower()  # e.g. Donald Trump
                        index += 1
                        len_term += 1
                    else:
                        break
                if len_term > 1:
                    query_as_list.append(term)
            counter += len_term
        corrected_query = SpellChecker_ranker.correct_query(query_as_list)
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(corrected_query)  # TODO: add K results
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    # Uses WordNet to expand queries.
    def __init__(self, config=None):
        self._config = config
        if config.toStem:
            self._parser = Parse_stem()
        else:
            self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        config = self._config
        indexer = self._indexer
        number_of_documents = 0
        if config.getoneFile():
            df = pd.read_parquet(fn, engine="pyarrow")
            documents_list = df.values.tolist()
            # Iterate over every document in the file
            for idx, document in enumerate(documents_list):
                # parse the document
                parsed_document = self._parser.parse_doc(document)
                number_of_documents += 1
                # index the document data
                self._indexer.add_new_doc(parsed_document)
            self._indexer.calculationSummerize()
        else:
            r = ReadFile(corpus_path=config.get__corpusPath())
            for root, dirs, files in os.walk(config.get__corpusPath(), topdown=True):
                for name in files:
                    ext = name.split('.')[-1]
                    if ext == 'parquet':
                        documents_list = r.read_folder(root, file_name=name)
                        # Iterate over every document in the file
                        for idx, document in enumerate(documents_list):
                            # parse the document
                            parsed_document = self._parser.parse_doc(document)
                            number_of_documents += 1
                            # index the document data
                            indexer.add_new_doc(parsed_document)
            # indexer.update_posting_files()
            # indexer.reset_cach()
        self._indexer.save_index('inverted_idx')
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    def get_full_text(self, d_id):
        return self._indexer.documents_data[d_id][4]

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        if self._indexer.inverted_idx is None:
            print("can't run a query without an inverted index loaded")
            return
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
class Searcher:

    def __init__(self, inverted_index, corpus_size, average_length, output_path):
        """
        :param inverted_index: dictionary of inverted index
        """
        self.parser = Parse()
        self.ranker = Ranker()
        self.inverted_index = inverted_index
        self.corpus_size = corpus_size
        self.average_length = average_length
        self.output_path = output_path

    def calculate_doc_scores(self, term, relevant_docs, posting_pointer, posting_file):
        """
        Retrieves term's posting file and calculates a score for each relevant
        document. Adds the relevant documents to the relevant_docs dictionary.
        :param term: query term for retrieval
        :param relevant_docs: dictionary of relevant documents
        :param posting_pointer: pointer (name) of relevant posting file
        :param posting_file: relevant posting file
        :return: tuple of the current relevant posting pointer and posting file
        """
        # retrieve term's posting file
        if posting_pointer is None or term[0].lower() != posting_pointer or posting_file is None:
            posting_pointer = self.inverted_index[term][POSTING_POINTER_INDEX]
            posting_file = utils.load_obj(self.output_path + str(posting_pointer))
        inverted_document_frequency = log(self.corpus_size / self.inverted_index[term][DF_INDEX])
        documents = posting_file[term]
        for document in documents:
            # calculate score
            document_id = document[DOCUMENT_ID_INDEX]
            doc_weight = document[FREQUENCY_INDEX]
            normalized_length = document[LENGTH_INDEX] / self.average_length
            if document_id not in relevant_docs:
                relevant_docs[document_id] = 0
            # calculate score according to the BM25+ weighting formula
            relevant_docs[document_id] += inverted_document_frequency * (
                float(doc_weight * (K1 + 1)) / (doc_weight + K1 * (1 - B + B * normalized_length)) + DELTA)
        return posting_pointer, posting_file

    def relevant_docs_from_posting(self, query):
        """
        Search and retrieve relevant documents for the query.
        Calculate the similarity score for each document.
        :param query: query
        :return: dictionary of relevant documents and their scores
        """
        # parse query according to the same parsing rules of the corpus
        entities = dict()
        term_dict = dict()
        parsed_query = self.parser.parse_sentence(query, entities)
        self.parser.parse_capital_letters(parsed_query, term_dict)

        # perform spell correction
        spell_checker = SpellChecker()
        corrected_terms = []
        misspelled_terms = spell_checker.unknown([*term_dict.keys()])
        for term in misspelled_terms:
            # only correct terms that aren't in the inverted dictionary;
            # terms in the dictionary are considered correct for retrieval
            if term not in self.inverted_index:
                candidates = spell_checker.candidates(term)
                if term in candidates:
                    # remove duplicate originally correct terms
                    candidates.remove(term)
                corrected_terms.extend(candidates)

        # sort the parsed query alphabetically for optimal posting file retrieval:
        # always hold at most one posting file in memory
        sorted_query = [*term_dict.keys()] + [*entities.keys()] + corrected_terms
        sorted_query.sort()

        # dictionary holding all relevant documents (at least one query term
        # appeared in the document), format: {document_id: score}
        relevant_docs = dict()
        posting_file = None  # currently used posting file from disk
        posting_pointer = None  # current posting's pointer
        for term in sorted_query:
            # check if term exists in inverted dictionary in either lower or upper form
            if term in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term, relevant_docs, posting_pointer, posting_file)
            elif term.islower() and term.upper() in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term.upper(), relevant_docs, posting_pointer, posting_file)
            elif term.isupper() and term.lower() in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term.lower(), relevant_docs, posting_pointer, posting_file)
        return relevant_docs
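
# A standalone sketch of the BM25+ weighting used in calculate_doc_scores
# above, for a single (term, document) pair. The constant values below are
# illustrative assumptions; the class itself reads K1, B and DELTA from
# module-level constants defined elsewhere.
from math import log

K1 = 1.2      # term-frequency saturation (assumed value)
B = 0.75      # length-normalization strength (assumed value)
DELTA = 1.0   # BM25+ lower-bound bonus (assumed value)

def bm25_plus_term_score(tf, df, doc_len, corpus_size, average_length):
    """Contribution of one query term to one document's score."""
    idf = log(corpus_size / df)
    normalized_length = doc_len / average_length
    return idf * ((tf * (K1 + 1)) / (tf + K1 * (1 - B + B * normalized_length)) + DELTA)

# e.g. a term appearing 3 times in an average-length document,
# in 100 of 10,000 documents:
# bm25_plus_term_score(3, 100, 120, 10000, 120)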
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        self._parser.curr_idx = self.parse_and_index_tweet_list(documents_list, 0)
        self._indexer.save_index('idx_bench.pkl')
        print('Finished parsing and indexing.')

    def parse_and_index_tweet_list(self, documents_list, idx):
        for document in documents_list:
            # parse the document
            self._parser.curr_idx = idx
            parsed_document = self._parser.parse_doc(document)
            # add the document to the indexer
            self._indexer.set_idx(idx)
            self._indexer.add_new_doc(parsed_document)
            idx += 1
        return idx - 1

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        self._model = _Thesaurus()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        if not config:
            self._config = ConfigClass()
        else:
            self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self._reader = ReadFile(self._config.get__corpusPath())

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.check_pending_list()
        self._indexer.calculate_and_add_idf()
        self._indexer.calculate_sigma_Wij()
        self._indexer.calculate_avg_doc_len()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    def search(self, query, k=None):  # TODO: change
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        terms, entities = self._parser.parse_sentence(query)
        query_as_list = terms + entities
        # WordNet query expansion
        extended_query = word_net(terms)
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search_with_extension(query_as_list, extended_query, k)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        reader = ReadFile('')
        documents_list = reader.read_fn(fn)
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.after_indexing()
        self._indexer.save_index("inverted_idx")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self.load_index("inverted_idx.pkl")
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        if self._config:
            if not hasattr(self._config, 'toStem'):
                self._config.toStem = False
            if not hasattr(self._config, 'toLemm'):
                self._config.toLemm = False
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.corpus_size = 0
        self.load_precomputed_model()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        # Save the inverted index to disk
        self._indexer.save_index(self._config.get_output_path())
        self.corpus_size = self._indexer.get_docs_count()
        self.calculate_doc_weight()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        self._model = SpellCheck

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def calculate_doc_weight(self):
        """
        Calculates the tf-idf weight for every (term, document) pair and
        accumulates each document's squared-weight total.
        :return:
        """
        for word in self._indexer.inverted_idx:
            for doc_id in self._indexer.inverted_idx[word]['posting_list']:
                normalized_term_tf = self._indexer.inverted_idx[word]["posting_list"][doc_id][0]
                term_df = self._indexer.inverted_idx[word]['df']
                term_idf = math.log10(self.corpus_size / term_df)
                # calculate doc's total weight
                term_weight = normalized_term_tf * term_idf
                self._indexer.inverted_idx[word]["posting_list"][doc_id].append(term_weight)
                term_weight_squared = math.pow(term_weight, 2)
                self._indexer.docs_index[doc_id][0] += term_weight_squared
                self._indexer.docs_index[doc_id][0] = round(self._indexer.docs_index[doc_id][0], 3)
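
# A compact sketch of the tf-idf weighting performed by calculate_doc_weight
# above, on a toy inverted index. The index layout below
# ({term: {'df': ..., 'posting_list': {doc_id: [normalized_tf]}}}) mirrors the
# structure the method assumes; the numbers are made up for illustration.
import math

corpus_size = 4
inverted_idx = {
    'covid': {'df': 3, 'posting_list': {'d1': [0.20], 'd2': [0.10], 'd3': [0.05]}},
    'vaccine': {'df': 1, 'posting_list': {'d1': [0.15]}},
}
doc_norms = {'d1': 0.0, 'd2': 0.0, 'd3': 0.0}  # running sums of squared weights

for word, entry in inverted_idx.items():
    idf = math.log10(corpus_size / entry['df'])
    for doc_id, posting in entry['posting_list'].items():
        weight = posting[0] * idf          # normalized tf * idf
        posting.append(weight)             # store the weight alongside the tf
        doc_norms[doc_id] += weight ** 2   # accumulate for cosine normalization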
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    __slots__ = ['_config', '_indexer', '_parser', '_model', 'searcher', '_run_config']

    def __init__(self, config=None, run_config=None):
        if not config:
            config = ConfigClass()
        if not run_config:
            run_config = RunConfigClass()
        self._run_config = run_config
        self._config = config
        self._parser = Parse(run_config)
        self._indexer = Indexer(run_config)
        self._model = None
        self.searcher = Searcher(self._parser, self._indexer, run_config, model=self._model)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        # Iterate over every document in the file
        for document in df.values:
            # parse the document
            parsed_list = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_list)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        # remove a trailing '.pkl' extension, if present
        if fn.endswith('.pkl'):
            fn = fn[:-len('.pkl')]
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        return self.searcher.search(query, None, {3})
class SearchEngine:

    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine3', self._config.get_out_path())
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index('idx_engine3.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO: check about K
        query_as_list = self._parser.parse_sentence(query)
        list_copy = list(query_as_list[0])
        tagged_words = pos_tag(list_copy)
        for word in tagged_words:
            synonym = ThesaurusModel.get_synonym(word)
            if synonym is not None:
                list_copy.extend(synonym)
        l_res = searcher.search(list_copy)
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(self._config, advanced=False)
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        number_of_documents = 0
        r = ReadFile(corpus_path=self._config.get__corpusPath())
        doc = r.read_file(fn)
        for document in doc:
            parsed_document = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        capital_letters = self._parser.caps_dict
        self._indexer.change_inverted_by_caps(capital_letters)
        self._indexer.save_index('idx_bench')
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(parser=self._parser, indexer=self._indexer,
                            wordnet=False, correction=False)
        n_relevant, ranked_doc_ids = searcher.search(query)
        return n_relevant, [doc_id for (doc_id, rank) in ranked_doc_ids]
def __init__(self, config=None):
    self._config = config
    self._parser = Parse(self._config, advanced=False)
    self._indexer = Indexer(config)
    self._model = None
def __init__(self, config=None):
    self._config = config
    self._parser = Parse(False)
    self._indexer = Indexer(config)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self._indexer = Indexer(config)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        dict_of_methods = {
            'wordnet': False,
            'spell_correction': False,
            'thesaurus': False,
            'word2vec': True,
            'parser': False,
        }
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        self._indexer.slice_uncommon_terms()
        self._indexer.calculate_wij_idf()
        self._indexer.set_dict_methods(dict_of_methods)
        # self._indexer.save_index(fn="idx_bench")
        # self._indexer.save_index(self._config.get_output_path() + 'inverted_idx')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            os.path.join(model_dir, "model_word2vec_last"),
            binary=True, encoding='utf-8', unicode_errors='ignore')
        self._config.set_download_model(False)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
                continue
            if number_arr[0] >= self.avg_length:
                map_reduce.write_dict(tmp_pos)
                self.set_is_writting.add(key)
                number_arr[0] = 0
            if term.lower() not in self.tmp_pos.keys():
                tmp_pos[term.lower()] = []
            if key in self.set_is_writting:
                map_reduce.wait_untill_finish()
                self.set_is_writting.remove(key)
            tmp_pos[term.lower()].append((document.tweet_id, document_dictionary[term]))
            number_arr[0] += 1
            # except:
            #     print('TERMS : _____ ' + str(term))
            #     print('INVERTED: problem with the following key {}'.format(term[0]))
        # highest term frequency in this document
        max_freq = max(document_dictionary.values())
        self.tmp_pos_doc[document.tweet_id] = document_dictionary
        self.num_in_pos_doc_other[0] += 1
        if self.num_in_pos_doc_other[0] >= self.avg_length:
            self.map_reduce_doc.write_dict(self.tmp_pos_doc)
            self.num_in_pos_doc_other[0] = 0


if __name__ == '__main__':
    p = Parse(True)
    parsed_document = p.parse_doc([
        '1280914835979501568', 'Wed Jul 08 17:21:09 +0000 2020',
        '70% @loganxtalor: Y’all Towson took away my housing cause of COVID and I literally didn’t know where I was gonna go. I was in such a bind. I…',
        '{}', '[]',
        'Y’all Towson took away my housing cause of COVID and I literally didn’t know where I was gonna go. I was in such a… https://t.co/i8IdrIKp2B',
        '{"https://t.co/i8IdrIKp2B":"https://twitter.com/i/web/status/1280659984628490246"}',
        '[[116,139]]', None, None, None, None, None, None])
    i = Indexer()
    i.add_new_doc(parsed_document)
from reader import ReadFile
from parser_module import Parse
from stemmer import Stemmer
import os
import string

preprocessed_file = "model/preprocessed.txt"
corpus_path = r"C:\Users\Owner\Desktop\SearchEngine\Data"
reader = ReadFile(corpus_path)
parser = Parse()
stemmer = Stemmer()

# documents_list = reader.read_file("covid19_08-05.snappy.parquet")
documents_list = []

# files_to_process = [
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-08-2020\covid19_07-08.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-09-2020\covid19_07-09.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-10-2020\covid19_07-10.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-11-2020\covid19_07-11.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-12-2020\covid19_07-12.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-13-2020\covid19_07-13.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-15-2020\covid19_07-15.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-16-2020\covid19_07-16.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-18-2020\covid19_07-18.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-20-2020\covid19_07-20.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-04-2020\covid19_08-04.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-27-2020\covid19_07-27.snappy.parquet",
# ]
files_to_process = [
    r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-07-2020\covid19_08-07.snappy.parquet",
def __init__(self, config=None):
    self._config = config
    self._parser = Parse(False)
    self.reader = ReadFile(corpus_path=config.get__corpusPath())
    self._indexer = Indexer(config)
    self.model = None
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        def is_ascii(s):
            return all(ord(c) < 128 for c in s)

        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        # this function is responsible for writing the entities dict to a text file
        # def write_entites():
        #     file1 = open("entities.txt", "a")
        #     start = time.time()
        #     our_dict = sorted(self._parser.entities.items(), key=lambda item: item[1], reverse=True)
        #     print(our_dict)
        #     for word in our_dict:
        #         if is_ascii(word[0]):
        #             parsed = self._parser.parse_sentence(word[0])
        #             for term in parsed:
        #                 if not term[0].isdigit() and term[0] != "#" and term[0] != "@":
        #                     file1.writelines(str(term) + "\n")
        #     file1.close()

        to_del = []
        # saving the necessary data to pickle
        to_Save = (self._indexer.inverted_idx, self._indexer.postingDict,
                   self._indexer.num_of_docs, self._indexer.avg_Size_doc)
        utils.save_obj(to_Save, "index_5")

        def remove_word_1():
            for key in self._indexer.inverted_idx:
                if self._indexer.inverted_idx[key] == 1 and not key.isalpha():
                    to_del.append(key)
                    self._indexer.postingDict.pop(key)
            for key in to_del:
                self._indexer.inverted_idx.pop(key)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        obj = utils.load_obj(fn)
        self._indexer.inverted_idx = obj[0]
        self._indexer.postingDict = obj[1]
        self._indexer.num_of_docs = obj[2]
        self._indexer.avg_Size_doc = obj[3]

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=2000):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer)
        return searcher.search(query, k)

    def main(self, output_path, stemming, query_to_check, num_docs_to_retrieve):
        self.build_index_from_parquet("data/benchmark_data_train.snappy.parquet")
        if isinstance(query_to_check, list):
            queries = query_to_check
        elif isinstance(query_to_check, str):
            if query_to_check.endswith(".txt"):
                try:
                    with open(query_to_check, "r", encoding="utf-8") as queries:
                        queries = queries.readlines()
                    query2 = []
                    for q in queries:
                        if q != "\n":
                            query2.append(q)
                    queries = query2
                except FileNotFoundError as e:
                    print(e)
            else:
                queries = [query_to_check]
        else:
            return
        if stemming:
            output_path = output_path + "/WithStem"
        else:
            output_path = output_path + "/WithoutStem"
        query_num = 1
        for query in queries:
            start = time.time()
            mylist = self.search(query, num_docs_to_retrieve)
            answer_to_run = mylist[1]
            for doc_tuple in answer_to_run:
                print('tweet id: {}'.format(doc_tuple))
            query_num += 1
            print("time it took to retrieve: " + str(time.time() - start))
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = SpellCChecker()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        # glove_input_file = 'glove.twitter.27B.25d.txt'
        # word2vec_output_file = 'glove.twitter.27B.25d.txt.word2vec'
        # glove2word2vec(glove_input_file, word2vec_output_file)
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            if parsed_document is None:
                continue
            self._indexer.add_new_doc(parsed_document)
        if len(self._indexer.inverted_idx) > 100000:
            self._indexer.sort_100K_inverted_index()
        self._indexer.add_idf_to_dictionary()
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        # remove a trailing '.pkl' extension, if present
        if fn.endswith(".pkl"):
            fn = fn[:-4]
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        # self._model = KeyedVectors.load_word2vec_format('glove.twitter.27B.25d.txt.word2vec', binary=False)
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.map_list = []
        self.prec5_list = []
        self.prec10_list = []
        self.prec50_list = []
        self.prec_total_list = []
        self.recall_list = []

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        print("\nNow starting search engine 2")
        total_time = datetime.now()
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        # print("len of inverted: ", len(self._indexer.inverted_idx))
        # print("len of posting: ", len(self._indexer.postingDict))
        # print("len of dataSet: ", len(self._indexer.benchDataSet))
        # end_time = datetime.now()
        # print('\n ------ Time To Retrieve: {}'.format(end_time - total_time), " ------\n")
        # print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def run_engine_two(self, fn):
        self.build_index_from_parquet(fn)
        queries_path = "data\\queries_train.tsv"
        all_queries = SearchEngine.query_reader(queries_path)["information_need"]
        for i, q in enumerate(all_queries):
            print(q)
            k, docs = self.search(q)
            # print(docs[:10])
            self.check_engine_quality(i + 1, docs[:300])
            print()
        print("Avg map is:", sum(self.map_list) / len(self.map_list))

    @staticmethod
    def query_reader(queries_path):
        data = pd.read_csv(queries_path, sep="\t")
        return data

    def get_parser(self):
        return self._parser

    def check_engine_quality(self, query_num, list_of_docs):
        """
        :param query_num:
        :param list_of_docs:
        :return: no return. Prints metrics of the query: precision, recall, map.
        """
        benchmark_path = "data\\benchmark_lbls_train.csv"
        df = pd.read_csv(benchmark_path)
        df_prec = df[df['query'] == query_num]
        df_prec = df_prec[df_prec['tweet'].isin(list_of_docs)]
        dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict()
        rmv_lst = []
        ranking = []
        # Add to list for rank
        for doc in list_of_docs:
            try:
                ranking.append(dict_for_data[int(doc)])
            except (KeyError, ValueError):
                rmv_lst.append(doc)
        for d in rmv_lst:
            list_of_docs.remove(d)
        data_df = pd.DataFrame({
            'query': query_num,
            'tweet': list_of_docs,
            'y_true': ranking
        })
        df_rec = df[df['query'] == query_num]
        recall_total = len(df_rec[df_rec['y_true'] == 1.0])
        # print("total relevant docs found with tag 1:", len(data_df[data_df['y_true'] == 1.0]))
        # print("total non-relevant docs found with tag 0:", len(data_df[data_df['y_true'] == 0]))
        # print("found total of", len(df_prec), "tagged docs")

        # Calculate metrics and print
        prec5 = metrics.precision_at_n(data_df, query_num, 5)
        prec10 = metrics.precision_at_n(data_df, query_num, 10)
        prec50 = metrics.precision_at_n(data_df, query_num, 50)
        prec_total = metrics.precision(data_df, True, query_number=query_num)
        map_of_query = metrics.map(data_df)
        recall_val = metrics.recall_single(data_df, recall_total, query_num)
        self.map_list.append(map_of_query)
        self.prec5_list.append(prec5)
        self.prec10_list.append(prec10)
        self.prec50_list.append(prec50)
        self.prec_total_list.append(prec_total)
        self.recall_list.append(recall_val)
        print()
        print("precision at 5 of query", query_num, "is:", prec5)
        print("precision at 10 of query", query_num, "is:", prec10)
        print("precision at 50 of query", query_num, "is:", prec50)
        print("precision of query", query_num, "is:", prec_total)
        print("recall of query", query_num, "is:", recall_val)
        print("map of query", query_num, "is:", map_of_query)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self._method = wordnet_method()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        doc_len = len(documents_list)
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document, doc_len)
        # print('Finished parsing and indexing.')
        # print('Finished merge, start rebuilding posting dict')
        # self._indexer.rebuild_postingDict()
        self._indexer.rebuild_inverted_index()
        # print('finished rebuilding inverted index')
        to_save = (self._indexer.inverted_idx, self._indexer.tweet_dict,
                   self._indexer.reversed_inverted_index)
        utils.save_obj(to_save, 'idx_bench')
        # TODO: change inverted_idx, tweet_dict, reversed_inverted_index, to_save to None
        self._indexer.inverted_idx = None
        self._indexer.tweet_dict = None
        self._indexer.reversed_inverted_index = None
        to_save = None
        # print('Finished rebuilding inverted index and building reversed_inverted_index')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model,
                            method=self._method)
        return searcher.search(query)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')

        # self._indexer.save_index("idx_bench.pkl")
        # indexer_dic = utils.load_obj("idx_bench")
        self._indexer.save_index("idx.pkl")  # TODO - we need to submit this
        indexer_dic = utils.load_obj("idx")  # TODO - we need to submit this

        localMethod = True
        globalMethod = False
        wordNet = False
        spellChecker = False

        if localMethod:
            indexer_dic["local"] = True
        if wordNet:
            indexer_dic["wordnet"] = True
        if spellChecker:
            indexer_dic["spellChecker"] = True

        if globalMethod:
            docs_dic, Sij_dic = compute_Wi(indexer_dic, globalMethod)
            indexer_dic["docs"] = docs_dic
            indexer_dic["global"] = Sij_dic
        else:
            docs_dic = compute_Wi(indexer_dic)
            indexer_dic["docs"] = docs_dic

        # utils.save_obj(indexer_dic, "idx_bench")
        utils.save_obj(indexer_dic, "idx")  # TODO - we need to submit this

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc.
        and assign to self._model, which is passed on to the searcher at
        query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
def __init__(self, config=None):
    self._config = config
    self._parser = Parse()
    self._indexer = Indexer(config)
    self._model = None
    self._method = wordnet_method()
def __init__(self, config=None):
    self._config = config
    self._parser = Parse()
    self._indexer = Indexer(config)
    self._model = None
def __init__(self, config=None):
    self._config = config
    self._parser = Parse(self._config.toStem, self._config.toLemm)
    self._indexer = Indexer(config)
    self._model = None