def search_and_rank_query(query, inverted_index, k, output_path, vectorDict, stemming):
    # parse the query.
    p = Parse(stemming)
    query_as_dict = p.parse_sentence(query, term_dict={})
    if len(query_as_dict.keys()) == 0:
        return []
    searcher = Searcher(inverted_index, output_path)
    # search for relevant docs given the query. min threshold is 100 docs.
    relevant_docs = searcher.relevant_docs_from_posting(query_as_dict, 100, output_path)
    # rank those docs and keep the top 100 of them. ranked_docs format: {doc_id: score}
    ranked_docs, sorted_keys = searcher.ranker.rank_relevant_doc(
        relevant_docs, query_as_dict, inverted_index, output_path, vectorDict)
    top_100_keys = searcher.ranker.retrieve_top_k(sorted_keys, 100)
    # build the association matrix and expand the query.
    expanded_query = local_method.build_association_matrix(
        inverted_index, query_as_dict, top_100_keys, vectorDict)
    # search again, with the expanded query.
    relevant_docs = searcher.relevant_docs_from_posting(expanded_query, k, output_path)
    # rank again and return the top k (given as input) results. format: {doc_id: score}
    ranked_docs, sorted_keys = searcher.ranker.rank_relevant_doc(
        relevant_docs, expanded_query, inverted_index, output_path, vectorDict)
    top_k_keys = searcher.ranker.retrieve_top_k(sorted_keys, k)
    top_K = [ranked_docs[doc_id] for doc_id in top_k_keys]
    return top_K
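# The expansion step above delegates to local_method.build_association_matrix,
# which lives elsewhere in this project. As a rough illustration of the
# underlying technique (not this project's API), here is a minimal,
# self-contained sketch of association-cluster query expansion over the
# top-ranked documents; expand_query_local and the doc_term_freqs format are
# illustrative assumptions.
import numpy as np

def expand_query_local(query_terms, doc_term_freqs, n_new_terms=3):
    """doc_term_freqs: {doc_id: {term: frequency}} for the top-ranked docs."""
    vocab = sorted({t for freqs in doc_term_freqs.values() for t in freqs})
    idx = {t: i for i, t in enumerate(vocab)}
    # term-document frequency matrix over the local (top-ranked) doc set
    F = np.zeros((len(vocab), len(doc_term_freqs)))
    for j, freqs in enumerate(doc_term_freqs.values()):
        for t, f in freqs.items():
            F[idx[t], j] = f
    C = F @ F.T  # association matrix: c_uv = sum over docs of f_u,d * f_v,d
    expanded = list(query_terms)
    for q in query_terms:
        if q not in idx:
            continue
        u = idx[q]
        # normalized association: s_uv = c_uv / (c_uu + c_vv - c_uv)
        s = C[u] / (C[u, u] + np.diag(C) - C[u])
        # add the most strongly associated new term for this query term
        for v in np.argsort(s)[::-1]:
            if vocab[v] not in expanded:
                expanded.append(vocab[v])
                break
    return expanded[:len(query_terms) + n_new_terms]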
def search_and_rank_query(query, inverted_index, k):
    p = Parse()
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index)
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list)
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
def search_and_rank_query(query, inverted_index, k, config):
    p = Parse(config.toStem)
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index, config)
    relevant_docs, documents_dict = searcher.relevant_docs_from_posting(query_as_list)
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs, documents_dict, query_as_list)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
def search_and_rank_query(config, query, inverted_index, inverted_docs, k, avg_doc_len):
    p = Parse(config)
    query_as_list = p.parse_sentence(query)[0]
    searcher = Searcher(config, inverted_index, inverted_docs)
    query_dict = searcher.get_query_dict(query_as_list)
    relevant_docs, query_vector = searcher.relevant_docs_from_posting(query_dict)
    ranked_docs = searcher.ranker.rank_relevant_docs(relevant_docs, query_vector, avg_doc_len)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
def search_and_rank_query(query, inverted_index, document_dict, k, num_of_docs,
                          avg_length_per_doc, glove_dict, config):
    # glove_dict is an explicit parameter here: the original referenced it
    # without defining it in this scope.
    p = Parse(config.toStem)
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index, document_dict, num_of_docs,
                        avg_length_per_doc, glove_dict, config)
    relevant_docs, query_glove_vec, query_vec = searcher.relevant_docs_from_posting(query_as_list[0])
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs, query_glove_vec, query_vec)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
def search_and_rank_query(query, inverted_index, k, tweet_dict):
    p = Parse()
    to_return = []
    for q in query:
        query_as_list = p.parse_sentence(q)
        searcher = Searcher(inverted_index)
        relevant_docs = searcher.relevant_docs_from_posting(query_as_list)
        ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs, tweet_dict)
        ans = searcher.ranker.retrieve_top_k(ranked_docs, k)
        to_return.extend(ans)
    return to_return
def search_and_rank_query(query, inverted_index, k, docs_data=None):
    global config, number_of_documents
    p = Parse(config.toStem)
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index, config, docs_data)
    relevant_docs, query_weight = searcher.relevant_docs_from_posting(query_as_list)
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs, query_weight, number_of_documents)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
def search_and_rank_query(query, inverted_index, k, stemming, output_path):
    p = Parse(stemming)
    query_as_list = [term.text.lower() for term in p.parse_sentence(query)]
    searcher = Searcher(inverted_index, os.path.join(output_path, PostingFile))
    w_of_term_in_query = searcher.CalculateW(query_as_list)
    relevant_docs = searcher.relevant_docs_from_posting(list(w_of_term_in_query.keys()))
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs, w_of_term_in_query)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
def search_and_rank_query(query, inverted_index, k, config=None):
    """
    Searches for documents relevant to the query and ranks them.
    :param query: query string
    :param inverted_index: inverted index dictionary
    :param k: number of top results to return
    :param config: configuration object; must expose a toStem flag
    :return: the top k ranked documents
    """
    p = Parse(config.toStem)
    query_as_list = p.parse_sentence(query)
    searcher = Searcher(inverted_index, config)
    relevant_docs = searcher.relevant_docs_from_posting(query_as_list)
    ranked_docs = searcher.ranker.rank_relevant_doc(relevant_docs)
    return searcher.ranker.retrieve_top_k(ranked_docs, k)
def search_and_rank_query(queries, inverted_index, k, lda):
    # config was previously assumed to be a global; define it locally so the
    # function is self-contained.
    config = ConfigClass()
    indexer = Indexer(config)
    to_stem = config.get__toStem()
    queries_list = []
    if type(queries) is list:  # queries is a list of query strings
        for query in queries:
            queries_list.append(query)
    if type(queries) is str:  # queries is a path to a text file
        with open(queries, encoding='utf-8') as f:
            for line in f:
                if line != "\n":
                    queries_list.append(line)
    all_results = []
    query_num = 1
    for query in queries_list:
        p = Parse(config)
        # parse the query for the LDA model
        tokenized_query = p.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [w for w in original_query_list if w not in stop_words]
        # find long terms and upper-case words
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # fully capitalized, e.g. NBA
                if word.find("\n") != -1:
                    word = word[:-1]
                if word.find(".") != -1:
                    word = word[:-1]
                if not to_stem:
                    tokenized_query.append(word)
                else:
                    stem_word = Stemmer().stem_term(word)
                    tokenized_query.append(stem_word)
            elif len(word) > 1 and re.search('[a-zA-Z]', word) and word[0].isupper():
                # capitalized first letter: try to collect a multi-word term,
                # e.g. Donald Trump
                term = word
                if original_query_list.index(word) + 1 < len(original_query_list):
                    index = original_query_list.index(word) + 1
                    while index < len(original_query_list):  # collect the whole term
                        if (len(original_query_list[index]) > 1
                                and re.search('[a-zA-Z]', original_query_list[index])
                                and original_query_list[index][0].isupper()):
                            new_word2 = (original_query_list[index][0]
                                         + original_query_list[index][1:].lower())
                            term += " " + new_word2
                            index += 1
                            len_term += 1
                        else:
                            break
                if len_term > 1:
                    tokenized_query.append(term)
            counter += len_term
        # expand the query with WordNet
        wn = WordNet_ranker(tokenized_query)
        WordNet_query = wn.extend_query()
        searcher = Searcher(inverted_index)
        # find relevant docs for the expanded query
        relevant_docs = searcher.relevant_docs_from_posting(WordNet_query)
        # find LDA-relevant docs
        cosine_dict = lda.prob(tokenized_query)
        dict_of_cosine_tweets = {}
        # list out keys and values separately to map line numbers back to tweet ids
        key_list = list(indexer.tweet_line_dict.keys())
        val_list = list(indexer.tweet_line_dict.values())
        for index in cosine_dict.keys():  # find the tweet id
            dict_of_cosine_tweets[key_list[val_list.index(index)]] = cosine_dict[index]
        # combine the term-based and LDA scores for tweets that appear in both
        final_dict = {}
        for tweet_id in dict_of_cosine_tweets.keys():
            if k > len(final_dict) and tweet_id in relevant_docs:
                final_dict[tweet_id] = relevant_docs[tweet_id] + dict_of_cosine_tweets[tweet_id]
        sorted_cosine_tweets = {
            t_id: score
            for t_id, score in sorted(final_dict.items(), key=lambda item: item[1], reverse=True)
        }
        final_tweets = list(sorted_cosine_tweets.keys())
        # pad with purely term-based results if there are fewer than k combined results
        if k > len(final_tweets):
            for key in relevant_docs.keys():
                if key not in final_dict:
                    if k > len(final_tweets):
                        final_tweets.append(key)
                    if k == len(final_tweets):
                        break
        # write the results into a csv file
        tweet_id_num = 1
        with open('results.csv', 'a', encoding='utf-8') as fp:
            for tweet_id in final_tweets:
                fp.write("Tweet id: {" + tweet_id + "} Score: {" + str(tweet_id_num) + "}\n")
                tweet_id_num += 1
        query_num += 1
        all_results.append(final_tweets)
    # return the top k tweets for every query
    return all_results
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        r = ReadFile()
        documents_list = r.read_file(fn)
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # self._indexer.save_index('idx_bench.pkl')
        # self._indexer.save_index('inverted_idx.pkl')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        query_as_list = self._parser.parse_sentence(query)
        # expand the query with close WordNet synonyms (Wu-Palmer similarity > 0.8)
        add_to_query = {}
        for q in query_as_list:
            for syn in wordnet.synsets(q):
                for lemma in syn.lemmas():
                    if lemma.name() == q.lower():
                        continue
                    score = wordnet.synsets(q)[0].wup_similarity(syn)
                    if score is not None and score > 0.8:
                        add_to_query[lemma.name()] = score
        if len(add_to_query) > 3:
            # keep only the three highest-scoring synonyms
            add_to_query = sorted(add_to_query.items(), key=lambda item: item[1], reverse=True)
            query_as_list.extend([add_to_query[0][0], add_to_query[1][0], add_to_query[2][0]])
        else:
            query_as_list.extend(add_to_query)
        new_query = ' '.join(query_as_list)
        relevant_docs = searcher.search(new_query)
        return relevant_docs

    @property
    def indexer(self):
        return self._indexer
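# A standalone sketch of the Wu-Palmer synonym filter used in search() above,
# assuming the NLTK WordNet corpus is available (nltk.download('wordnet')).
# The example term "car" is illustrative.
from nltk.corpus import wordnet

term = "car"
base = wordnet.synsets(term)[0]  # most common sense of the term
for syn in wordnet.synsets(term):
    score = base.wup_similarity(syn)
    if score is None or score <= 0.8:
        continue
    for lemma in syn.lemmas():
        if lemma.name().lower() != term:
            print(lemma.name(), round(score, 2))  # e.g. auto 1.0, automobile 1.0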
class SearchEngine:
    num_of_tweets = 0

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    def get_num_of_tweets(self):
        return self.num_of_tweets

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        self.num_of_tweets = len(documents_list)
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            parsed_document.num_of_tweets = self.num_of_tweets
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # TODO: check indexer saving
        utils.save_obj(self._indexer.inverted_idx, "inverted_idx")

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        inverted_idx = self._indexer.load_index(fn)
        return inverted_idx

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [w for w in original_query_list if w not in stop_words]
        # find long terms and upper-case words
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # fully capitalized, e.g. NBA
                if word.find("\n") != -1:
                    word = word[:-1]
                if word.find(".") != -1:
                    word = word[:-1]
                query_as_list.append(word)
            elif len(word) > 1 and re.search('[a-zA-Z]', word) and word[0].isupper():
                # capitalized first letter: try to collect a multi-word term,
                # e.g. Donald Trump
                term = word
                if original_query_list.index(word) + 1 < len(original_query_list):
                    index = original_query_list.index(word) + 1
                    while index < len(original_query_list):  # collect the whole term
                        if (len(original_query_list[index]) > 1
                                and re.search('[a-zA-Z]', original_query_list[index])
                                and original_query_list[index][0].isupper()):
                            new_word2 = (original_query_list[index][0]
                                         + original_query_list[index][1:].lower())
                            term += " " + new_word2
                            index += 1
                            len_term += 1
                        else:
                            break
                if len_term > 1:
                    query_as_list.append(term)
            counter += len_term
        wordNet = WordNet_ranker(query_as_list)
        new_query = wordNet.extend_query()
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(new_query)  # TODO: add K results
class SearchEngine:
    GLOVE_PATH_SERVER = '../../../../glove.twitter.27B.25d.txt'
    GLOVE_PATH_LOCAL = './model/model.txt'

    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = self.initialize_glove_dict()
        self._indexer.set_glove_dict(self.model)

    def initialize_glove_dict(self):
        # load the pre-trained GloVe vectors into a {word: vector} dictionary
        glove_dict = {}
        with open(self.GLOVE_PATH_LOCAL, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                glove_dict[word] = vector
        return glove_dict

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine1', self._config.get_out_path())
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index('idx_engine1.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        l_res = searcher.search(query_as_list[0])
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
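# A minimal sketch of how a {word: vector} GloVe dictionary like the one loaded
# above is typically used for ranking: average the word vectors into a single
# embedding and compare query and document by cosine similarity. embed and
# cosine are illustrative helpers, not functions from this codebase.
import numpy as np

def embed(tokens, glove_dict, dim=25):
    """Average the GloVe vectors of the tokens that are in the vocabulary."""
    vecs = [glove_dict[t] for t in tokens if t in glove_dict]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim, dtype="float32")

def cosine(u, v):
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0

# score = cosine(embed(query_tokens, glove_dict), embed(doc_tokens, glove_dict))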
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        if not config:
            self._config = ConfigClass()
        else:
            self._config = config
        self._parser = Parse()
        self._indexer = Indexer(self._config)
        self._model = None
        self._reader = ReadFile(self._config.get__corpusPath())

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.check_pending_list()
        self._indexer.calculate_and_add_idf()
        self._indexer.calculate_sigma_Wij()
        self._indexer.calculate_avg_doc_len()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_tuple = self._parser.parse_sentence(query)
        query_as_list = query_as_tuple[0] + query_as_tuple[1]
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query_as_list, k)
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        r = ReadFile()
        documents_list = r.read_file(fn)
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.save_index('inverted_idx.pkl')
        # self._indexer.save_index('idx_bench.pkl')
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        # spell-check the query terms
        query_as_list = self._parser.parse_sentence(query)
        inverted_idx = self.indexer.inverted_idx
        spell = SpellChecker()
        misspelled = spell.unknown(query_as_list)
        assist = [x.lower() for x in query_as_list]  # all the query terms in lower case
        for word in misspelled:
            # if the word is in the inverted index, no correction is needed
            if word.upper() in inverted_idx.keys() or word.lower() in inverted_idx.keys() or ' ' in word:
                continue
            word_idx = assist.index(word)
            # all the suggested corrections with edit distance 1
            corrections = spell.edit_distance_1(word)
            corrections_dict = {}
            # check whether each suggested correction is in the inverted index
            # and collect the frequency of each correction
            for correction in corrections:
                if correction.upper() in inverted_idx.keys():
                    corrections_dict[correction] = inverted_idx[correction.upper()]
                if correction.lower() in inverted_idx.keys():
                    corrections_dict[correction] = inverted_idx[correction.lower()]
            if corrections_dict:
                # choose the most common correction
                query_as_list[word_idx] = max(corrections_dict, key=corrections_dict.get)
            else:
                query_as_list[word_idx] = spell.correction(word)
        new_query = ' '.join(query_as_list)
        relevant_docs = searcher.search(new_query)
        return relevant_docs

    @property
    def indexer(self):
        return self._indexer
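# A standalone sketch of the pyspellchecker calls used in search() above. The
# misspelled example word is illustrative; output depends on the package's
# word-frequency list.
from spellchecker import SpellChecker

spell = SpellChecker()
misspelled = spell.unknown(["presidant", "election"])  # e.g. {"presidant"}
for word in misspelled:
    print(spell.correction(word))            # single best correction, e.g. "president"
    print(spell.candidates(word))            # known words within a small edit distance
    print(len(spell.edit_distance_1(word)))  # raw distance-1 edits (a large set)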
class Searcher:

    def __init__(self, inverted_index, corpus_size, average_length, output_path):
        """
        :param inverted_index: dictionary of inverted index
        """
        self.parser = Parse()
        self.ranker = Ranker()
        self.inverted_index = inverted_index
        self.corpus_size = corpus_size
        self.average_length = average_length
        self.output_path = output_path

    def calculate_doc_scores(self, term, relevant_docs, posting_pointer, posting_file):
        """
        Retrieves the term's posting file and calculates a score for each relevant
        document. Adds the relevant documents to the relevant_docs dictionary.
        :param term: query term for retrieval
        :param relevant_docs: dictionary of relevant documents
        :param posting_pointer: pointer (name) of the relevant posting file
        :param posting_file: relevant posting file
        :return: a tuple of the current relevant posting pointer and posting file
        """
        # retrieve the term's posting file, unless it is already the one in memory
        if posting_pointer is None or term[0].lower() != posting_pointer or posting_file is None:
            posting_pointer = self.inverted_index[term][POSTING_POINTER_INDEX]
            posting_file = utils.load_obj(self.output_path + str(posting_pointer))
        inverted_document_frequency = log(self.corpus_size / self.inverted_index[term][DF_INDEX])
        documents = posting_file[term]
        for document in documents:
            # calculate score
            document_id = document[DOCUMENT_ID_INDEX]
            doc_weight = document[FREQUENCY_INDEX]
            normalized_length = document[LENGTH_INDEX] / self.average_length
            if document_id not in relevant_docs:
                relevant_docs[document_id] = 0
            # calculate score according to the BM25+ weighting formula
            relevant_docs[document_id] += inverted_document_frequency * (
                float(doc_weight * (K1 + 1)) / (doc_weight + K1 * (1 - B + B * normalized_length))
                + DELTA)
        return posting_pointer, posting_file

    def relevant_docs_from_posting(self, query):
        """
        Search and retrieve relevant documents for the query.
        Calculate the similarity score for each document.
        :param query: query
        :return: dictionary of relevant documents and their scores
        """
        # parse the query according to the same parsing rules as the corpus
        entities = dict()
        term_dict = dict()
        parsed_query = self.parser.parse_sentence(query, entities)
        self.parser.parse_capital_letters(parsed_query, term_dict)
        # perform spell correction
        spell_checker = SpellChecker()
        corrected_terms = []
        misspelled_terms = spell_checker.unknown([*term_dict.keys()])
        for term in misspelled_terms:
            # only correct terms that aren't in the inverted dictionary;
            # terms in the dictionary are considered correct for retrieval
            if term not in self.inverted_index:
                candidates = spell_checker.candidates(term)
                if term in candidates:
                    # remove duplicate, originally correct terms
                    candidates.remove(term)
                corrected_terms.extend(candidates)
        # sort the parsed query alphabetically for optimal posting file retrieval:
        # this way at most one posting file is held in memory at a time
        sorted_query = [*term_dict.keys()] + [*entities.keys()] + corrected_terms
        sorted_query.sort()
        # dictionary holding all relevant documents (at least one query term
        # appeared in the document), format: {document_id: score}
        relevant_docs = dict()
        posting_file = None  # currently used posting file from disk
        posting_pointer = None  # current posting's pointer
        for term in sorted_query:
            # check if the term exists in the inverted dictionary in either lower or upper form
            if term in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term, relevant_docs, posting_pointer, posting_file)
            elif term.islower() and term.upper() in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term.upper(), relevant_docs, posting_pointer, posting_file)
            elif term.isupper() and term.lower() in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term.lower(), relevant_docs, posting_pointer, posting_file)
        return relevant_docs
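# A worked example of the BM25+ term-scoring formula implemented in
# calculate_doc_scores above. The constants K1, B and DELTA are defined
# elsewhere in this codebase; the values below are common defaults, assumed
# here for illustration only.
from math import log

K1, B, DELTA = 1.2, 0.75, 1.0

def bm25_plus_term_score(tf, df, doc_len, corpus_size, avg_len):
    """Contribution of one query term to one document's BM25+ score."""
    idf = log(corpus_size / df)
    normalized_length = doc_len / avg_len
    return idf * (tf * (K1 + 1) / (tf + K1 * (1 - B + B * normalized_length)) + DELTA)

# term appears 3 times in a 20-term doc; document frequency 50 in a
# 10,000-doc corpus whose average document length is 25
print(bm25_plus_term_score(tf=3, df=50, doc_len=20, corpus_size=10_000, avg_len=25))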
class SearchEngine:

    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine2', self._config.get_out_path())
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index('idx_engine2.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        list_copy = list(query_as_list[0])
        # expand each query term with its closest WordNet synonym (if any)
        tagged_words = pos_tag(list_copy)
        for word in tagged_words:
            wn_tag = Wordnet.get_wordnet_pos(word[1])
            synonym = Wordnet.get_closest_term(word[0], wn_tag)
            if synonym is not None:
                list_copy.append(synonym)
        l_res = searcher.search(list_copy)
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self.invertedIndex = self._indexer.inverted_idx
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        # change this if we need to read more than one parquet file
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # needed to pass the automated tests: sometimes inverted_idx fails to
        # save in the testing system
        utils.save_obj({}, "inverted_idx")
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # sometimes we get an empty tweet; no need to index it
            if parsed_document.doc_length != 0:
                # index the document data
                self._indexer.add_new_doc(parsed_document)
        # Insert entities into the indexer and posting files
        self._indexer.addEntities(self._parser.suspectedEntityDict)
        # Sort the posting files
        self._indexer.update_idfWij(idx)
        self._indexer.save_index("inverted_idx")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        self._parser.suspectedEntityDict = {}
        query_as_list = self._parser.parse_sentence(query)
        # add entities to the query: entities are not added to query_as_list by
        # parse_sentence, and suspectedEntityDict holds only entities from the
        # original query
        for entity in self._parser.suspectedEntityDict:
            query_as_list.append(entity)
        # clear entity fragments from the query
        query_as_list = self.clearEntitiesParts(query_as_list)
        # WordNet expansion: add up to two synonyms per term, marked with "~"
        extendedQ = copy.deepcopy(query_as_list)
        for term in query_as_list:
            synset = wordnet.synsets(term)
            try:
                for i in range(2):
                    Synonym = synset[i].lemmas()[0].name()
                    if term.lower() != Synonym.lower() and Synonym + "~" not in extendedQ:
                        Synonym += "~"
                        extendedQ.append(Synonym)
            except IndexError:  # fewer than two synsets for this term
                continue
        query_as_list = extendedQ
        # returns a tuple (number of results, relevantDocIdList)
        numberOFresults, relevantDocIdList = searcher.search(query_as_list)
        return numberOFresults, relevantDocIdList

    def clearEntitiesParts(self, query):
        modifiedQuery_l = copy.deepcopy(query)
        termsToRemoveFromQuery = []
        # At this point, if the query holds an entity, it holds both the terms
        # that build the entity and the entity itself as one term. For example,
        # given ['BILL', 'Gates', 'blabla', 'bla', 'Bill Gates'], if "Bill Gates"
        # is a known entity, cleaning leaves ['blabla', 'bla', 'Bill Gates'].
        for term in query:
            # clean parts of entities from the query if the entity exists in the inverted index
            if " " in term:
                if term in self.invertedIndex:  # entity that is in the inverted index
                    entity_l = term.split(" ")
                    for word in entity_l:
                        # remove both case variants of each entity part
                        termsToRemoveFromQuery.append(word.upper())
                        termsToRemoveFromQuery.append(word.lower())
                else:  # unknown entity
                    modifiedQuery_l.remove(term)
        # clear all appearances of each marked token from the modified query
        for word in termsToRemoveFromQuery:
            modifiedQuery_l[:] = [x for x in modifiedQuery_l if x != word]
        return modifiedQuery_l