class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self.config = indexer.config
        self._parser = parser
        self._indexer = indexer
        self.number_of_docs = indexer.num_of_docs
        self._model = model
        # Index structures are taken from the indexer already in memory;
        # loading a pickle here would violate the query-time constraint above.
        self.inverted_index = self._indexer.inverted_idx
        self.document_dict = self._indexer.document_dict
        self.glove_dict = self._indexer.glove_dict
        # GloVe-based ranking is only enabled when embeddings were loaded.
        use_glove = len(self.glove_dict) > 0
        self.ranker = Ranker(self.config, use_glove)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).

        NOTE(review): this implementation forwards `query` straight to
        relevant_docs_from_posting, which enumerates it token by token —
        callers appear to pass an already-parsed token list; confirm, since
        a raw string would be iterated character by character.

        :param query: query tokens.
        :param k: number of top results to return, default to everything.
        :return: a tuple containing the number of relevant search results,
                 and a list of tweet_ids ordered from most to least relevant.
        """
        relevant_docs, query_glove_vec, square_w_iq = \
            self.relevant_docs_from_posting(query)
        ranked_docs = self.ranker.rank_relevant_doc(
            relevant_docs, query_glove_vec, square_w_iq)
        return self.ranker.retrieve_top_k(ranked_docs, k)

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def relevant_docs_from_posting(self, query_as_list):
        """
        Loads the posting information for every query term and collects the
        documents considered relevant (those matching at least half of the
        query terms).

        :param query_as_list: parsed query tokens.
        :return: tuple of
                 - dict mapping tweet_id -> [tf-idf numerator, tf-idf
                   denominator, doc length, glove vector, tweet date],
                 - the mean GloVe vector of the query (zeros if no term has
                   an embedding),
                 - the Euclidean norm of the query's tf-idf weight vector.
        """
        # Map each known/unknown term to the set of query positions it
        # occupies (set size == term frequency inside the query).
        term_to_indices = {}
        query_glove_vec = np.zeros(shape=25)
        for idx, term in enumerate(query_as_list):
            if term in self.glove_dict:
                query_glove_vec += self.glove_dict[term]
            if term in self.inverted_index:
                term_to_indices.setdefault(term, set()).add(idx)
            else:
                # Unknown term: only its presence matters downstream, since
                # calc_idf() returns 0 for it and it is skipped when scoring.
                term_to_indices[term] = {idx}
        # BUGFIX: guard against an empty query — the unconditional division
        # previously produced a NaN vector (numpy divide-by-zero).
        if query_as_list:
            query_glove_vec /= len(query_as_list)

        # A document is relevant once it matches at least p of the query terms.
        p = 0.5
        min_matches_for_relevance = int(len(query_as_list) * p)
        pre_doc_dict = {}
        docs_match_counter = Counter()
        relevant_docs = {}
        w_iq_square = 0.0
        for term, term_indices in term_to_indices.items():
            # Query-side tf-idf weight of this term (0 for unknown terms).
            term_tf_idf = ((len(term_indices) / len(query_as_list)) *
                           self.calc_idf(term))
            w_iq_square += term_tf_idf ** 2
            if term not in self.inverted_index:
                continue
            try:
                for tweet_id in self.inverted_index[term][1]:
                    docs_match_counter[tweet_id] += 1
                    if tweet_id not in pre_doc_dict:
                        # [numerator, denominator, doc length, glove, date]
                        pre_doc_dict[tweet_id] = [
                            0,
                            math.sqrt(self.document_dict[tweet_id][1]),
                            self.inverted_index.get_doc_length(term, tweet_id),
                            self.document_dict[tweet_id][0],
                            self.inverted_index.get_tweet_date(term, tweet_id),
                        ]
                    # Accumulate the cosine-similarity numerator.
                    pre_doc_dict[tweet_id][0] += \
                        self.inverted_index.get_tf_idf(term, tweet_id) * term_tf_idf
                    if tweet_id not in relevant_docs and \
                            docs_match_counter[tweet_id] >= min_matches_for_relevance:
                        relevant_docs[tweet_id] = pre_doc_dict[tweet_id]
            except Exception:
                # Best-effort: skip a term whose posting data is incomplete.
                print('term {} not found in posting'.format(term))
        return relevant_docs, query_glove_vec, math.sqrt(w_iq_square)

    def calculate_tf(self, tweet_term_tuple):
        """
        Calculates the normalized term frequency.

        :param tweet_term_tuple: tuple where index 1 is the number of terms
                                 in the document and index 2 is the raw
                                 frequency of the term in that document.
        :return: frequency / document length.
        """
        num_of_terms_in_doc = tweet_term_tuple[1]
        frequency_term_in_doc = tweet_term_tuple[2]
        return frequency_term_in_doc / num_of_terms_in_doc

    def calculate_idf_BM25(self, term_data):
        """
        Calculates idf according to the BM25 formula.

        :param term_data: sequence whose first element is the term's
                          document frequency.
        :return: BM25 idf value.
        """
        n = self.number_of_docs
        df = term_data[0]
        return math.log(((n - df + 0.5) / (df + 0.5)) + 1)

    def calc_idf(self, term):
        """
        Calculates the standard log10 idf of a term.

        :param term: term to look up.
        :return: log10(N / df), or 0 for a term missing from the index or
                 with a non-positive document frequency.
        """
        n = self.number_of_docs
        if term not in self.inverted_index:
            return 0
        df = self.inverted_index[term][0]
        # BUGFIX: guard against df == 0, which previously raised
        # ZeroDivisionError.
        if df <= 0:
            return 0
        return math.log10(n / df)
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model
        # Cache of term -> {doc_id: tf} for every term already looked up.
        self.terms_searched = {}
        # Corpus size; the parser tracks the running document index.
        self.total_num_of_docs = parser.curr_idx

    ###############################################################################################
    def revocer_doc_ids(self, doc_id_tf_list):
        """
        Decode gap-encoded doc ids in place.

        Each entry's first element stores the delta from the previous doc id;
        this replaces it with the absolute (cumulative) id and returns the
        same list.  NOTE(review): the method name has a typo ("revocer") but
        is kept for backward compatibility with existing callers.

        :param doc_id_tf_list: list of [gap, ...] lists, mutated in place.
        :return: the same list with absolute doc ids.
        """
        running_id = 0
        for entry in doc_id_tf_list:
            running_id += entry[0]
            entry[0] = running_id
        return doc_id_tf_list

    def _relevant_docs_from_posting(self, query_as_list, total_num_of_docs):
        """
        Loads posting information for the (possibly expanded) query and
        builds the tf-idf data the ranker consumes.

        :param query_as_list: query tokens.
        :param total_num_of_docs: N, total documents in the corpus.
        :return: tuple of
                 - dict mapping term -> list of [tf, idf, doc_id] triples,
                 - view of the matched doc ids,
                 - the indexer's file_indexer_dict.
        """
        terms_idf = {}
        doc_id_dict = {}
        query_as_list = self._parser.parse_all_text(
            ' '.join(query_as_list).lower())

        # Query expansion.
        # BUGFIX: the original tested `model is _SpellChecker()`, an identity
        # comparison against a freshly constructed object — always False, so
        # the spell-checker branch was unreachable. isinstance is intended.
        if isinstance(self._model, list):
            expanded_terms = []
            for model in self._model:
                if isinstance(model, _SpellChecker):
                    query_as_list = model.improve_query(query_as_list)
                else:
                    expanded_terms.extend(model.improve_query(query_as_list))
            query_as_list = set(expanded_terms)
        else:
            try:
                query_as_list = self._model.improve_query(query_as_list)
            except AttributeError:
                # No model (or one without improve_query): search unexpanded.
                print("Failed query expansion")

        # Resolve each query term (with lower/upper-case fallback) and pull
        # its document frequency and posting list from the term index.
        term_index = self._indexer.term_indexer_dict
        for new_term in query_as_list:
            try:
                if new_term not in term_index:
                    if new_term.lower() in term_index:
                        new_term = new_term.lower()
                    elif new_term.upper() in term_index:
                        new_term = new_term.upper()
                if new_term in term_index:
                    df = term_index[new_term][0]
                    if df != 0:
                        terms_idf[new_term] = math.log2(
                            float(total_num_of_docs) / float(df))
                    else:
                        terms_idf[new_term] = 0
                    docs_list = term_index[new_term][1]
                    doc_id_dict.update(dict(docs_list))
                    self.terms_searched[new_term] = dict(docs_list)
            except Exception:
                traceback.print_exc()

        doc_id_list = doc_id_dict.keys()
        final_dict = {}
        # First pass: [tf, idf, doc_id] triples for the query terms proper.
        try:
            for term in query_as_list:
                if term in self.terms_searched:
                    term_idf = terms_idf[term]
                    for doc_id in doc_id_list:
                        if doc_id in self.terms_searched[term]:
                            tf = self.terms_searched[term][doc_id]
                            final_dict.setdefault(term, []).append(
                                [tf, term_idf, doc_id])
        except Exception:
            traceback.print_exc()

        # Second pass: add every term of every matched document, so the
        # ranker sees the full document vectors.
        file_index = self._indexer.file_indexer_dict
        for doc_id in doc_id_list:
            for term in file_index[doc_id]:
                tf = file_index[doc_id][term]
                term_idf = math.log2(
                    float(total_num_of_docs) / float(term_index[term][0]))
                final_dict.setdefault(term, []).append([tf, term_idf, doc_id])
        return final_dict, doc_id_list, file_index

    ######################################################################################################################################
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).

        :param query: query string.
        :param k: number of top results to return, default to everything.
        :return: a tuple containing the number of relevant search results,
                 and a list of tweet_ids ordered from most to least relevant.
        """
        query_as_list = self._parser.parse_sentence(query)
        final_dict, doc_id_list, file_indexer_dict = \
            self._relevant_docs_from_posting(query_as_list,
                                             self.total_num_of_docs)
        ranked_docs_list, ranked_docs_dict = self._ranker.rank_relevant_doc(
            final_dict, doc_id_list, query_as_list, file_indexer_dict)
        top_k_doc_idxs = self._ranker.retrieve_top_k(ranked_docs_list, k)
        # Translate internal doc indices back to tweet ids.
        results_top_k = [self._parser.doc_idx_tweet_id[key]
                         for key in top_k_doc_idxs]
        return len(ranked_docs_list), results_top_k

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def relevant_docs_from_posting(self, query_as_list):
        """
        Counts, per document, how many query terms it contains.

        :param query_as_list: parsed query tokens.
        :return: dict mapping doc_id -> number of matching query terms.
        """
        relevant_docs = {}
        for term in query_as_list:
            for doc_id, tf in self._indexer.get_term_posting_list(term):
                relevant_docs[doc_id] = relevant_docs.get(doc_id, 0) + 1
        return relevant_docs