def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list = self.parser.parse_sentence(query)
    if self.is_thesaurus:
        query_as_list_with_synonym = self.thesaurus_method(query_as_list[0])
        query_as_list = [query_as_list_with_synonym, None]
    relevant_docs = self.relevant_docs_from_posting(query_as_list)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    if k:
        ranked_doc_ids = Ranker.retrieve_top_k(ranked_doc_ids, k)
    n_relevant = len(ranked_doc_ids)
    return n_relevant, ranked_doc_ids
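# A minimal sketch of what a thesaurus_method-style synonym expander could look
# like, assuming NLTK's WordNet corpus is available (nltk.download('wordnet')).
# The helper name expand_with_synonyms and the "one synonym per token" policy
# are illustrative assumptions, not this project's actual implementation.
from nltk.corpus import wordnet


def expand_with_synonyms(tokens):
    """Return the original tokens plus at most one WordNet synonym per token."""
    expanded = list(tokens)
    for token in tokens:
        synonyms = {
            lemma.name().replace('_', ' ')
            for synset in wordnet.synsets(token)
            for lemma in synset.lemmas()
            if lemma.name().lower() != token.lower()
        }
        if synonyms:
            expanded.append(sorted(synonyms)[0])  # keep one synonym per token
    return expanded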
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model
        self._config = self._indexer.config
        self._method_class = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.load_index("idx_bench.pkl")
        query_as_list = self._parser.parse_sentence(query)[0]
        query_dict, max_tf_query = self.get_query_dict(query_as_list)
        expanded_query_dict = self._method_class.expand_query(query_dict, max_tf_query)
        return self.search_helper(expanded_query_dict, k,
                                  self._method_class.p_threshold,
                                  self._method_class.p_rel)

    # create {term: tf} for the query, normalized by the maximal term frequency
    def get_query_dict(self, tokenized_query):
        max_tf = 1
        query_dict = {}
        for index, term in enumerate(tokenized_query):
            if term not in query_dict:
                query_dict[term] = 1
            else:
                query_dict[term] += 1
                if query_dict[term] > max_tf:
                    max_tf = query_dict[term]
        for term in query_dict:
            query_dict[term] /= max_tf
        return query_dict, max_tf

    def relevant_docs_from_posting(self, query_dict, p_threshold=0):
        relevant_docs = {}
        query_vector = np.zeros(len(query_dict), dtype=float)
        full_cells_threshold = round(p_threshold * len(query_vector))
        for idx, term in enumerate(list(query_dict.keys())):
            try:
                docs_index = self.get_doc_index()
                tweets_per_term = self._indexer.get_term_posting_tweets_dict(term)
                for tweet_id, vals in tweets_per_term.items():
                    doc_date = docs_index[tweet_id][1]
                    if tweet_id not in relevant_docs.keys():
                        relevant_docs[tweet_id] = [np.zeros(len(query_dict), dtype=float), doc_date]
                    # Wij - update tweet vector at the term's index with tf-idf
                    tf_tweet = vals[0]
                    idf_term = self._indexer.get_term_idf(term)
                    relevant_docs[tweet_id][0][idx] = tf_tweet * idf_term
                    # Wiq - update query vector at the term's index with tf-idf
                    tf_query = query_dict[term]
                    query_vector[idx] = tf_query * idf_term
            except:
                pass

        # OPTIMIZATION: drop documents that match fewer query terms than the threshold
        for doc in list(relevant_docs.keys()):
            if np.count_nonzero(relevant_docs[doc][0]) < full_cells_threshold:
                del relevant_docs[doc]
        return relevant_docs, query_vector

    def set_method_type(self, method_type):
        if method_type == '1':
            self._method_class = LocalMethod(self)
        elif method_type == '2':
            self._method_class = Thesaurus(self)
        elif method_type == '3':
            self._method_class = Wordnet(self)
        elif method_type == '4':
            self._method_class = MySpellCheker(self)
        # elif ... more methods

    def get_term_index(self):
        return self._indexer.inverted_idx_term

    def get_doc_index(self):
        return self._indexer.inverted_idx_doc

    def is_term_in_index(self, term):
        return term in self._indexer.inverted_idx_term

    def search_helper(self, query_dict, k, p_threshold=0, p_relevant=0):
        relevant_docs, query_vector = self.relevant_docs_from_posting(query_dict, p_threshold)
        n_relevant = len(relevant_docs)
        ranked_docs = self._ranker.rank_relevant_docs(relevant_docs, query_vector)
        return n_relevant, self._ranker.retrieve_top_k(ranked_docs, k, p_relevant)
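# Hedged usage sketch for the Searcher above. The module paths and the
# ConfigClass / Parse / Indexer constructors are assumptions about the
# surrounding project; only set_method_type() and search() are taken from the
# class as written.
from configuration import ConfigClass  # assumed module layout
from parser_module import Parse        # assumed module layout
from indexer import Indexer            # assumed module layout

if __name__ == "__main__":
    config = ConfigClass()
    parser = Parse(config)
    indexer = Indexer(config)

    searcher = Searcher(parser, indexer)
    searcher.set_method_type('3')  # '3' selects the Wordnet query-expansion class
    n_relevant, top_ids = searcher.search("covid vaccine trial", k=10)
    print(n_relevant, top_ids[:3])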
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self.config = indexer.config
        self._parser = parser
        self._indexer = indexer
        self.number_of_docs = indexer.num_of_docs
        self._model = model
        # self.inverted_index, self.document_dict = self._indexer.load_index("idx_engine1.pkl")
        self.inverted_index, self.document_dict = self._indexer.inverted_idx, self._indexer.document_dict
        self.glove_dict = self._indexer.glove_dict
        use_glove = len(self.glove_dict) > 0
        self.ranker = Ranker(self.config, use_glove)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        relevant_docs, query_glove_vec, square_w_iq = self.relevant_docs_from_posting(query)
        ranked_docs = self.ranker.rank_relevant_doc(relevant_docs, query_glove_vec, square_w_iq)
        top_k = self.ranker.retrieve_top_k(ranked_docs, k)
        return top_k

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and counts the amount of relevant documents per term.
        :param query_as_list: parsed query tokens
        :return: tuple of (relevant documents dict, query GloVe vector, query weight norm).
        """
        term_to_indices = {}
        max_tf = 0
        query_glove_vec = np.zeros(shape=25)
        for idx, term in enumerate(query_as_list):
            if term in self.glove_dict:
                query_glove_vec += self.glove_dict[term]
            try:
                if term in self.inverted_index:
                    if term not in term_to_indices:
                        idx_set = {idx}
                        if len(idx_set) > max_tf:
                            max_tf = len(idx_set)
                        term_to_indices[term] = idx_set
                    else:
                        # term already in the term dict, so only update its index set
                        term_to_indices[term].add(idx)
                        if len(term_to_indices[term]) > max_tf:
                            max_tf = len(term_to_indices[term])
                else:
                    # term is unknown
                    idx_set = {idx}
                    if len(idx_set) > max_tf:
                        max_tf = len(idx_set)
                    term_to_indices[term] = idx_set
            except:
                print('term {} not found in inverted index'.format(term))

        query_glove_vec /= len(query_as_list)

        p = 0.5
        min_num_of_words_to_relevant = int(len(query_as_list) * p)
        pre_doc_dict = {}
        pre_doc_dict_counter = Counter()
        relevant_docs = {}
        w_iq_square = 0
        for term, term_indices in term_to_indices.items():
            term_tf_idf = (len(term_indices) / len(query_as_list)) * self.calc_idf(term)
            w_iq_square += math.pow(term_tf_idf, 2)
            try:
                # if doc_list is not None:
                if term in self.inverted_index:
                    # for doc_tuple in doc_list.items():
                    for tweet_id in self.inverted_index[term][1]:
                        pre_doc_dict_counter[tweet_id] += 1
                        if tweet_id not in pre_doc_dict:
                            # example -> tf_idf_vec
                            # [[tf1, tf2, ...]
                            #  [idf1, idf2, ...]]
                            tf_idf_numerator = 0
                            tf_idf_denominator = math.sqrt(self.document_dict[tweet_id][1])
                            tweet_doc_length = self.inverted_index.get_doc_length(term, tweet_id)
                            glove_vec = self.document_dict[tweet_id][0]
                            tweet_date = self.inverted_index.get_tweet_date(term, tweet_id)
                            pre_doc_dict[tweet_id] = [tf_idf_numerator, tf_idf_denominator,
                                                      tweet_doc_length, glove_vec, tweet_date]
                        pre_doc_dict[tweet_id][0] += self.inverted_index.get_tf_idf(term, tweet_id) * term_tf_idf
                        if tweet_id not in relevant_docs and \
                                pre_doc_dict_counter[tweet_id] >= min_num_of_words_to_relevant:
                            relevant_docs[tweet_id] = pre_doc_dict[tweet_id]
            except:
                print('term {} not found in posting'.format(term))
        return relevant_docs, query_glove_vec, math.sqrt(w_iq_square)

    def calculate_tf(self, tweet_term_tuple):
        """
        Calculates the normalized term frequency.
        :param tweet_term_tuple: tuple containing all information of the tweet for the term.
        :return: normalized tf
        """
        num_of_terms_in_doc = tweet_term_tuple[1]
        frequency_term_in_doc = tweet_term_tuple[2]
        tf = frequency_term_in_doc / num_of_terms_in_doc
        return tf

    def calculate_idf_BM25(self, term_data):
        """
        Calculates idf according to the BM25 formula.
        :param term_data: term statistics; term_data[0] is the document frequency.
        :return: BM25 idf value
        """
        n = self.number_of_docs
        df = term_data[0]
        idf = math.log(((n - df + 0.5) / (df + 0.5)) + 1)
        return idf

    def calc_idf(self, term):
        """
        Calculates the idf of a term.
        :param term: term
        :return: idf value, or 0 if the term is not in the index
        """
        n = self.number_of_docs
        if term not in self.inverted_index:
            return 0
        df = self.inverted_index[term][0]
        idf = math.log10(n / df)
        return idf
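# A minimal sketch of how a 25-dimensional glove_dict (as consumed above via
# self._indexer.glove_dict and np.zeros(shape=25)) could be built, assuming the
# publicly available glove.twitter.27B.25d.txt file. The loader name and file
# path are assumptions; how the indexer actually populates glove_dict may differ.
import numpy as np


def load_glove_dict(path="glove.twitter.27B.25d.txt"):
    """Map each term to its 25-dimensional GloVe vector."""
    glove_dict = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(' ')
            glove_dict[parts[0]] = np.asarray(parts[1:], dtype=float)
    return glove_dict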
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model
        self.terms_searched = {}
        self.total_num_of_docs = parser.curr_idx

    ###############################################################################################
    # ours
    # the big matrix is the base for the functions

    # decode gap-encoded doc ids back into absolute ids; returns a list of lists
    def revocer_doc_ids(self, doc_id_tf_list):
        tmp_add = 0
        for tmp_list in doc_id_tf_list:
            tmp_add += tmp_list[0]
            tmp_list[0] = tmp_add
        return doc_id_tf_list

    # N = total amount of documents in the corpus
    def _relevant_docs_from_posting(self, query_as_list, total_num_of_docs):
        """
        This function loads the posting list and counts the amount of relevant documents per term.
        :param query_as_list: query
        :return: tuple of (final_dict mapping term to [tf, idf, doc_id] triples, doc_id_list, file_indexer_dict).
        """
        terms_idf = {}
        similar_terms = []
        doc_id_dict = {}
        query_as_list = self._parser.parse_all_text(' '.join(query_as_list).lower())

        # if self._model is not None:
        if isinstance(self._model, list):
            query_as_list_to_extend = []
            for model in self._model:
                if isinstance(model, _SpellChecker):
                    query_as_list = model.improve_query(query_as_list)
                else:
                    query_as_list_to_extend.extend(model.improve_query(query_as_list))
            query_as_list = set(query_as_list_to_extend)
        else:
            try:
                query_as_list = self._model.improve_query(query_as_list)
            except AttributeError:
                print("Failed query expansion")

        # for term in query_as_list:
        #     # query expansion
        #     try:
        #         similar_terms.extend(self._model.get_similar_words(term))  # list
        #     except AttributeError:
        #         print("Failed query expansion")
        #         break
        # if len(similar_terms) > 1:
        #     try:
        #         query_as_list = set(query_as_list.extend(similar_terms))
        #     except TypeError:
        #         pass

        for new_term in query_as_list:
            try:
                if new_term not in self._indexer.term_indexer_dict.keys():
                    if new_term.lower() in self._indexer.term_indexer_dict.keys():
                        new_term = new_term.lower()
                    elif new_term.upper() in self._indexer.term_indexer_dict.keys():
                        new_term = new_term.upper()
                if new_term in self._indexer.term_indexer_dict.keys():
                    df = self._indexer.term_indexer_dict[new_term][0]
                    if df != 0:
                        terms_idf[new_term] = math.log2(float(total_num_of_docs) / float(df))
                    else:
                        terms_idf[new_term] = 0
                    docs_list = self._indexer.term_indexer_dict[new_term][1]
                    doc_id_dict.update(dict(docs_list))
                    self.terms_searched[new_term] = dict(docs_list)
            except:
                traceback.print_exc()

        doc_id_list = doc_id_dict.keys()
        final_dict = {}
        try:
            for term in query_as_list:
                if term in self.terms_searched.keys():
                    idf = terms_idf[term]
                    for doc_id in doc_id_list:
                        if doc_id in self.terms_searched[term].keys():
                            tf = self.terms_searched[term][doc_id]
                            if term not in final_dict.keys():
                                final_dict[term] = [[tf, idf, doc_id]]
                            else:
                                final_dict[term].append([tf, idf, doc_id])
        except:
            traceback.print_exc()

        for doc_id in doc_id_list:
            for term in self._indexer.file_indexer_dict[doc_id].keys():
                # if term not in self.terms_searched.keys():
                tf = self._indexer.file_indexer_dict[doc_id][term]
                idf = math.log2(float(total_num_of_docs) / float(self._indexer.term_indexer_dict[term][0]))
                if term not in final_dict.keys():
                    final_dict[term] = [[tf, idf, doc_id]]
                else:
                    final_dict[term].append([tf, idf, doc_id])

        return final_dict, doc_id_list, self._indexer.file_indexer_dict

    ######################################################################################################################################
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query)
        final_dict, doc_id_list, file_indexer_dict = self._relevant_docs_from_posting(
            query_as_list, self.total_num_of_docs)
        ranked_docs_list, ranked_docs_dict = self._ranker.rank_relevant_doc(
            final_dict, doc_id_list, query_as_list, file_indexer_dict)
        # results_dict = {self._parser.doc_idx_tweet_id[k]: ranked_docs_dict[k] for k in ranked_docs_list}
        ranked_docs_list_top_k = self._ranker.retrieve_top_k(ranked_docs_list, k)
        results_list_top_k = [self._parser.doc_idx_tweet_id[key] for key in ranked_docs_list_top_k]
        return len(ranked_docs_list), results_list_top_k

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and counts the amount of relevant documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to document frequency.
        """
        relevant_docs = {}
        for term in query_as_list:
            posting_list = self._indexer.get_term_posting_list(term)
            for doc_id, tf in posting_list:
                df = relevant_docs.get(doc_id, 0)
                relevant_docs[doc_id] = df + 1
        return relevant_docs
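# A hedged sketch of one way the [tf, idf, doc_id] triples returned by
# _relevant_docs_from_posting above could be turned into a ranked doc-id list:
# a plain sum of tf * idf per document. This only illustrates the data
# structure; it is not the project's actual Ranker.rank_relevant_doc logic.
from collections import defaultdict


def score_docs(final_dict, query_terms):
    """Sum tf * idf over the query terms that appear in each document."""
    scores = defaultdict(float)
    wanted = set(query_terms)
    for term, postings in final_dict.items():
        if term not in wanted:
            continue
        for tf, idf, doc_id in postings:
            scores[doc_id] += tf * idf
    return sorted(scores, key=scores.get, reverse=True)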