def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list = self._parser.parse_sentence(query)

    # Expand the query with both models, when they are available
    query_as_list_model_1 = query_as_list
    if self._model is not None:
        query_as_list_model_1 = self._model.extend_query(query_as_list)
    if self._model_1 is not None:
        query_as_list_model_2 = self._model_1.extend_query(query_as_list)
        query_as_list_model_1.extend(query_as_list_model_2)
    query_as_list = query_as_list_model_1

    docs_dict = self._indexer.get_docs_dict()
    relevant_docs, query_dict = self._relevant_docs_from_posting(query_as_list)
    if relevant_docs is None or len(relevant_docs) == 0:
        return 0, []
    relevant_docs1 = self._ranker.BM25(relevant_docs, query_as_list, docs_dict, query_dict)
    n_relevant = len(relevant_docs1)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs1, 2000)
    return n_relevant, ranked_doc_ids
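# The Ranker.BM25 call above delegates to a scorer that is not shown here.
# For reference, a minimal self-contained Okapi BM25 scorer might look like
# the sketch below; all names are illustrative, and k1/b are the common
# textbook defaults, not necessarily what this project used.
import math

def bm25_score(query_terms, doc_tf, doc_len, avg_doc_len, df, n_docs, k1=1.5, b=0.75):
    """Score one document against a query with the classic BM25 formula.

    doc_tf - {term: term frequency in this document}
    df     - {term: number of documents containing the term}
    """
    score = 0.0
    for term in query_terms:
        tf = doc_tf.get(term, 0)
        if tf == 0 or term not in df:
            continue
        idf = math.log((n_docs - df[term] + 0.5) / (df[term] + 0.5) + 1)
        score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    return score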
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list = self._parser.parse_sentence(query)

    # Spell-correct the query, then drop the misspelled originals
    q_new_spelling, wrongWords = self.do_spelling(query_as_list)
    query_as_list = self.deleteWrongSpelledWords(query_as_list, wrongWords)

    # Match each term's casing to the form stored in the index
    self.upper_lower_case(query_as_list, self._indexer)
    self.upper_lower_case(q_new_spelling, self._indexer)
    self.upper_lower_case(wrongWords, self._indexer)

    # Find relevant docs
    relevant_docs = self._relevant_docs_from_posting(query_as_list + q_new_spelling + wrongWords)
    n_relevant = len(relevant_docs)

    # Send all to ranking
    ranked_doc_ids = Ranker.rank_relevant_docs(
        query_as_list + q_new_spelling, wrongWords, relevant_docs, self._indexer, k)
    return n_relevant, ranked_doc_ids
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_dict = self._parser.parse_query(query)

    # WordNet expansion: for each query term, add the first synonym
    # that already exists in the inverted index
    for word in query_as_dict.copy().keys():
        syn = []
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                syn.append(lemma.name().replace('_', ' '))  # collect the synonyms
        for s in syn:
            if s not in query_as_dict and s in self._indexer.inverted_idx:
                query_as_dict[s] = 1
                break

    relevant_docs = self._relevant_docs_from_posting(query_as_dict)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    n_relevant = len(ranked_doc_ids)
    return n_relevant, ranked_doc_ids
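# A self-contained look at the WordNet lookup used above (requires
# nltk.download('wordnet') once). Multi-word lemmas come back with
# underscores, hence the replace('_', ' '); the helper name and the
# limit parameter are illustrative, not part of the project code.
from nltk.corpus import wordnet

def wordnet_synonyms(word, limit=5):
    """Collect up to `limit` distinct WordNet synonyms of `word`."""
    found = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            name = lemma.name().replace('_', ' ')
            if name != word and name not in found:
                found.append(name)
            if len(found) >= limit:
                return found
    return found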
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_dict = self._parser.parse_query(query)

    # Thesaurus expansion: for each query term, add the first of its top-30
    # synonyms that already exists in the inverted index
    for word in query_as_dict.copy().keys():
        if len(thes.synonyms(word)[1][1]):
            syn = list(thes.synonyms(word)[1][1])[:30]
            for s in syn:
                if s not in query_as_dict and s in self._indexer.inverted_idx:
                    query_as_dict[s] = 1
                    break

    relevant_docs = self._relevant_docs_from_posting(query_as_dict)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    n_relevant = len(ranked_doc_ids)
    return n_relevant, ranked_doc_ids
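# Quick look at what lin_thesaurus returns, assuming the corpus was fetched
# with nltk.download('lin_thesaurus'). synonyms(word) yields (fileid,
# synonym set) pairs per part of speech; the [1][1] indexing above selects
# the synonym set of the second file, which should be the noun thesaurus
# ("simN.lsp"). The query word here is only an example.
from nltk.corpus import lin_thesaurus as thes

for fileid, syns in thes.synonyms('virus'):
    print(fileid, sorted(syns)[:5])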
def search_with_extension(self, query, extension, k=None):
    query_as_dict = self.get_query_by_inverted_index(query)
    query_as_dict = self.get_extended_and_query_by_inverted_index(extension, query_as_dict)
    relevant_docs = self._relevant_docs_from_posting(query_as_dict.keys())
    n_relevant = len(relevant_docs)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs, query_as_dict, k)
    return n_relevant, ranked_doc_ids
def search(self, query, k):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list = self._parser.parse_sentence(query)
    relevant_docs = self._relevant_docs_from_posting(query_as_list)
    n_relevant = len(relevant_docs)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    return n_relevant, ranked_doc_ids[:k]
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list, entities = self._parser.parse_sentence(query)
    entities = entities.keys()
    query_as_list.extend(entities)

    query_expand = []
    keys = self._indexer.inverted_idx.keys()
    if self._model.__class__.__name__ == 'GlobalMethod':
        self._model.execute_global_method_and_generate_matrix(
            inverted_index=self._indexer.inverted_idx,
            postingDic=self._indexer.postingDict)
        for word in query_as_list:
            temp_words = self._model.expand_query(word)
            for inner_word in temp_words:
                if inner_word in keys and inner_word not in query_expand:
                    query_expand.append(str(inner_word))

    # add the original terms in whichever casing exists in the index
    for term in query_as_list:
        if term in keys and term not in query_expand:
            query_expand.append(str(term))
        elif term.upper() in keys and term not in query_expand:
            query_expand.append(str(term.upper()))
        elif term.lower() in keys and term not in query_expand:
            query_expand.append(str(term.lower()))

    relevant_docs = self._relevant_docs_from_posting(query_expand)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    n_relevant = len(ranked_doc_ids)
    return n_relevant, ranked_doc_ids
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        Two values are returned: the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_object = self._parser.parse_query(query)
    relevant_docs = self._relevant_docs_from_posting(query_object)
    normalized_query = self.normalized_query(query_object)
    n_relevant = len(relevant_docs)
    ranked_doc_ids = Ranker.rank_relevant_docs(
        relevant_docs, normalized_query, self._indexer.docs_dict, k)
    return n_relevant, ranked_doc_ids
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list = self._parser.parse_sentence(query)
    relevant_docs = self.relevant_docs_from_posting(query_as_list)
    n_relevant = len(relevant_docs)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs, k)
    return n_relevant, ranked_doc_ids
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_dict = self._parser.parse_query(query)
    query_as_list = self.get_list_words(query_as_dict)
    # this configuration runs the parser only, with no thesaurus or WordNet
    # expansion of the query
    relevant_docs = self._relevant_docs_from_posting(query_as_list)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    n_relevant = len(ranked_doc_ids)
    return n_relevant, ranked_doc_ids
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list = self._parser.parse_sentence(query)

    # Find the WordNet and Thesaurus expansion words
    q_wordnet = searcher_Wordnet.Searcher.do_wordnet(query_as_list)
    q_thesaurus = searcher_Thesaurus.Searcher.do_thesaurus(query_as_list)

    # Match each term's casing to the form stored in the index
    searcher_Wordnet.Searcher.upper_lower_case(query_as_list, self._indexer)
    searcher_Wordnet.Searcher.upper_lower_case(q_wordnet, self._indexer)
    searcher_Wordnet.Searcher.upper_lower_case(q_thesaurus, self._indexer)

    complete_query = query_as_list
    added_words = q_wordnet + q_thesaurus
    relevant_docs = self._relevant_docs_from_posting(complete_query + added_words)
    n_relevant = len(relevant_docs)

    # send the WordNet and Thesaurus words to ranking together
    ranked_doc_ids = Ranker.rank_relevant_docs(
        complete_query, added_words, relevant_docs, self._indexer, k)
    return n_relevant, ranked_doc_ids
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_dict = self._parser.parse_query(query)

    # Spell checker: for each single-word term, add the first correction
    # candidate that exists in the inverted index
    spell = SpellChecker()
    for word in query_as_dict.copy().keys():
        if ' ' not in word:
            correct_words = spell.candidates(word)
            correct_word = ''
            for c_word in correct_words:
                if c_word != word and c_word in self._indexer.inverted_idx:
                    correct_word = c_word
                    break
            if len(correct_word) == 0:
                continue
            query_as_dict[correct_word] = 1

    relevant_docs = self._relevant_docs_from_posting(query_as_dict)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    n_relevant = len(ranked_doc_ids)
    return n_relevant, ranked_doc_ids
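# The pyspellchecker calls used above, in isolation. unknown() filters a word
# list down to the likely misspellings, correction() gives the single best
# guess, and candidates() the full candidate set; in recent versions
# candidates() can return None, hence the `or []` guard. The sample words
# are illustrative only.
from spellchecker import SpellChecker

spell = SpellChecker()
for word in spell.unknown(['coronaviris', 'vaccinne', 'covid']):
    print(word, '->', spell.correction(word))        # single best guess
    print(sorted(spell.candidates(word) or [])[:5])  # other candidates, if any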
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list = self._parser.parse_sentence(query)
    query_as_list = self._model.spellCheck(query_as_list)
    relevant_docs, Ranker.query_weight = self._relevant_docs_from_posting(query_as_list)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs, self._indexer.get_docs_count())
    n_relevant = len(ranked_doc_ids)
    # keep only the ids, dropping the rank scores
    ranked_doc_ids = [doc_id for doc_id, rank in ranked_doc_ids]
    return n_relevant, ranked_doc_ids
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list = self.parser.parse_sentence(query)
    if self.is_thesaurus:
        query_as_list_with_synonym = self.thesaurus_method(query_as_list[0])
        query_as_list = [query_as_list_with_synonym, None]
    relevant_docs = self.relevant_docs_from_posting(query_as_list)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    if k:
        ranked_doc_ids = Ranker.retrieve_top_k(ranked_doc_ids, k)
    n_relevant = len(ranked_doc_ids)
    return n_relevant, ranked_doc_ids
from nltk.corpus import wordnet


class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_query(query)
        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = self._ranker.rank_relevant_docs(
            self._indexer.documents_dict, self._model, relevant_docs, query_as_list, k)
        return n_relevant, ranked_doc_ids

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and counts the amount of relevant
        documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to document frequency.
        """
        relevant_docs = {}
        for term in query_as_list:
            posting_list = self._indexer.get_term_posting_list(term)
            for doc_id, tf in posting_list:
                df = relevant_docs.get(doc_id, 0)
                relevant_docs[doc_id] = df + 1
        # keep only the 2000 documents that match the most query terms
        min_len = min(2000, len(relevant_docs))
        relevant_docs_sorted = dict(
            sorted(relevant_docs.items(), key=lambda item: item[1], reverse=True)[:min_len])
        return relevant_docs_sorted

    def basic_search(self, query, k=None):
        query_as_list = self._parser.parse_query(query)
        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = self._ranker.basic_rank_relevant_docs(relevant_docs, k)
        return n_relevant, ranked_doc_ids

    def wordnet_search(self, query, k=None):
        # requires nltk.download('wordnet') on first use
        query_as_list = self._parser.parse_query(query)
        query_tmp = list(query_as_list)
        for term in query_tmp:
            # add the first WordNet synonym that differs from the term itself
            synonyms = wordnet.synsets(term.lower())
            for synonym in synonyms:
                extra_term = synonym.lemmas()[0].name()
                if extra_term != term.lower():
                    query_as_list.append(extra_term)
                    break
        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = self._ranker.basic_rank_relevant_docs(relevant_docs, k)
        return n_relevant, ranked_doc_ids
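# A toy illustration of the document-frequency counting that
# _relevant_docs_from_posting performs above, using a hypothetical in-memory
# postings map instead of a real indexer: each matching term adds one to the
# document's count, and documents matching more query terms sort first.
postings = {
    'covid':   [('tw1', 3), ('tw2', 1)],
    'vaccine': [('tw2', 2), ('tw3', 4)],
}

relevant = {}
for term in ['covid', 'vaccine']:
    for doc_id, tf in postings.get(term, []):
        relevant[doc_id] = relevant.get(doc_id, 0) + 1

print(sorted(relevant.items(), key=lambda kv: kv[1], reverse=True))
# [('tw2', 2), ('tw1', 1), ('tw3', 1)]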
import copy
from queue import PriorityQueue


class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self.parser = parser
        self.ranker = Ranker(indexer.tweet_info)
        self.inverted_index = indexer.inverted_idx
        self.firstUnion = True
        self.posting_dir = ConfigClass.get_output()
        self.DocsToRetrieve = ConfigClass.numOfDocsToRetrieve
        self.scoreLowerBoundFactor = 0.5

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - the query, given as a list of terms after query extension
                (expanded terms carry a trailing '~').
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = query
        relevant_docs, qLen = self.relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = self.ranker.rank_relevant_docs(relevant_docs, query_as_list)
        if k is not None and 0 < k < n_relevant:
            ranked_doc_ids = ranked_doc_ids[:k]
            return k, ranked_doc_ids
        return n_relevant, ranked_doc_ids

    def relevant_docs_from_posting(self, query):
        """
        Counts how many of the (expanded) query words appear in each document.
        :param query: the query, already in its expanded form
        :return: a list (max size self.DocsToRetrieve) of relevant documents,
            the first being the document in which the most query terms appeared,
            and the length of the original query.
        """
        sorted_l = []
        if len(query) == 0:  # empty query
            return [], 0
        relevantDocs_pq = PriorityQueue()
        modifiedQuery_l = copy.deepcopy(query)
        termsToRemoveFromQuery = []

        # At this point, if the query holds an entity, it holds both the terms
        # that build the entity and the entity itself as one term, e.g.
        # ['BILL', 'Gates', 'blabla', 'bla', 'Bill Gates']. If "Bill Gates" is
        # a known entity, the cleanup below leaves: ['blabla', 'bla', 'Bill Gates'].
        for term in query:
            # remove parts of entities from the query if the entity exists in the inverted index
            if " " in term:
                if term in self.inverted_index:  # entity, and in the inverted index
                    entity_l = term.split(" ")
                    for word in entity_l:
                        try:
                            termsToRemoveFromQuery.append(word.upper())
                        except Exception:
                            termsToRemoveFromQuery.append(word.lower())
                else:  # unknown entity
                    modifiedQuery_l.remove(term)
        for word in termsToRemoveFromQuery:
            # clear all appearances of the token from modifiedQuery
            modifiedQuery_l[:] = [x for x in modifiedQuery_l if x != word]
        query = modifiedQuery_l

        # count the original query length (words without '~' at the end)
        i = 0
        notExpendedQueryLen = 0
        word = query[i]
        while word[-1] != '~':
            notExpendedQueryLen += 1
            i += 1
            try:
                word = query[i]
            except IndexError:
                break
        if notExpendedQueryLen > ConfigClass.shortQueryLen:  # long query
            self.scoreLowerBoundFactor = ConfigClass.longQueryFactor
        else:
            self.scoreLowerBoundFactor = ConfigClass.shortQueryFactor
        minScoreForEntry = self.scoreLowerBoundFactor * notExpendedQueryLen

        # keep only query terms that exist in the inverted index; a term can be
        # mixed upper/lower case or either one, so all forms are checked
        listOfValidTerms = []
        for term in query:
            expendedToken = False
            if term[-1] == "~":
                expendedToken = True
                term = term[:-1]
            if term.lower() in self.inverted_index:
                term = term.lower()
                if expendedToken:
                    term += "~"
            elif term.upper() in self.inverted_index:
                term = term.upper()
                if expendedToken:
                    term += "~"
            elif term in self.inverted_index:  # only for entities
                if expendedToken:
                    term += "~"
            else:
                continue
            listOfValidTerms.append(term)

        ###################################################
        #######          SEARCHING PART           #########
        ###################################################
        numOfValidTerms = len(listOfValidTerms)
        if numOfValidTerms == 0:  # no valid terms in query
            return [], 0

        if numOfValidTerms == 1:  # only one query word was found in the inverted index
            term = listOfValidTerms[0]
            if term[-1] == "~":  # only for an expanded token
                factor = ConfigClass.expendedWordWeight
                term = term[:-1]
            else:
                factor = ConfigClass.wordFromOGQueryWeight
            nodes_l = self.inverted_index[term][2]
            for node in nodes_l:
                max_f = self.ranker.tweets_info[node.tweetID][1]
                # node.tf * max_f rolls back to the raw number of appearances of the term in the tweet
                self.ranker.tweet_SigmaWij_d[node.tweetID] = [node.tf * max_f, node.Wij * factor]
                score = node.tf * max_f  # appearances of the query word in this specific tweet
                if score >= minScoreForEntry:
                    # -score reverses the queue to max-priority-first
                    relevantDocs_pq.put((-score, node.tweetID))
            while len(sorted_l) < self.DocsToRetrieve and relevantDocs_pq.qsize() > 0:
                itemFromPq = relevantDocs_pq.get()
                positiveScore_tweetID_Tuple = (-itemFromPq[0], itemFromPq[1])
                sorted_l.append(positiveScore_tweetID_Tuple)
            return sorted_l, len(query)

        # query length > 1:
        # initialize tweet_SigmaWij_d with values from the first term's list of
        # nodes, then unite it with the lists of the remaining terms
        factor = ConfigClass.wordFromOGQueryWeight
        term = listOfValidTerms[0]
        if term[-1] == "~":  # the first valid term is an expanded word
            term = term[:-1]
            factor = ConfigClass.expendedWordWeight
        unionList = self.inverted_index[term][2]  # list of nodes
        for node in unionList:
            self.ranker.tweet_SigmaWij_d[node.tweetID] = [1, node.Wij * factor]
        for i in range(1, len(listOfValidTerms)):
            term = listOfValidTerms[i]
            if term[-1] == "~":  # expanded word
                term = term[:-1]
                factor = ConfigClass.expendedWordWeight
            else:
                factor = ConfigClass.wordFromOGQueryWeight
            unionList = self.UnionLists(unionList, self.inverted_index[term][2], factor)
        for node in unionList:
            # number of query words that appear in this specific tweet-node
            score = self.ranker.tweet_SigmaWij_d[node.tweetID][0]
            if score >= minScoreForEntry:
                # -score reverses the queue to max-priority-first
                relevantDocs_pq.put((-score, node.tweetID))
        while len(sorted_l) < self.DocsToRetrieve and relevantDocs_pq.qsize() > 0:
            itemFromPq = relevantDocs_pq.get()
            positiveScore_tweetID_Tuple = (-itemFromPq[0], itemFromPq[1])
            sorted_l.append(positiveScore_tweetID_Tuple)
        return sorted_l, len(query)

    def UnionLists(self, listA, listB, factor):
        """Merge two sorted posting-node lists; factor is 1 for a term from the
        original query and lower (e.g. 0.5) for an expanded term."""
        listA_len, listB_len = len(listA), len(listB)
        tweet_SigmaWij_d = self.ranker.tweet_SigmaWij_d
        a, b = 0, 0
        union_l = []
        while a < listA_len and b < listB_len:
            if listA[a] < listB[b]:
                union_l.append(listA[a])
                a += 1
            elif listB[b] < listA[a]:
                # update Wij and the number of appearances in the tweet
                union_l.append(listB[b])
                if listB[b].tweetID in tweet_SigmaWij_d:
                    tweet_SigmaWij_d.get(listB[b].tweetID)[0] += 1  # appearances
                    tweet_SigmaWij_d.get(listB[b].tweetID)[1] += listB[b].Wij * factor  # update Wij
                else:
                    tweet_SigmaWij_d[listB[b].tweetID] = [1, listB[b].Wij * factor]
                b += 1
            else:
                # same node in both lists: update Wij with B's Wij, appearances +1
                union_l.append(listB[b])
                tweet_SigmaWij_d.get(listB[b].tweetID)[0] += 1  # appearances
                tweet_SigmaWij_d.get(listB[b].tweetID)[1] += listB[b].Wij * factor  # update Wij
                b += 1
                a += 1
        while a < listA_len:
            union_l.append(listA[a])
            a += 1
        while b < listB_len:
            # update Wij with B's Wij, appearances +1
            union_l.append(listB[b])
            if listB[b].tweetID in tweet_SigmaWij_d:
                tweet_SigmaWij_d.get(listB[b].tweetID)[0] += 1  # appearances
                tweet_SigmaWij_d.get(listB[b].tweetID)[1] += listB[b].Wij * factor  # update Wij
            else:
                tweet_SigmaWij_d[listB[b].tweetID] = [1, listB[b].Wij * factor]
            b += 1
        return union_l
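# UnionLists above is the textbook linear-time union of two sorted postings
# lists, interleaved with score bookkeeping. Stripped of the Node/ConfigClass
# machinery, the same merge over plain (doc_id, tf) pairs looks like this
# (illustrative sketch, not project code):
def union_postings(list_a, list_b):
    """Union of two doc_id-sorted postings lists of (doc_id, tf) pairs,
    keeping the result sorted; runs in O(len(a) + len(b))."""
    out, a, b = [], 0, 0
    while a < len(list_a) and b < len(list_b):
        if list_a[a][0] < list_b[b][0]:
            out.append(list_a[a])
            a += 1
        elif list_b[b][0] < list_a[a][0]:
            out.append(list_b[b])
            b += 1
        else:  # same document in both lists: keep one entry
            out.append(list_a[a])
            a += 1
            b += 1
    out.extend(list_a[a:])
    out.extend(list_b[b:])
    return out

# union_postings([(1, 2), (3, 1)], [(2, 5), (3, 4)]) -> [(1, 2), (2, 5), (3, 1)]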
import itertools
from collections import OrderedDict

from nltk.corpus import wordnet as wn


class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_list = query.split(" ")
        query_as_list = self._parser.text_operation(query_list)
        # extension by WordNet
        query_list_after_word_net = self.q_word_net(query_as_list)
        # remove stop words
        query_as_list = self._parser.parse_sentence(query_list_after_word_net)
        # find all the relevant docs for the query
        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        relevant_docs = OrderedDict(
            sorted(relevant_docs.items(), key=lambda item: item[1], reverse=True))
        relevant_docs = dict(itertools.islice(relevant_docs.items(), 2000))  # cap at 2000 docs
        relevant_docs_sort = self._ranker.rank_relevant_docs(
            relevant_docs, self._indexer, len(query_as_list))
        n_relevant = len(relevant_docs)
        if k is not None:
            relevant_docs_sort = self._ranker.retrieve_top_k(relevant_docs_sort, k)
        return n_relevant, relevant_docs_sort

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and counts the amount of relevant
        documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to
            [number of matching query terms, list of tf-idf scores].
        """
        relevant_docs = {}
        for word in query_as_list:
            # get all the tweets containing this word
            posting_list = self._indexer.get_term_posting_list(word)
            for doc in posting_list:
                tf = self._indexer.get_term_inverted_idx(word)[2]
                id = doc[0]
                tfidf = doc[4] * tf
                if id not in relevant_docs.keys():
                    relevant_docs[id] = [1, [tfidf]]
                else:
                    relevant_docs[id][1].append(tfidf)
                    relevant_docs[id][0] += 1
        return relevant_docs

    def q_word_net(self, query):
        """
        Expands the query using WordNet: for each query word, add the first
        synonym (single- or multi-word) that already exists in the index.
        """
        extend_query = []
        extend_query.extend(query)
        for word in query:
            add_new_word = False
            counter_same_word = 0
            syn_list = wn.synsets(word)
            for i in range(len(syn_list)):
                if syn_list[i].lemma_names():
                    for lemma in syn_list[i].lemma_names():
                        if lemma == word:
                            continue
                        new_word = lemma
                        if "_" not in new_word:
                            if self._indexer._is_term_exist_in_idx(new_word):
                                extend_query.append(new_word)
                                add_new_word = True
                                break
                        else:  # more than one word
                            new_word_list = new_word.split("_")
                            for w in new_word_list:
                                if self._indexer._is_term_exist_in_idx(w):
                                    extend_query.extend(new_word_list)
                                    add_new_word = True
                                    break
                        if add_new_word:
                            break
                        counter_same_word += 1
                        if counter_same_word > 1:
                            break
                    if add_new_word:
                        break
                if add_new_word:
                    break
                elif i > 1:
                    break
        return set(extend_query)
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    # parse the query according to the same parsing rules as the corpus
    entities = {}
    term_dict = {}
    parsed_query = self._parser.parse_sentence(query, entities, stemming=self.stemming)
    self._parser.parse_capital_letters(parsed_query, term_dict)
    processed_query = [*term_dict.keys()] + [*entities.keys()]

    # perform spell correction
    if self.spell_correction:
        from spellchecker import SpellChecker
        spell_checker = SpellChecker()
        corrected_terms = []
        # list all misspelled terms in the query
        misspelled_terms = spell_checker.unknown([*term_dict.keys()])
        for term in misspelled_terms:
            # only correct terms that aren't in the inverted dictionary;
            # terms in the dictionary are considered correct for retrieval
            if term not in self._indexer.inverted_idx:
                candidates = list(spell_checker.candidates(term))
                max_to_return = min(Searcher.TOP_N, len(candidates))
                candidates = candidates[:max_to_return]  # keep only the top results
                if term in candidates:
                    # remove duplicate originally-correct terms
                    candidates.remove(term)
                # drop corrections already in the query
                candidates = [c for c in candidates if c not in parsed_query]
                corrected_terms.extend(candidates)
        processed_query += corrected_terms  # extend query with corrected words

    if self.thesaurus:
        from nltk.corpus import lin_thesaurus as thes
        candidates = []
        for term in processed_query:
            synsets = thes.synonyms(term)
            for synset in synsets:
                synonyms = [*synset[1]]
                if len(synonyms) > 0:
                    max_to_return = min(Searcher.TOP_N, len(synonyms))
                    best_synonyms = synonyms[:max_to_return]
                    for synonym in best_synonyms:
                        if synonym != term and synonym not in processed_query \
                                and synonym in self._indexer.inverted_idx:
                            candidates.append(synonym)  # extend the query
                            break
        processed_query += candidates

    if self.wordnet:
        from nltk.corpus import wordnet
        candidates = []
        for term in processed_query:
            synsets = wordnet.synsets(term)
            # retrieve the best synsets
            max_to_return = min(Searcher.TOP_N, len(synsets))
            synsets = synsets[:max_to_return]
            skip = False
            for synset in synsets:
                for lemma in synset.lemmas()[:max_to_return]:  # possible synonyms
                    name = lemma.name()
                    if name != term and name not in processed_query:
                        # accept the synonym in whichever casing the index holds
                        if (name in self._indexer.inverted_idx
                                or name.lower() in self._indexer.inverted_idx
                                or name.upper() in self._indexer.inverted_idx):
                            candidates.append(name)
                            skip = True
                            break
                if skip:
                    break
        processed_query += candidates

    # dictionary holding all relevant documents (at least one query term
    # appeared in the document), in the format {document_id: score}
    relevant_docs = {}
    for term in processed_query:
        # check if the term exists in the inverted dictionary in either lower or upper form
        if term in self._indexer.inverted_idx:
            self.calculate_doc_scores(term, relevant_docs)
        elif term.islower() and term.upper() in self._indexer.inverted_idx:
            self.calculate_doc_scores(term.upper(), relevant_docs)
        elif term.isupper() and term.lower() in self._indexer.inverted_idx:
            self.calculate_doc_scores(term.lower(), relevant_docs)
    n_relevant = len(relevant_docs)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    return n_relevant, ranked_doc_ids
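# Several variants above repeat the same exact/upper/lower fallback when
# probing the inverted index. A small helper that captures the pattern against
# a plain dict (the helper name is illustrative, not part of the project):
def lookup_term(term, index):
    """Return the key under which `term` is stored in `index`, trying the
    exact, upper-case and lower-case forms, or None when absent."""
    for candidate in (term, term.upper(), term.lower()):
        if candidate in index:
            return candidate
    return None

# lookup_term('covid', {'COVID': [...]}) -> 'COVID'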
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    query_as_list = self._parser.parse_sentence(query)
    if self.spellcheck:
        query_as_list = self.spell_check_query(query_as_list)
    if self.Sij_dic is not None:
        query_as_list.extend(self.expand_query_global_method(query_as_list))
    if self.word_net:
        expend = []
        for term in query_as_list:
            res = self.WordNet(term, query_as_list)
            if res is not None:
                expend.append(res)
        if len(expend) != 0:
            query_as_list.extend(expend)

    if self.word2vec:
        if self.local:
            # local method: expand the query from the initial result set
            lst_before_extend = self._relevant_docs_from_posting(query_as_list)
            add_to_query = Ranker.compute_extend_word(self._ranker, lst_before_extend)
            query_as_list.extend(add_to_query)
            self.counter_of_terms.clear()
            self.unique_tweets_num.clear()
            self.relevant_docs.clear()
        relevant_docs = self.second(query_as_list)
        ranked_doc_ids = Ranker.rank_relevant_docs_w2v(
            self._ranker, self._model, query_as_list, relevant_docs)
        return len(ranked_doc_ids), ranked_doc_ids

    if self.local:
        lst_before_extend = self._relevant_docs_from_posting(query_as_list)
        add_to_query = Ranker.compute_extend_word(self._ranker, lst_before_extend)
        query_as_list.extend(add_to_query)
        self.counter_of_terms.clear()
        self.unique_tweets_num.clear()
        self.relevant_docs.clear()
        lst_After_extend = self.second(query_as_list)
        ranked_doc_ids = Ranker.rank_relevant_docs(self._ranker, lst_After_extend)
        return len(ranked_doc_ids), ranked_doc_ids

    relevant_docs = self.second(query_as_list)
    ranked_doc_ids = Ranker.rank_relevant_docs(self._ranker, relevant_docs)
    return len(ranked_doc_ids), ranked_doc_ids
def search(self, query, k=None):
    """
    Executes a query over an existing index and returns the number of
    relevant docs and an ordered list of search results (tweet ids).
    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple containing the number of relevant search results, and
        a list of tweet_ids where the first element is the most relevant
        and the last is the least relevant result.
    """
    inverted_index = self._indexer.inverted_idx
    posting = self._indexer.postingDict
    documents = self._indexer.documents
    dict_of_methods = self._indexer.dict_of_method

    if dict_of_methods['wordnet']:  # WordNet method
        doc_query_app = self.finished_dict(query, inverted_index)  # first parse query words
        list_of_query = doc_query_app.keys()
        words_to_add = {}
        # for each query word, get its synsets and add to the query the ones
        # that exist in the inverted index
        for word in list_of_query:
            opt = wordnet.synsets(word)
            for i in range(len(opt)):
                check_word = opt[i].lemmas()[0].name()
                if check_word in doc_query_app.keys() or check_word in words_to_add.keys():
                    continue
                tested = self._indexer.check_upper_lower(inverted_index, check_word)
                if tested[1] is False or tested[0] in doc_query_app.keys() \
                        or tested[0] in words_to_add.keys():
                    continue
                if tested[1] is True:
                    words_to_add[tested[0]] = 0.0001
                elif tested[1] == 'replace':
                    words_to_add[tested[0].upper()] = 0.0001
        doc_query_app.update(words_to_add)

    elif dict_of_methods['spell_correction']:
        spell = SpellChecker(case_sensitive=True)
        query_as_list = query.split()
        for index in range(len(query_as_list)):
            is_upper = False
            word = query_as_list[index]
            # if a query word is not in the inverted index, look for a correction
            # and take the first candidate that is in the inverted index
            if self._indexer.check_upper_lower(inverted_index, word)[1] is False:
                if word[0].isupper():
                    is_upper = True
                options = list(spell.candidates(word))
                is_found = False
                i = 0
                while i < len(options):
                    if self._indexer.check_upper_lower(inverted_index, options[i])[1] is True:
                        corrected = options[i]
                        is_found = True
                        break
                    i += 1
                if is_found and corrected != query_as_list[index]:
                    if is_upper:
                        corrected = corrected.capitalize()
                    query_as_list[index] = corrected
        doc_query_app = self.finished_dict(" ".join(query_as_list), inverted_index)

    elif dict_of_methods['word2vec']:
        words_to_add = {}
        doc_query_app = self.finished_dict(query, inverted_index)
        query_as_list = query.split()
        insert_new_words = []
        for word in query_as_list:
            if word in self._model.wv.wv.vocab:
                lst_sim_word_model = self._model.most_similar(word.lower())
                for similiar_word in lst_sim_word_model:
                    if similiar_word[1] > 0.33:
                        insert_new_words.append(similiar_word[0])
        # add the first similar word that exists in the inverted index
        idx = 0
        while idx < len(insert_new_words):
            if insert_new_words[idx] in doc_query_app.keys() \
                    or insert_new_words[idx] in words_to_add.keys():
                idx += 1
                continue
            tested = self._indexer.check_upper_lower(inverted_index, insert_new_words[idx])
            if tested[1] is False or tested[0] in doc_query_app.keys() \
                    or tested[0] in words_to_add.keys():
                idx += 1
                continue
            if tested[1] is True:
                words_to_add[tested[0]] = 0.6
                break
            elif tested[1] == 'replace':
                words_to_add[tested[0].upper()] = 0.6
                break
            idx += 1
        doc_query_app.update(words_to_add)

    elif dict_of_methods['thesaurus']:
        doc_query_app = self.finished_dict(query, inverted_index)  # first parse query words
        list_of_query = list(doc_query_app.keys())
        words_to_add = {}
        # for each non-stopword query word, get its synonyms and add the first
        # that exists in the inverted index
        stop = set(stopwords.words('english'))
        results = [thes.synonyms(i, fileid="simN.lsp") for i in list_of_query if i not in stop]
        results_as_list = list(results)
        for words in results_as_list:
            inside_list = list(words)
            if len(inside_list) == 0:
                continue
            idx = 0
            while idx < len(inside_list):
                if inside_list[idx] in doc_query_app.keys() \
                        or inside_list[idx] in words_to_add.keys():
                    idx += 1
                    continue
                tested = self._indexer.check_upper_lower(inverted_index, inside_list[idx])
                if tested[1] is False or tested[0] in doc_query_app.keys() \
                        or tested[0] in words_to_add.keys():
                    idx += 1
                    continue
                if tested[1] is True:
                    words_to_add[tested[0]] = 0.0001
                    break
                elif tested[1] == 'replace':
                    words_to_add[tested[0].upper()] = 0.0001
                    break
                idx += 1
        doc_query_app.update(words_to_add)

    else:  # dict_of_methods['parser'] is True
        doc_query_app = self.finished_dict(query, inverted_index)

    if len(doc_query_app) == 0:
        return 0, []
    dict_relevant_docs = self._relevant_docs_from_posting(doc_query_app, posting)
    ranked_doc_ids = Ranker.rank_relevant_docs(dict_relevant_docs, posting, documents, doc_query_app)
    n_relevant = len(ranked_doc_ids)
    return n_relevant, ranked_doc_ids
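# A minimal sketch of the word2vec-style expansion used above, written against
# gensim 4.x KeyedVectors (older gensim exposed the vocabulary as kv.vocab, as
# in the snippet's self._model.wv.wv.vocab). The function name, topn and the
# 0.33 threshold are illustrative, the threshold borrowed from the code above.
from gensim.models import KeyedVectors

def expand_with_w2v(query_terms, kv, threshold=0.33):
    """Add to the query every embedding neighbour scoring above `threshold`."""
    expanded = list(query_terms)
    for term in query_terms:
        if term in kv.key_to_index:  # vocabulary check
            for neighbour, score in kv.most_similar(term, topn=10):
                if score > threshold and neighbour not in expanded:
                    expanded.append(neighbour)
    return expanded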
import numpy as np


class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model
        self._config = self._indexer.config
        self._method_class = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.load_index("idx_bench.pkl")
        query_as_list = self._parser.parse_sentence(query)[0]
        query_dict, max_tf_query = self.get_query_dict(query_as_list)
        expanded_query_dict = self._method_class.expand_query(query_dict, max_tf_query)
        return self.search_helper(expanded_query_dict, k,
                                  self._method_class.p_threshold,
                                  self._method_class.p_rel)

    def get_query_dict(self, tokenized_query):
        """Create a {term: tf normalized by the max tf} dictionary for the query."""
        max_tf = 1
        query_dict = {}
        for index, term in enumerate(tokenized_query):
            if term not in query_dict:
                query_dict[term] = 1
            else:
                query_dict[term] += 1
                if query_dict[term] > max_tf:
                    max_tf = query_dict[term]
        for term in query_dict:
            query_dict[term] /= max_tf
        return query_dict, max_tf

    def relevant_docs_from_posting(self, query_dict, p_threshold=0):
        relevant_docs = {}
        query_vector = np.zeros(len(query_dict), dtype=float)
        full_cells_threshold = round(p_threshold * len(query_vector))
        for idx, term in enumerate(list(query_dict.keys())):
            try:
                docs_index = self.get_doc_index()
                tweets_per_term = self._indexer.get_term_posting_tweets_dict(term)
                for tweet_id, vals in tweets_per_term.items():
                    doc_date = docs_index[tweet_id][1]
                    if tweet_id not in relevant_docs.keys():
                        relevant_docs[tweet_id] = [np.zeros(len(query_dict), dtype=float), doc_date]
                    # Wij - update the tweet vector at this term's index with tf-idf
                    tf_tweet = vals[0]
                    idf_term = self._indexer.get_term_idf(term)
                    relevant_docs[tweet_id][0][idx] = tf_tweet * idf_term
                    # Wiq - update the query vector at this term's index with tf-idf
                    tf_query = query_dict[term]
                    query_vector[idx] = tf_query * idf_term
            except Exception:
                pass
        # optimization: drop documents that match fewer query terms than the threshold
        for doc in list(relevant_docs.keys()):
            if np.count_nonzero(relevant_docs[doc][0]) < full_cells_threshold:
                del relevant_docs[doc]
        return relevant_docs, query_vector

    def set_method_type(self, method_type):
        if method_type == '1':
            self._method_class = LocalMethod(self)
        elif method_type == '2':
            self._method_class = Thesaurus(self)
        elif method_type == '3':
            self._method_class = Wordnet(self)
        elif method_type == '4':
            self._method_class = MySpellCheker(self)
        # elif ... more methods

    def get_term_index(self):
        return self._indexer.inverted_idx_term

    def get_doc_index(self):
        return self._indexer.inverted_idx_doc

    def is_term_in_index(self, term):
        return term in self._indexer.inverted_idx_term

    def search_helper(self, query_dict, k, p_threshold=0, p_relevant=0):
        relevant_docs, query_vector = self.relevant_docs_from_posting(query_dict, p_threshold)
        n_relevant = len(relevant_docs)
        ranked_docs = self._ranker.rank_relevant_docs(relevant_docs, query_vector)
        return n_relevant, self._ranker.retrieve_top_k(ranked_docs, k, p_relevant)
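# The ranker this class delegates to is not shown. Given the {doc_id:
# [tf-idf vector, date]} structure built above, a plausible cosine-similarity
# ranking over those vectors might look like this (illustrative sketch, not
# the project's Ranker):
import numpy as np

def cosine_rank(relevant_docs, query_vector):
    """Return doc_ids sorted by cosine similarity between each document's
    tf-idf vector and the query vector."""
    q_norm = np.linalg.norm(query_vector)
    scores = {}
    for doc_id, (doc_vector, _date) in relevant_docs.items():
        d_norm = np.linalg.norm(doc_vector)
        if q_norm == 0 or d_norm == 0:
            scores[doc_id] = 0.0
        else:
            scores[doc_id] = float(np.dot(doc_vector, query_vector) / (d_norm * q_norm))
    return sorted(scores, key=scores.get, reverse=True)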
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker(indexer)
        self._model = model
        self.word_net = WordNet(indexer)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        # parse the query into a list of terms and a dict of entities
        query_as_list, entity_dict = self._parser.parse_sentence(query)
        entity_as_list = list(entity_dict.keys())

        # WordNet query expansion
        query_extand = self.word_net.query_expan(query_as_list)
        query_as_list.extend(query_extand)

        # get the relevant doc ids and the info of the relevant terms
        relevant_docs_query, relevant_terms_query = self._relevant_docs_from_posting(query_as_list)
        relevant_docs_entity, relevant_terms_entity = self._relevant_docs_to_entity(entity_as_list)

        # combine the doc and entity dictionaries
        full_relevant_doc = {**relevant_docs_query, **relevant_docs_entity}
        full_relevant_term = {**relevant_terms_query, **relevant_terms_entity}
        n_relevant = len(full_relevant_doc)

        # rank
        ranked_doc_ids = self._ranker.rank_relevant_docs(
            relevant_doc=full_relevant_doc,
            relevant_terms=full_relevant_term,
            query_terms=query_as_list)
        try:
            doc_id, doc_rank = zip(*ranked_doc_ids)
        except ValueError:  # no results to unpack
            doc_id = ()
        return n_relevant, list(doc_id)

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and counts the amount of relevant
        documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to document
            frequency, and a dictionary of each term's posting list.
        """
        relevant_terms = {}
        relevant_docs = {}
        for term in query_as_list:
            posting_list_of_term = []
            if term.isnumeric():
                # numeric terms: fetch the term info as-is
                posting = self._indexer.get_term_posting_list(term)
                if posting is not None:
                    posting_list_of_term.extend(posting)
            else:
                # non-numeric terms: fetch the info for both the original
                # and the casefolded form
                posting = self._indexer.get_term_posting_list(term)
                if posting is not None:
                    posting_list_of_term.extend(posting)
                posting = self._indexer.get_term_posting_list(term.casefold())
                if posting is not None:
                    posting_list_of_term.extend(posting)
            # add to the relevant docs and save the term info
            for doc in posting_list_of_term:
                if doc[0] in relevant_docs:
                    relevant_docs[doc[0]] += 1
                else:
                    relevant_docs[doc[0]] = 1
            relevant_terms[term] = posting_list_of_term
        return relevant_docs, relevant_terms

    def _relevant_docs_to_entity(self, entity_as_list):
        relevant_docs = {}
        relevant_terms = {}
        for term in entity_as_list:
            posting_list = self._indexer.get_term_posting_list(term)
            if posting_list is not None:
                for doc_id, tf in posting_list:
                    df = relevant_docs.get(doc_id, 0)
                    relevant_docs[doc_id] = df + 1
                relevant_terms[term] = posting_list
        return relevant_docs, relevant_terms
import math

from nltk.corpus import lin_thesaurus as thes
from nltk.corpus import wordnet
from spellchecker import SpellChecker


class Searcher:
    __slots__ = ['_parser', '_indexer', '_ranker', '_the_count', '_model',
                 '_min_relevant', '_ext_val', '_wordnet_count']

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, config, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker(config)
        self._model = model
        self._the_count = config.the_count
        self._wordnet_count = config.wordnet_count
        self._min_relevant = config.min_relevant
        self._ext_val = config.ext_val

    def CalculateW(self, query, extenders):
        output = {term: 1 for term in query}
        for term in extenders:
            if term not in output:
                output[term] = 0
            output[term] += self._ext_val
        return output

    def wordNet(self, word):
        syn = set()
        for syn_set in wordnet.synsets(word):
            for lemma in syn_set.lemmas():
                syn.add(lemma.name())
                if lemma.antonyms():
                    syn.add(lemma.antonyms()[0].name())
                if len(syn) >= self._wordnet_count:
                    return syn
        return syn

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None, methods=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        if methods is None:
            methods = set()
        # spell corrections
        if 1 in methods:
            spell = SpellChecker()
            query = ' '.join([spell.correction(word) for word in query.split()])
        query_terms = self._parser.Tokenize(query).keys()
        extenders = set()
        # WordNet
        if 2 in methods:
            for word in query_terms:
                for ex_word in self.wordNet(word.text):
                    extenders.add(self._parser.add_to_dict(ex_word))
        # lin_thesaurus
        if 3 in methods:
            for word in query_terms:
                for ex_word in list(thes.synonyms(word.text)[1][1])[:self._the_count]:
                    extenders.add(self._parser.add_to_dict(ex_word))
        extenders = {extender for extender in extenders if extender}
        w_of_term_in_query = self.CalculateW(query_terms, extenders)
        relevant_docs = self._relevant_docs_from_posting(w_of_term_in_query.keys())
        ranked_doc_ids = self._ranker.rank_relevant_docs(relevant_docs, k, w_of_term_in_query)
        return len(ranked_doc_ids), ranked_doc_ids

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_terms):
        """
        This function loads the posting list and counts the amount of relevant
        documents per term.
        :param query_terms: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to
            {term: tf-idf weight}.
        """
        relevant_docs = {}
        for term in query_terms:
            if len(term.postings) == 0:
                continue
            # idf = log2(N / df)
            idf = math.log2(len(self._indexer.documents) / len(term.postings))
            for doc_id, tf in term.postings:
                if doc_id not in relevant_docs.keys():
                    relevant_docs[doc_id] = {}
                relevant_docs[doc_id][term] = tf * idf  # wiq
        # keep only documents that contain enough of the query terms
        return {doc: relevant_docs[doc]
                for doc in relevant_docs
                if len(relevant_docs[doc]) >= min(self._min_relevant, len(query_terms))}
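# A worked tf-idf example matching the weighting in the last snippet: with
# N = 10,000 documents in the corpus and a term appearing in df = 25 of them,
# idf = log2(N / df) = log2(400) ~ 8.64, so rarer terms weigh more. The
# numbers here are illustrative only.
import math

N, df = 10_000, 25
idf = math.log2(N / df)
for doc_id, tf in [('tw1', 3), ('tw2', 1)]:
    print(doc_id, tf * idf)  # per-document weight: tf * idf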