class Searcher:
    """Real-time query engine over a prebuilt index.

    Implements ranked retrieval based on the Vector Space Model (VSM), and
    also supports phrasal queries, query expansion, relevance feedback and
    (currently disabled) pivoted normalized document length.

    Args:
        dictionary_file: the file path of the dictionary
        postings_file: the file path of the postings
        expand: boolean indicator for using Query Expansion
        feedback: boolean indicator for using Relevance Feedback
        rate: the penalty rate of the pivoted normalized document length
        pivoted: boolean indicator for using pivoted normalized document length
        score: if True, also output the score of each result
    """

    def __init__(self, dictionary_file, postings_file, expand=False,
                 feedback=True, rate=0.01, pivoted=False, score=False):
        self.dictionary_file = dictionary_file
        self.postings_file = postings_file
        self.rate = rate
        self.pivoted = pivoted
        self.score = score
        self.stemmer = PorterStemmer()
        self.indexer = Indexer(dictionary_file, postings_file)
        self.refiner = Refiner(indexer=self.indexer, expand=expand,
                               feedback=feedback)
        self.feedback = feedback
        # First value of LoadDict() is the PageRank score table consumed by
        # _pagerank (indexed as self.a[0][docId]); the other three values
        # are unused by this class.
        self.a, _, _, _ = self.indexer.LoadDict()

    def search(self, query, relevant_docs):
        """Search and return docIds ranked by relevance to the query.

        Args:
            query: the query string
            relevant_docs: relevant docs used for relevance feedback

        Returns:
            result: the list of most relevant docIds in response to the query
            urls: the url corresponding to each result
            score: the score corresponding to each result (empty unless the
                searcher was constructed with score=True)
        """
        # step 1: let the refiner refine the query and build query_infos
        query_infos, postings_lists = self.refiner.refine(query, relevant_docs)
        # step 2: collect candidate docs for phrasal queries
        # step 2-1: docs that contain ALL terms of each phrase
        self._get_intersection(query_infos, postings_lists)
        # step 2-2: keep only docs where the terms appear consecutively
        self._judge(query_infos, postings_lists)
        # step 3-1: rank documents with the VSM to get a first result
        result, score = self.rank(query_infos, postings_lists)
        if self.feedback:
            # step 3-2: relevance feedback based on the current top-5 docs
            self.refiner._feedback(query_infos, result[:5], postings_lists)
            # step 3-3: rank again using the updated query vector (re-ranking
            # is only needed when feedback actually changed the query)
            result, score = self.rank(query_infos, postings_lists)
        # step 4: fetch urls for the ranked docIds
        urls = self.indexer.LoadUrls(result)
        # step 5: return the final result
        return result, urls, score

    def rank(self, query_infos, postings_lists):
        """Rank the documents and return docIds ordered by relevance.

        Args:
            query_infos: a list of QueryInfo instances describing the query
            postings_lists: dict mapping terms to posting lists

        Returns:
            result: list of docIds, most relevant first (ties broken by
                smaller docId)
            score: matching scores, parallel to result (empty list unless
                self.score is True)
        """
        # NOTE: every doc starts from a base score of 1 (not 0); the sum
        # normalization in _pagerank relies on the scores being positive.
        total_scores = defaultdict(lambda: 1)
        for query_info in query_infos:
            # step 1: initialize per-query accumulators
            terms = query_info.terms
            scores = defaultdict(lambda: 0)
            query_vector = query_info.query_vector
            candidates = query_info.candidates  # only used for phrasal queries
            # step 2: accumulate doc-weight * query-weight per (doc, term)
            for i, term in enumerate(terms):
                postings_list = postings_lists[term]
                postings = postings_list[1]
                weights = postings_list[2]
                for j, doc in enumerate(postings):
                    # for phrasal queries only docs that contain the whole
                    # phrase (see _judge) may contribute
                    if query_info.is_phrase and (doc not in candidates):
                        continue
                    scores[doc] += weights[j] * query_vector[i]
            # step 3: pivoted document length normalization -- disabled:
            # it depends on self.total_doc / self.average, which are never
            # initialized anywhere in this class.
            # for doc in scores:
            #     length = self.total_doc[doc]
            #     if self.pivoted:
            #         piv = 1 - self.rate + self.rate * length / self.average
            #         scores[doc] /= length * piv
            #     else:
            #         scores[doc] /= length
            # step 4: merge the per-query scores into the total scores
            for doc in scores:
                total_scores[doc] += scores[doc]
        # blend the VSM scores with the PageRank scores (in place)
        self._pagerank(total_scores)
        # step 5: order docs by descending score; (score, -doc) tuples give
        # the smaller docId precedence on equal scores
        ordered = sorted(((total_scores[doc], -doc) for doc in total_scores),
                         reverse=True)
        result = [-item[1] for item in ordered]
        score = [item[0] for item in ordered] if self.score else []
        return result, score

    def _pagerank(self, total_scores):
        """Blend normalized retrieval scores with PageRank scores.

        Mutates total_scores in place: each score is divided by the sum of
        all scores, then averaged with the doc's PageRank value self.a[0][doc].

        Args:
            total_scores: dict mapping docId to score

        Returns:
            total_scores: the same dict, updated in place
        """
        # sum() never raises on an empty sequence, so the old
        # try/except ValueError was dead code; the real hazard is a zero
        # denominator, which we guard explicitly (also covers empty input).
        denominator = sum(total_scores.values())
        if denominator == 0:
            return total_scores
        for key in total_scores:
            total_scores[key] /= denominator
        # average the normalized score with the precomputed PageRank value
        for key in total_scores:
            total_scores[key] = (total_scores[key] + self.a[0][key]) / 2
        return total_scores

    def _get_intersection(self, query_infos, postings_lists):
        """Compute, for each phrasal query, the docs containing all terms.

        Stores the intersection in query_info.candidates (a set; an empty
        list for a phrase with no terms). Non-phrase queries are untouched.

        Args:
            query_infos: a list of QueryInfo instances
            postings_lists: dict mapping terms to posting lists
        """
        for query_info in query_infos:
            terms = query_info.terms
            if not query_info.is_phrase:
                continue
            if len(terms) == 0:
                query_info.candidates = []
                continue
            # merge the shortest postings lists first so intermediate
            # results stay as small as possible
            costs = sorted(((term, len(postings_lists[term][1]))
                            for term in terms), key=lambda pair: pair[1])
            # pairwise sorted-list intersection, starting from the shortest
            result = postings_lists[costs[0][0]][1]
            for term, _ in costs[1:]:
                postings = postings_lists[term][1]
                merged = array.array('i')
                p1 = p2 = 0
                len1, len2 = len(result), len(postings)
                while p1 < len1 and p2 < len2:
                    doc1, doc2 = result[p1], postings[p2]
                    if doc1 == doc2:
                        merged.append(doc1)
                        p1 += 1
                        p2 += 1
                    elif doc1 < doc2:
                        p1 += 1
                    else:
                        p2 += 1
                result = merged
            query_info.candidates = set(result)

    def _judge(self, query_infos, postings_lists):
        """Filter phrase candidates down to docs that contain the phrase.

        For every phrasal query, verifies via the per-token position lists
        that the tokens appear at consecutive positions in each candidate
        doc, and replaces query_info.candidates with the docs that qualify.

        Args:
            query_infos: a list of QueryInfo instances
            postings_lists: dict mapping terms to posting lists
        """
        for query_info in query_infos:
            tokens = query_info.tokens
            if not query_info.is_phrase:
                continue
            if len(tokens) <= 1:
                # a single-token "phrase" is fully handled by the intersection
                continue
            candidates = query_info.candidates
            # positions[doc][i] is the position list of token i inside doc
            positions = defaultdict(lambda: [])
            for i, token in enumerate(tokens):
                postings_list = postings_lists[token]
                postings = postings_list[1]
                for j in range(len(postings)):
                    docId = postings[j]
                    if docId in candidates:
                        positions[docId].append(postings_list[3][j])
            # check every candidate doc with a backtracking pointer walk
            ans = set()
            for doc in positions:
                position = positions[doc]
                pointers = [0] * len(position)
                index = 1
                flag = False
                # prev_pos = matched position of the previous token
                prev_pos = position[0][0]
                while True:
                    pointer = pointers[index]
                    length = len(position[index])
                    # advance while the NEXT position could still continue
                    # the phrase (i.e. is <= prev_pos + 1)
                    while pointer + 1 < length:
                        tmp = position[index][pointer + 1]
                        if tmp <= prev_pos + 1:
                            pointer += 1
                        else:
                            break
                    pointers[index] = pointer
                    cur_pos = position[index][pointer]
                    if cur_pos != prev_pos + 1:
                        # mismatch: backtrack to the previous token and try
                        # its next occurrence
                        index -= 1
                        pointers[index] += 1
                        if pointers[index] >= len(position[index]):
                            # previous token exhausted -> doc has no phrase
                            break
                        if index == 0:
                            # restart the walk from the next occurrence of
                            # the first token
                            index += 1
                            pointer = pointers[index - 1]
                            prev_pos = position[index - 1][pointer]
                            continue
                    else:
                        # matched: move on to the next token
                        prev_pos = cur_pos
                        index += 1
                        if index >= len(position):
                            # all tokens matched at consecutive positions
                            flag = True
                            break
                if flag:
                    ans.add(doc)
            query_info.candidates = ans