def run_query(self, query, query_id=None):
    """Score every indexed document against *query* with BM25.

    Args:
        query: iterable of (already tokenised) query terms.
        query_id: optional query id used to restrict scoring to the
            candidate documents listed in ``self.dev_candidates[query_id]``
            (FiQA-style evaluation).  When None, every document containing
            a query term is scored.

    Returns:
        dict mapping docid -> accumulated BM25 score over all query terms.
    """
    query_result = dict()
    print(query_id)  # Python 2 `print x` statements fail on Python 3
    for term in query:
        if term not in self.index:
            continue  # term never indexed; contributes nothing
        print('Term:', term)
        doc_dict = self.index[term]  # postings: docid -> term frequency
        i = 0
        for docid, freq in doc_dict.items():
            # With a query_id we only score documents that appear in the
            # candidate pool for this query; relevance flags are ignored here.
            if query_id is not None:
                should_consider = any(
                    docid == candidate
                    for candidate, _is_relevant in self.dev_candidates[query_id])
            else:
                should_consider = True
            if not should_consider:
                continue
            print('shouldConsider', should_consider)
            if i % 100 == 0:  # progress marker every 100 scored docs
                print('Doc Id: ', docid)
            i += 1
            score = score_BM25(n=len(doc_dict), f=freq, qf=1, r=0,
                               N=len(self.dlt),
                               dl=self.dlt.get_length(docid),
                               avdl=self.dlt.get_average_length())
            if docid in query_result:  # already scored by another term
                query_result[docid] += score
            else:
                query_result[docid] = score
    print('=======\n')
    return query_result
def run_query(self, query, k1_value, k2_value, b_value):
    """BM25-score all documents for *query* with tunable k1/k2/b parameters.

    Args:
        query: iterable of query terms; repeated terms raise the qf factor.
        k1_value, k2_value, b_value: BM25 free parameters forwarded to
            ``score_BM25``.

    Returns:
        dict mapping docid -> accumulated BM25 score.
    """
    query_result = dict()
    # Within-query term frequency (the BM25 `qf` component).
    # dict.has_key() was removed in Python 3; use .get() with a default.
    query_fre = dict()
    for word in query:
        query_fre[word] = query_fre.get(word, 0) + 1
    for term in query:
        if term in self.index:
            doc_dict = self.index[term]  # postings: docid -> term frequency
            # .iteritems() is Python-2-only; .items() works on both.
            for docid, freq in doc_dict.items():
                score = score_BM25(n=len(doc_dict), f=freq,
                                   qf=query_fre[term], r=0,
                                   N=len(self.dlt),
                                   dl=self.dlt.get_length(docid),
                                   avdl=self.dlt.get_average_length(),
                                   k1=k1_value, k2=k2_value, b=b_value)
                if docid in query_result:  # already scored by another term
                    query_result[docid] += score
                else:
                    query_result[docid] = score
    return query_result
def run_query(self, query):
    """Accumulate a BM25 score per document for every term in *query*.

    Returns:
        dict mapping docid -> total BM25 score summed over all query terms.
    """
    scores = dict()
    # Collection-level statistics are loop-invariant; compute them once.
    total_docs = len(self.dlt)
    avg_doc_len = self.dlt.get_average_length()
    for term in query:
        postings = self.index.get(term)
        if postings is None:
            continue  # term absent from the collection
        doc_freq = len(postings)  # number of documents containing the term
        for doc_id, term_freq in postings.items():
            bm25 = score_BM25(n=doc_freq, f=term_freq, qf=1, r=0,
                              N=total_docs,
                              dl=self.dlt.get_length(doc_id),
                              avdl=avg_doc_len)
            scores[doc_id] = scores.get(doc_id, 0) + bm25
    return scores
def run_query(self, query):
    """Score each document containing a query term with BM25 (qf fixed at 1).

    Args:
        query: iterable of query terms.

    Returns:
        dict mapping docid -> accumulated BM25 score across all query terms.
    """
    query_result = dict()
    for term in query:
        if term in self.index:
            doc_dict = self.index[term]  # postings: docid -> term frequency
            # .iteritems() was removed in Python 3 and raises AttributeError
            # on modern dicts; .items() is correct on both 2 and 3.
            for docid, freq in doc_dict.items():
                score = score_BM25(n=len(doc_dict), f=freq, qf=1, r=0,
                                   N=len(self.dlt),
                                   dl=self.dlt.get_length(docid),
                                   avdl=self.dlt.get_average_length())
                if docid in query_result:  # already scored by another term
                    query_result[docid] += score
                else:
                    query_result[docid] = score
    return query_result
def run_query(self, query):
    """Rank documents for *query* by summed per-term BM25 scores.

    Args:
        query: iterable of query terms.

    Returns:
        dict mapping docid -> summed BM25 score.
    """
    query_result = dict()
    for term in query:
        if term in self.index:
            # Postings list for the term: docid -> frequency in that doc.
            doc_dict = self.index[term]
            # Fix: .iteritems() is Python-2-only; .items() works everywhere.
            for docid, freq in doc_dict.items():
                score = score_BM25(n=len(doc_dict), f=freq, qf=1, r=0,
                                   N=len(self.dlt),
                                   dl=self.dlt.get_length(docid),
                                   avdl=self.dlt.get_average_length())
                if docid in query_result:  # accumulate over query terms
                    query_result[docid] += score
                else:
                    query_result[docid] = score
    return query_result
def run_query(self, query):
    """Compute BM25 relevance of every indexed document to *query*.

    For each word of the query, the inverted index supplies its occurrence
    count per article; each count is turned into a BM25 contribution and
    summed per article.

    Returns:
        dict mapping article id -> total BM25 score.
    """
    result = {}
    for word in query:  # walk over every word in the question
        if word not in self.index:
            continue
        # Occurrence count of this word in each article that contains it.
        postings = self.index[word]
        for article_id, count in postings.items():
            contribution = score_BM25(n=len(postings), f=count, qf=1, r=0,
                                      N=len(self.dlt),
                                      dl=self.dlt.get_length(article_id),
                                      avdl=self.dlt.get_average_length())
            if article_id in result:
                result[article_id] += contribution
            else:
                result[article_id] = contribution
    return result
def run_query(self, query):
    """Score all documents matching *query* with BM25 and sum per document.

    Each query word contributes one BM25 term to every document containing
    it (``self.index[word][docid]`` holds the in-document frequency).

    Returns:
        dict of docid -> summed BM25 score for the whole query.
    """
    accumulated = dict()
    for word in query:
        try:
            postings = self.index[word]  # docid -> frequency of this word
        except KeyError:
            continue  # word does not occur anywhere in the collection
        for doc, frequency in postings.items():
            contribution = score_BM25(
                n=len(postings), f=frequency, qf=1, r=0,
                N=len(self.dlt),
                dl=self.dlt.get_length(doc),
                avdl=self.dlt.get_average_length())
            # Sum the contribution into whatever earlier terms produced.
            accumulated[doc] = accumulated.get(doc, 0) + contribution
    return accumulated
def run_query(self, query, query_run_count, results_directory):
    """Score documents for *query* with weighted BM25 and log run artifacts.

    For every term, a weight (stored as a percent) and a category are looked
    up in ``self.keywords`` / ``self.keyword_types``; missing entries fall
    back to weight 1 and category ``'unknown'``.  Per-term weights, per-hit
    details and per-document category counts are written via the
    ``write_*_file`` helpers under *results_directory*.

    Args:
        query: iterable of query terms.
        query_run_count: run identifier forwarded to the output helpers.
        results_directory: destination directory for the output helpers.

    Returns:
        dict mapping doc_id -> accumulated weighted BM25 score.
    """
    query_result = dict()
    doc_category_count = dict()  # doc_id -> {category: summed term freq}
    term_count = 0
    for term in query:
        term_count += 1
        # Reset defaults on every iteration.  The original bound
        # `keyword_type` only inside the nested ifs, so a term with no
        # keyword-type entry (or absent from self.keywords entirely) raised
        # NameError below — or silently reused the previous term's values.
        weight = 1
        keyword_type = 'unknown'
        if self.keywords.get(term):
            weight = self.keywords.get(term) / 100
            if self.keyword_types.get(term):
                keyword_type = self.keyword_types.get(term)
        weight_string = term + ',' + str(weight * 100) + '\n'
        self.write_weights_file(results_directory, query_run_count,
                                weight_string)
        if term in self.index:
            doc_dict = self.index[term]  # postings: doc_id -> term frequency
            for doc_id, freq in doc_dict.items():
                # Tally this term's frequency under its category per doc.
                if doc_id in doc_category_count:
                    cat_counts_for_doc = doc_category_count[doc_id]
                else:
                    cat_counts_for_doc = defaultdict(int)
                    doc_category_count[doc_id] = cat_counts_for_doc
                cat_counts_for_doc[keyword_type] += freq
                score = score_BM25(weight=weight, n=len(doc_dict), f=freq,
                                   qf=1, r=0, N=len(self.dlt),
                                   dl=self.dlt.get_length(doc_id),
                                   avdl=self.dlt.get_average_length())
                if doc_id in query_result:  # already scored by another term
                    query_result[doc_id] += score
                else:
                    query_result[doc_id] = score
                detail_string = (str(doc_id) + ',' + str(freq) + ',' +
                                 str(score) + ',' + term + ',' +
                                 str(weight) + ',' + keyword_type)
                self.write_details_file(results_directory, query_run_count,
                                        detail_string)
    self.write_category_file(results_directory, query_run_count,
                             doc_category_count)
    return query_result