Exemple #1
0
 def run_query(self, query, query_id=None):
     """Score documents against ``query`` and return ``{docid: score}``.

     For every query term present in ``self.index``, each document in that
     term's posting dict is scored with ``score_BM25`` and the per-term
     scores are summed per document.

     Args:
         query: iterable of query terms.
         query_id: optional key into ``self.dev_candidates``, whose entries
             are ``(candidate_docid, is_relevant)`` pairs. When given, only
             documents listed as candidates for that query are scored; when
             ``None`` every document containing a query term is scored.

     Returns:
         dict mapping docid to its accumulated score. Documents that match
         no query term, or that are filtered out as non-candidates, are
         absent from the result.

     Progress information is printed to stdout. Every ``print`` call takes
     a single pre-formatted argument so the output is the same on Python 2
     and Python 3.
     """
     query_result = dict()
     # Built lazily on first use: ``self.dev_candidates`` is only consulted
     # once a query term actually has a posting to filter, and the set makes
     # each membership test O(1) instead of a scan of the candidate list.
     candidate_ids = None
     print(query_id)
     for term in query:
         if term not in self.index:
             continue
         print('Term: %s' % (term,))
         doc_dict = self.index[term]  # retrieve index entry
         scored = 0  # documents scored for this term; throttles the log below
         for docid, freq in doc_dict.items():  # each document and its term frequency
             if query_id is not None:
                 if candidate_ids is None:
                     candidate_ids = set(
                         candidate for candidate, _ in self.dev_candidates[query_id])
                 if docid not in candidate_ids:
                     continue  # not a candidate for this query
             print('shouldConsider True')
             if scored % 100 == 0:
                 print('Doc Id:  %s' % (docid,))
             scored += 1
             score = score_BM25(n=len(doc_dict), f=freq, qf=1, r=0, N=len(self.dlt),
                                dl=self.dlt.get_length(docid),
                                avdl=self.dlt.get_average_length())  # calculate score
             if docid in query_result:  # this document has already been scored once
                 query_result[docid] += score
             else:
                 query_result[docid] = score
         print('=======\n')
     return query_result
Exemple #2
0
    def run_query(self, query, k1_value, k2_value, b_value):
        """Score documents against ``query`` with tunable BM25 parameters.

        Args:
            query: iterable of query terms (repeats allowed).
            k1_value: forwarded to ``score_BM25`` as ``k1``.
            k2_value: forwarded to ``score_BM25`` as ``k2``.
            b_value: forwarded to ``score_BM25`` as ``b``.

        Returns:
            dict mapping docid to the sum of its per-term scores. Documents
            that contain none of the query terms are absent.
        """
        query_result = dict()
        # Query-side term frequency, passed to score_BM25 as ``qf``.
        # ``dict.get`` replaces the Python-2-only ``dict.has_key``.
        query_fre = dict()
        for word in query:
            query_fre[word] = query_fre.get(word, 0) + 1
        # NOTE(review): a term repeated in ``query`` is scored once per
        # occurrence, each time with its full ``qf`` - confirm this double
        # weighting is intended before deduplicating.
        for term in query:
            if term not in self.index:
                continue
            doc_dict = self.index[term]  # retrieve index entry
            # ``.items()`` works on both Python 2 and 3; ``.iteritems()``
            # does not exist on Python 3.
            for docid, freq in doc_dict.items():  # each document and its term frequency
                score = score_BM25(n=len(doc_dict),
                                   f=freq,
                                   qf=query_fre[term],
                                   r=0,
                                   N=len(self.dlt),
                                   dl=self.dlt.get_length(docid),
                                   avdl=self.dlt.get_average_length(),
                                   k1=k1_value,
                                   k2=k2_value,
                                   b=b_value)  # calculate score
                if docid in query_result:  # this document has already been scored once
                    query_result[docid] += score
                else:
                    query_result[docid] = score
        return query_result
Exemple #3
0
 def run_query(self, query):
     """Return a ``{docid: score}`` dict for the terms in ``query``.

     Each term found in ``self.index`` contributes one ``score_BM25`` value
     per document in its posting dict; contributions for the same document
     are added together. Terms missing from the index are ignored.
     """
     totals = {}
     for term in query:
         if term not in self.index:
             continue
         postings = self.index[term]  # docid -> frequency of this term
         for docid, freq in postings.items():
             term_score = score_BM25(
                 n=len(postings),
                 f=freq,
                 qf=1,
                 r=0,
                 N=len(self.dlt),
                 dl=self.dlt.get_length(docid),
                 avdl=self.dlt.get_average_length(),
             )
             if docid not in totals:
                 # first term that hits this document
                 totals[docid] = term_score
             else:
                 totals[docid] += term_score
     return totals
Exemple #4
0
	def run_query(self, query):
		"""Return a ``{docid: score}`` dict for the terms in ``query``.

		For every query term present in ``self.index``, each document in the
		term's posting dict is scored with ``score_BM25`` and the scores are
		summed per document. Terms missing from the index are skipped.
		"""
		query_result = dict()
		for term in query:
			if term not in self.index:
				continue
			doc_dict = self.index[term]  # retrieve index entry
			# ``.items()`` instead of ``.iteritems()``: the latter does not
			# exist on Python 3, while ``.items()`` works on both versions.
			for docid, freq in doc_dict.items():  # each document and its term frequency
				score = score_BM25(
					n=len(doc_dict), f=freq, qf=1, r=0, N=len(self.dlt),
					dl=self.dlt.get_length(docid),
					avdl=self.dlt.get_average_length())  # calculate score
				if docid in query_result:  # this document has already been scored once
					query_result[docid] += score
				else:
					query_result[docid] = score
		return query_result
Exemple #5
0
 def run_query(self, query):
     """Return a ``{docid: score}`` dict for the terms in ``query``.

     For every query term present in ``self.index``, each document in the
     term's posting dict is scored with ``score_BM25`` and the scores are
     summed per document. Terms missing from the index are skipped.
     """
     query_result = dict()
     for term in query:
         if term not in self.index:
             continue
         doc_dict = self.index[term]  # docid -> frequency of this term
         # ``.items()`` instead of ``.iteritems()``: the latter does not
         # exist on Python 3, while ``.items()`` works on both versions.
         for docid, freq in doc_dict.items():
             score = score_BM25(n=len(doc_dict), f=freq, qf=1, r=0, N=len(self.dlt),
                                dl=self.dlt.get_length(docid),
                                avdl=self.dlt.get_average_length())
             if docid in query_result:  # already scored for an earlier term
                 query_result[docid] += score
             else:
                 query_result[docid] = score
     return query_result
Exemple #6
0
 def run_query(self, query):
     """Accumulate a BM25 score per document for the words in ``query``.

     Returns a dict mapping docid to the sum of ``score_BM25`` values over
     all query words that occur in that document. Words that are not in
     ``self.index`` contribute nothing.
     """
     scores = {}
     # Walk over every word of the question.
     for word in query:
         if word not in self.index:
             continue
         # Per-document occurrence counts of the current word.
         per_doc_counts = self.index[word]
         # docid: document id; freq: how often this query word occurs in
         # that document.
         for docid, freq in per_doc_counts.items():
             contribution = score_BM25(
                 n=len(per_doc_counts),
                 f=freq,
                 qf=1,
                 r=0,
                 N=len(self.dlt),
                 dl=self.dlt.get_length(docid),
                 avdl=self.dlt.get_average_length(),
             )
             try:
                 scores[docid] += contribution
             except KeyError:
                 # first query word that hits this document
                 scores[docid] = contribution
     return scores
Exemple #7
0
    def run_query(self, query):
        """Return ``{docid: score}`` summed over the words of ``query``.

        ``self.index`` is read as ``self.index[word][docid] -> frequency``.
        Every query word found there adds one ``score_BM25`` value to each
        document in its entry; words not in the index are skipped.
        """
        doc_scores = {}
        for word in query:
            if word not in self.index:
                continue
            entry = self.index[word]  # documents containing this word
            # Visit each document together with the current query word's
            # frequency in it.
            for docid, freq in entry.items():
                word_score = score_BM25(n=len(entry), f=freq, qf=1, r=0,
                                        N=len(self.dlt),
                                        dl=self.dlt.get_length(docid),
                                        avdl=self.dlt.get_average_length())
                # Add to the running total if an earlier word already
                # scored this document, otherwise start a new total.
                doc_scores[docid] = (
                    doc_scores[docid] + word_score
                    if docid in doc_scores
                    else word_score
                )
        return doc_scores
Exemple #8
0
	def run_query(self, query, query_run_count, results_directory):
		"""Score documents for ``query`` with keyword-weighted BM25.

		For each query term a weight is looked up in ``self.keywords``
		(stored value divided by 100; 1 when the term has no truthy entry)
		and, for weighted terms, a keyword type in ``self.keyword_types``.
		Every document in the term's ``self.index`` entry is scored with
		``score_BM25`` and the scores are summed per document.

		Args:
			query: iterable of query terms (strings).
			query_run_count: passed through unchanged to the ``write_*`` helpers.
			results_directory: passed through unchanged to the ``write_*`` helpers.

		Returns:
			dict mapping doc_id to its accumulated score.

		Side effects:
			Calls ``self.write_weights_file`` once per term,
			``self.write_details_file`` once per scored (term, document)
			pair and ``self.write_category_file`` once at the end with the
			per-document ``{keyword_type: summed frequency}`` counts.
		"""
		# Type recorded for terms that have no weight or no type entry.
		# Previously ``keyword_type`` was left unbound in that case (raising
		# UnboundLocalError) or silently carried over from an earlier term.
		default_keyword_type = 'unknown'

		query_result = dict()
		doc_category_count = dict()  # key: doc_id, val: cat_counts_dict
		for term in query:
			# look for term weights, if we don't find one assign a 1
			keyword_type = default_keyword_type
			keyword_weight = self.keywords.get(term)
			if keyword_weight:
				weight = keyword_weight/100
				# The type is only looked up for weighted terms, as before.
				keyword_type = self.keyword_types.get(term) or default_keyword_type
			else:
				weight = 1
			weight_string = term + ',' + str(weight*100) + '\n'
			self.write_weights_file(results_directory, query_run_count, weight_string)

			if term not in self.index:
				continue
			doc_dict = self.index[term]  # retrieve index entry
			for doc_id, freq in doc_dict.items():  # for each document and its word frequency
				cat_counts_for_doc = doc_category_count.get(doc_id)
				if cat_counts_for_doc is None:
					cat_counts_for_doc = defaultdict(int)
					doc_category_count[doc_id] = cat_counts_for_doc
				cat_counts_for_doc[keyword_type] += freq

				# calculate score
				score = score_BM25(weight=weight, n=len(doc_dict), f=freq, qf=1, r=0, N=len(self.dlt),
								   dl=self.dlt.get_length(doc_id), avdl=self.dlt.get_average_length())

				if doc_id in query_result:  # this document has already been scored once
					query_result[doc_id] += score
				else:
					query_result[doc_id] = score
				detail_string = str(doc_id) + ',' + str(freq) + ',' + str(score) + ',' + term + ',' \
								+ str(weight) + ',' + keyword_type
				self.write_details_file(results_directory, query_run_count, detail_string)

		self.write_category_file(results_directory, query_run_count, doc_category_count)
		return query_result