class JM_Retreiver: def __init__(self): self.helper = Helper() self.unigram_inverted_index = self.helper.unigram_inverted_index self.corpus_term_count = self.helper.corpus_frequency( self.unigram_inverted_index) self.CONSTANT = 0.35 self.number_of_ranked_docs = 100 def run(self, query, query_id): #query = self.helper.parse_query(query) terms = query.split() print "query is", query print "terms are", terms doc_scores = defaultdict(float) doc_list = [] for term in terms: if term in self.unigram_inverted_index.keys(): inverted_list = self.unigram_inverted_index[term] for doc_id in inverted_list.keys(): if doc_id not in doc_list: doc_list.append(doc_id) else: print term print "term ignored not in corpus" for term in terms: if term in self.unigram_inverted_index.keys(): for doc_id in doc_list: score = self.calculate_document_score(doc_id, term) doc_scores[doc_id] += score self.sort_scores(query, query_id, doc_scores) def sort_scores(self, query, query_id, doc_scores): sorted_scores = sorted(doc_scores.items(), key=operator.itemgetter(1), reverse=True) self.save_to_file(query, query_id, sorted_scores) def save_to_file(self, query, query_id, tf_dict): count = 1 file_name = 'JM_Output_Stemming/' + str(query_id) + '.txt' with open(file_name, 'w') as f: for word in tf_dict: if count <= self.number_of_ranked_docs: f.write(str(query_id)) f.write(" ") f.write("Q0") f.write(" ") f.write(word[0]) f.write(" ") f.write(str(count)) f.write(" ") f.write(str(word[1])) f.write(" ") f.write("LM_JM_Stemming_Unigram") f.write("\n") count += 1 else: break def calculate_document_score(self, doc_id, term): first_term = (1 - self.CONSTANT) * ( self.get_number_of_occurence_in_document(term, doc_id) / self.get_total_number_of_terms_in_document(doc_id)) second_term = self.CONSTANT * ( self.number_of_occurence_in_corpus(term) / self.get_total_number_of_terms_in_corpus()) score = math.log((first_term + second_term)) return score # Cqi def number_of_occurence_in_corpus(self, term): return self.corpus_term_count[term] * 1.0 # |C| def get_total_number_of_terms_in_corpus(self): return self.helper.total_number_of_terms_corpus * 1.0 # |D| def get_total_number_of_terms_in_document(self, doc_id): return self.helper.number_of_terms_doc[doc_id] * 1.0 # fqi, D def get_number_of_occurence_in_document(self, term, doc_id): documents_dict = self.unigram_inverted_index[term] if doc_id in documents_dict.keys(): return documents_dict[doc_id] * 1.0 return 0 def JM_test(self): queries = self.helper.get_stemmed_queries() for key in queries.keys(): self.run(queries[key], key)