def get_docterms_matrix(self):
    """Accumulate a per-document score over this object's terms.

    Every term in ``self.terms`` that has an entry in
    ``self.terms_dictionary`` with a positive first element (read as ``df``)
    is scored against every document in ``self.patent_info`` with
    ``util.es``; the values are summed per document id.

    Returns:
        A ``defaultdict(float)`` mapping document id to its summed score.
        A document id is only present if at least one term was scored
        for it.
    """
    result = defaultdict(float)
    N = self.corpus_size
    avg_doc_length = self.avg_doc_length

    # Per-term statistics do not depend on the document, so gather them once
    # instead of once per (document, term) pair.  Order and duplicates of
    # self.terms are kept so the per-document sums are accumulated exactly
    # as before.
    scored_terms = []
    for term in self.terms:
        if term not in self.terms_dictionary:
            continue
        entry = self.terms_dictionary[term]
        # entry[0] is used as df and entry[1] as cf (presumably document
        # and collection frequency -- confirm against the index builder).
        df = int(entry[0])
        cf = int(entry[1])
        if df > 0:
            scored_terms.append((term, self.get_tf(term), df, cf))

    if not scored_terms:
        return result

    # Scoring model: util.es.  The original carried two disabled
    # alternatives here (a product of two util.bm25 scores, and util.dfr).
    for doc_id, doc in self.patent_info.items():
        doc_length = len(doc.terms)
        for term, tfq, df, cf in scored_terms:
            tfd = doc.get_tf(term)
            result[doc_id] += util.es(
                tfq, tfd, df, N, cf, doc_length, avg_doc_length
            )
    return result
def get_ranked_docs(self):
    """Rank the documents in ``self.patent_info`` by their summed term score.

    Every term in ``self.terms`` that has an entry in
    ``self.terms_dictionary`` with a positive first element (read as ``df``)
    is scored against every document with ``util.es``; the values are
    summed per document id.  Corpus size and average document length are
    taken from ``self.initial_query``.

    Returns:
        A list of document ids sorted by descending score.  Documents for
        which no term was scored are omitted; ids with equal scores keep
        the iteration order of ``self.patent_info``.
    """
    scores = defaultdict(float)
    N = self.initial_query.corpus_size
    avg_doc_length = self.initial_query.avg_doc_length

    # Per-term statistics do not depend on the document, so gather them once
    # instead of once per (document, term) pair.  Order and duplicates of
    # self.terms are kept so the per-document sums are accumulated exactly
    # as before.
    scored_terms = []
    for term in self.terms:
        if term not in self.terms_dictionary:
            continue
        entry = self.terms_dictionary[term]
        # entry[0] is used as df and entry[1] as cf (presumably document
        # and collection frequency -- confirm against the index builder).
        df = int(entry[0])
        cf = int(entry[1])
        if df > 0:
            scored_terms.append((term, self.get_tf(term), df, cf))

    if not scored_terms:
        return []

    # Scoring model: util.es.  The original carried two disabled
    # alternatives here (a product of two util.bm25 scores, and util.dfr).
    for doc_id, doc in self.patent_info.items():
        doc_length = len(doc.terms)
        for term, tfq, df, cf in scored_terms:
            tfd = doc.get_tf(term)
            scores[doc_id] += util.es(
                tfq, tfd, df, N, cf, doc_length, avg_doc_length
            )

    # Iterating the mapping yields ids; sort them by their score, best first.
    return sorted(scores, key=scores.get, reverse=True)