Example #1
0
    def get_docterms_matrix(self):
        """Accumulate a per-document score over this object's terms.

        Every term in ``self.terms`` that has an entry in
        ``self.terms_dictionary`` with a positive first field contributes
        ``util.es(...)`` to each document in ``self.patent_info``; the
        contributions are summed per document id.

        Returns:
            defaultdict(float): document id -> summed score. A document only
            gets an entry when at least one term contributed to it.
        """
        scores = defaultdict(float)
        corpus_size = self.corpus_size
        avg_doc_length = self.avg_doc_length

        # Term-level statistics do not depend on the document, so gather them
        # once instead of once per (document, term) pair.
        term_stats = []
        for term in self.terms:
            if term not in self.terms_dictionary:
                continue
            entry = self.terms_dictionary[term]
            # entry[0] / entry[1] feed the `df` / `cf` arguments of util.es
            # (presumably document and collection frequency -- confirm).
            df = int(entry[0])
            cf = int(entry[1])
            if df > 0:
                term_stats.append((term, self.get_tf(term), df, cf))

        if not term_stats:
            # No usable term: nothing can be added to any document.
            return scores

        for doc_id, doc in self.patent_info.items():
            doc_length = len(doc.terms)
            for term, tfq, df, cf in term_stats:
                tfd = doc.get_tf(term)
                scores[doc_id] += util.es(tfq, tfd, df, corpus_size, cf,
                                          doc_length, avg_doc_length)

        return scores
Example #2
0
    def get_ranked_docs(self):
        """Rank the documents in ``self.patent_info`` by summed term score.

        Every term in ``self.terms`` that has an entry in
        ``self.terms_dictionary`` with a positive first field contributes
        ``util.es(...)`` to each document; corpus size and average document
        length are read from ``self.initial_query``.

        Returns:
            list: document ids ordered by descending score. Only documents
            that received at least one contribution are included; ties keep
            the iteration order of ``self.patent_info``.
        """
        corpus_size = self.initial_query.corpus_size
        avg_doc_length = self.initial_query.avg_doc_length

        # Term-level statistics do not depend on the document, so gather them
        # once instead of once per (document, term) pair.
        term_stats = []
        for term in self.terms:
            if term not in self.terms_dictionary:
                continue
            entry = self.terms_dictionary[term]
            # entry[0] / entry[1] feed the `df` / `cf` arguments of util.es
            # (presumably document and collection frequency -- confirm).
            df = int(entry[0])
            cf = int(entry[1])
            if df > 0:
                term_stats.append((term, self.get_tf(term), df, cf))

        if not term_stats:
            # No usable term: no document can receive a score.
            return []

        scores = defaultdict(float)
        for doc_id, doc in self.patent_info.items():
            doc_length = len(doc.terms)
            for term, tfq, df, cf in term_stats:
                tfd = doc.get_tf(term)
                scores[doc_id] += util.es(tfq, tfd, df, corpus_size, cf,
                                          doc_length, avg_doc_length)

        # sorted() is stable, so equal scores keep their insertion order.
        return sorted(scores, key=scores.get, reverse=True)