def term2idf_unstemmed(self, term):
    """Return a smoothed inverse document frequency for an unstemmed term.

    The raw ``term`` is first passed through ``J.A_AnalyzerUtils.analyze``
    with this object's analyzer. A score is only computed when that yields
    exactly one token, using the reader's statistics for the
    ``J.A_IndexArgs.CONTENTS`` field::

        log((numDocs + 1) / (df + 1))

    Any other token count (none, or more than one) returns ``0.``.
    """
    tokens = J.A_AnalyzerUtils.analyze(self._get_analyzer(), term).toArray()

    # Guard clause: nothing to look up unless analysis produced one token.
    if len(tokens) != 1:
        return 0.  # stop word; very common

    reader = self._reader()
    df = reader.docFreq(J.L_Term(J.A_IndexArgs.CONTENTS, tokens[0]))

    # The reader accessor is invoked again here, as the original did.
    total_docs = self._reader().numDocs()
    return math.log((total_docs + 1) / (df + 1))
def term2idf_unstemmed(self, term):
    """Return an inverse document frequency score for an unstemmed term.

    The raw ``term`` is first passed through ``J.A_AnalyzerUtils.tokenize``
    with this object's analyzer. A score is only computed when that yields
    exactly one token; the token's document frequency is looked up in
    ``self._primary_field`` and combined with the collection's document
    count as::

        log(1 + (doc_count - df + 0.5) / (df + 0.5))

    Any other token count (none, or more than one) returns ``0.``.
    """
    tokens = J.A_AnalyzerUtils.tokenize(self._get_analyzer(), term).toArray()

    # Guard clause: nothing to look up unless tokenization produced one token.
    if len(tokens) != 1:
        return 0.  # stop word; very common

    reader = self._reader()
    df = reader.docFreq(J.L_Term(self._primary_field, tokens[0]))
    doc_count = self.collection_stats().docCount()

    odds = (doc_count - df + 0.5) / (df + 0.5)
    return math.log(1 + odds)
def doc_freq(self, term):
    """Return the reader's document frequency for ``term``.

    ``term`` is wrapped as-is in a ``J.L_Term`` on ``self._primary_field``
    (no analysis is applied here) and handed to the reader's ``docFreq``.
    """
    reader = self._reader()
    field_term = J.L_Term(self._primary_field, term)
    return reader.docFreq(field_term)