def gen_freqstats(argv):
    """ Generate frequency stats """
    parser = argparse.ArgumentParser(
        prog='gen_freqstats',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        add_help=False,
    )
    parser.add_argument('-m', dest='model', metavar='DIR', required=True,
                        help='store the processed data in DIR')
    parser.add_argument('index_path', help='path to Galago index')
    parser.add_argument('index_part',
                        help='index part: postings.krovetz or postings.porter')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # collect every stem that occurs in the stored topics and sentences
    term_set = set()
    for text, _ in model.load_topics('topics_stem'):
        term_set.update(text.split())
    for text, _ in model.load_sentences('sentences_stem'):
        term_set.update(text.split())
    print >>sys.stderr, 'found {} stems'.format(len(term_set))

    # dump frequency statistics for the collected stems from the Galago index
    GalagoIndexDump.dump(model.get_path('freq_stats'),
                         args.index_path, args.index_part, term_set)
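
# A minimal usage sketch, not part of the original module: the model directory and
# index path below are hypothetical, and the call simply mirrors the command-line
# form "gen_freqstats -m DIR INDEX_PATH INDEX_PART" implied by the parser above.
def _gen_freqstats_example():
    gen_freqstats(['-m', 'model_dir', '/path/to/galago-index', 'postings.krovetz'])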
def compute(self, model):
    result = []
    if not self._freq_stats:
        self._freq_stats = GalagoIndexDump.load(model.get_path('freq_stats'))
    collection_len = self._freq_stats.collection_length()

    topics_stem = model.load_topics('topics_stem')
    queries = dict((m['qid'], text.split()) for text, m in topics_stem)

    sentences_stem = model.load_sentences('sentences_stem')
    for text, m in sentences_stem:
        stems = text.split()
        sentence_tf = collections.Counter(stems)
        sentence_len = len(stems)
        score = float(0)
        for query_stem in queries[m['qid']]:
            cf = self._freq_stats.cf(query_stem)
            if cf == 0:
                continue
            score += math.log(
                float(sentence_tf[query_stem] + self.mu * float(cf) / collection_len) /
                (sentence_len + self.mu))
        result.append(score)
    return result
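
# The loop above is Dirichlet-smoothed query likelihood: each query stem contributes
# log((tf + mu * cf / |C|) / (|S| + mu)), with tf the stem count in the sentence,
# cf its collection frequency, |C| the collection length, and |S| the sentence length.
# Below is a self-contained sketch of the same formula with plain dicts standing in
# for the frequency-stats object; the mu default is only illustrative, not the
# class's actual setting. It reuses the module's math/collections imports.
def _dirichlet_ql_sketch(query_stems, sentence_stems, cf, collection_len, mu=2500.0):
    """Toy Dirichlet-smoothed QL score of a sentence against query stems."""
    tf = collections.Counter(sentence_stems)
    sentence_len = len(sentence_stems)
    score = 0.0
    for stem in query_stems:
        if cf.get(stem, 0) == 0:
            continue  # skip stems unseen in the collection, as compute() does
        score += math.log(
            (tf[stem] + mu * float(cf[stem]) / collection_len) / (sentence_len + mu))
    return score

# e.g. _dirichlet_ql_sketch(['cat'], ['the', 'cat', 'sat'], {'cat': 100}, 10 ** 6)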
def compute(self, model):
    result = []
    if not self._freq_stats:
        self._freq_stats = GalagoIndexDump.load(model.get_path('freq_stats'))
    N = self._freq_stats.num_docs()

    topics_stem = model.load_topics('topics_stem')
    queries = dict((m['qid'], text.split()) for text, m in topics_stem)

    for text, m in model.load_sentences('sentences_stem'):
        stems = text.split()
        sentence_tf = collections.Counter(stems)
        sentence_len = len(stems)
        score = float(0)
        for query_stem in queries[m['qid']]:
            df = self._freq_stats.df(query_stem)
            comp1 = math.log(float(N - df + 0.5) / (df + 0.5))
            comp2 = float(sentence_tf[query_stem] * (self.k1 + 1))
            comp3 = sentence_tf[query_stem] + \
                self.k1 * (1 - self.b + float(self.b * sentence_len) / self.avgdl)
            score += comp1 * comp2 / comp3
        result.append(score)
    return result
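
# comp1/comp2/comp3 above are the standard BM25 pieces: an idf term
# log((N - df + 0.5) / (df + 0.5)) times a saturated, length-normalised term
# frequency tf * (k1 + 1) / (tf + k1 * (1 - b + b * |S| / avgdl)). Below is a
# self-contained sketch with plain dicts in place of the frequency-stats object;
# the k1/b defaults are only illustrative, not the class's actual settings.
def _bm25_sketch(query_stems, sentence_stems, df, num_docs, avgdl, k1=1.2, b=0.75):
    """Toy BM25 score of a sentence against query stems."""
    tf = collections.Counter(sentence_stems)
    sentence_len = len(sentence_stems)
    score = 0.0
    for stem in query_stems:
        d = df.get(stem, 0)
        idf = math.log((num_docs - d + 0.5) / (d + 0.5))
        saturation = (tf[stem] * (k1 + 1)) / \
            (tf[stem] + k1 * (1 - b + b * float(sentence_len) / avgdl))
        score += idf * saturation
    return score

# e.g. _bm25_sketch(['cat'], ['the', 'cat', 'sat'], {'cat': 120}, 10000, 20.0)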