def __init__(self, args):
    """Initialise the BM25 scorer from parsed command-line arguments.

    Copies the BM25 settings (``bm25_k1``, ``bm25_b``, ``bm25_avgdl``) from
    ``args`` onto the instance and, when ``args.index`` is truthy, opens a
    Galago index on its ``postings.krovetz`` part as the frequency-stats
    source.  Without an index path, ``_freq_stats`` is left as ``None``.
    """
    super(BM25Score, self).__init__(args)

    self.k1 = args.bm25_k1
    self.b = args.bm25_b
    self.avgdl = args.bm25_avgdl

    # Only open the index when a path was actually supplied.
    if args.index:
        self._freq_stats = GalagoIndex(args.index, 'postings.krovetz')
    else:
        self._freq_stats = None
def gen_freqstats(argv):
    """Generate frequency stats.

    Parses ``argv``, opens the index found at ``index_path`` (Indri is tried
    first, then Galago), gathers every whitespace-separated term from the
    model's ``topics_stem`` and ``sentences_stem`` data, and passes the
    model's ``freq_stats`` path, the index and the term set to
    ``IndexDump.dump``.

    Progress messages are written to standard error.

    Args:
        argv: list of command-line argument strings (program name excluded).

    Raises:
        SystemExit: raised by argparse when required arguments are missing,
            or when ``index_path`` is neither a valid Indri nor a valid
            Galago index.
    """
    parser = argparse.ArgumentParser(
        prog='gen_freqstats',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        add_help=False,
    )
    parser.add_argument('-m', dest='model', metavar='DIR', required=True,
                        help='store the processed data in DIR')
    parser.add_argument('index_path', help='path to Indri/Galago index')
    parser.add_argument(
        'index_part', nargs='?',
        help='(Galago only) index part: postings.krovetz or postings.porter')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # NOTE: messages go through sys.stderr.write rather than the
    # ``print >> stream`` statement, which is Python 2-only syntax and fails
    # with a TypeError on Python 3.  The emitted text is unchanged.
    if IndriIndex.is_valid_path(args.index_path):
        index = IndriIndex(args.index_path)
        sys.stderr.write('use Indri index\n')
    elif GalagoIndex.is_valid_path(args.index_path):
        index = GalagoIndex(args.index_path, args.index_part)
        sys.stderr.write('use Galago index\n')
    else:
        # parser.error() prints usage and exits, so ``index`` is always bound
        # past this point.
        parser.error('must specify a valid Indri/Galago index')

    term_set = set()
    for text, _ in model.load_topics('topics_stem'):
        term_set.update(text.split())
    for text, _ in model.load_sentences('sentences_stem'):
        term_set.update(text.split())
    sys.stderr.write('found {} stems\n'.format(len(term_set)))

    IndexDump.dump(model.get_path('freq_stats'), index, term_set)
def __init__(self, args):
    """Initialise the language-model scorer from parsed arguments.

    Stores ``args.lm_mu`` as ``mu`` and, when ``args.index`` is truthy, opens
    a Galago index on its ``postings.krovetz`` part as the frequency-stats
    source.  Without an index path, ``_freq_stats`` is left as ``None``.
    """
    super(LanguageModelScore, self).__init__(args)

    self.mu = args.lm_mu

    # Only open the index when a path was actually supplied.
    if args.index:
        self._freq_stats = GalagoIndex(args.index, 'postings.krovetz')
    else:
        self._freq_stats = None