Esempio n. 1
0
 def __init__(self, args):
     """Initialise the scorer from parsed arguments.

     Copies the ``bm25_k1``, ``bm25_b`` and ``bm25_avgdl`` values from
     ``args`` onto the instance. When ``args.index`` is truthy, a
     ``GalagoIndex`` over its ``postings.krovetz`` part is created as the
     frequency-statistics source; otherwise ``_freq_stats`` is ``None``.
     """
     super(BM25Score, self).__init__(args)

     self.k1 = args.bm25_k1
     self.b = args.bm25_b
     self.avgdl = args.bm25_avgdl

     # The index is optional: only build the statistics source when given.
     freq_stats = None
     if args.index:
         freq_stats = GalagoIndex(args.index, 'postings.krovetz')
     self._freq_stats = freq_stats
Esempio n. 2
0
def gen_freqstats(argv):
    """Generate frequency stats for the stems found in a model.

    Parses ``argv``, opens the Indri or Galago index at ``index_path``,
    gathers the whitespace-separated terms of the model's ``topics_stem`` and
    ``sentences_stem`` texts, and dumps index statistics for those terms to
    the model's ``freq_stats`` path. Progress messages go to stderr.

    Args:
        argv: list of command-line arguments, laid out as
            ``-m DIR index_path [index_part]``.

    Exits through ``parser.error`` when ``index_path`` is neither a valid
    Indri index nor a valid Galago index.
    """
    parser = argparse.ArgumentParser(
        prog='gen_freqstats',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        add_help=False,
    )

    parser.add_argument('-m',
                        dest='model',
                        metavar='DIR',
                        required=True,
                        help='store the processed data in DIR')
    parser.add_argument('index_path', help='path to Indri/Galago index')
    parser.add_argument(
        'index_part',
        nargs='?',
        help='(Galago only) index part: postings.krovetz or postings.porter')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # Indri is probed first; Galago is only tried when the path is not Indri.
    # Messages use sys.stderr.write rather than the Python-2-only
    # ``print >> sys.stderr`` statement so the function runs on 2 and 3 alike.
    if IndriIndex.is_valid_path(args.index_path):
        index = IndriIndex(args.index_path)
        sys.stderr.write('use Indri index\n')
    elif GalagoIndex.is_valid_path(args.index_path):
        # NOTE(review): index_part is None when omitted on the command line
        # and is passed through unchanged -- confirm GalagoIndex accepts that.
        index = GalagoIndex(args.index_path, args.index_part)
        sys.stderr.write('use Galago index\n')
    else:
        # parser.error prints the usage message and exits, so ``index`` is
        # always bound past this point.
        parser.error('must specify a valid Indri/Galago index')

    # Collect the distinct terms across both stemmed topics and sentences.
    term_set = set()
    for text, _ in model.load_topics('topics_stem'):
        term_set.update(text.split())
    for text, _ in model.load_sentences('sentences_stem'):
        term_set.update(text.split())

    sys.stderr.write('found {} stems\n'.format(len(term_set)))

    IndexDump.dump(model.get_path('freq_stats'), index, term_set)
Esempio n. 3
0
 def __init__(self, args):
     """Initialise the scorer from parsed arguments.

     Copies ``args.lm_mu`` onto the instance as ``mu``. When ``args.index``
     is truthy, a ``GalagoIndex`` over its ``postings.krovetz`` part is
     created as the frequency-statistics source; otherwise ``_freq_stats``
     is ``None``.
     """
     super(LanguageModelScore, self).__init__(args)

     self.mu = args.lm_mu

     # The index is optional: only build the statistics source when given.
     freq_stats = None
     if args.index:
         freq_stats = GalagoIndex(args.index, 'postings.krovetz')
     self._freq_stats = freq_stats