Beispiel #1
0
    def print_statistics(self, methods):
        single_queries = Query(self.collection_path).get_queries_of_length(1)
        queries = {ele['num']:ele['title'] for ele in single_queries}
        cs = CollectionStats(self.collection_path)
        performance = Performances(self.collection_path)
        res = performance.gen_optimal_performances_queries(methods, queries.keys())

        avdl = cs.get_avdl()
        total_terms = cs.get_total_terms()
        collection_freq = []
        for qid in queries:
            idx = 0
            ctf = cs.get_term_collection_occur(queries[qid])
            idf = cs.get_term_logidf1(queries[qid])
            collection_freq.append( ctf*1.0/total_terms )
        print avdl
        print np.mean(collection_freq)

        for ele in res:
            label = ele[0]
            p = ele[1]
            para = float(ele[2].split(':')[1])
            print label
            if 'okapi' in label:
                print 'b:', para, 'beta:', 1.2*para/avdl, 'c2:', 1.2*(1-para)
            if 'pivoted' in label:
                print 's:', para, 'beta:', para/avdl, 'c2:', 1-para
Beispiel #2
0
 def print_best_performances(self, methods=[]):
     single_queries = Query(self.collection_path).get_queries_of_length(1)
     queries = {ele['num']:ele['title'] for ele in single_queries}
     cs = CollectionStats(self.collection_path)
     performance = Performances(self.collection_path)
     res = performance.gen_optimal_performances_queries(methods, queries.keys())
     print res
def gen_output_performances_batch():
    """Collect performance-output jobs for every collection and dispatch them.

    Reads ``collections.json`` (relative to the current working directory).
    For each entry it takes the ``'collection'`` name, makes sure the
    directory ``output_root/<name>`` exists, and asks ``Performances`` (built
    on ``index_root/lucene-index.<name>.cnt.1``) for its job parameters. The
    parameters accumulated over all collections are passed to
    ``gen_batch_framework`` together with ``output_performances_atom``.
    """
    batch_paras = []
    with open('collections.json') as cf:
        collections = json.load(cf)
        for entry in collections:
            name = entry['collection']

            # Per-collection output directory, created on first use.
            out_dir = os.path.join(output_root, name)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)

            index_name = 'lucene-index.' + name + '.cnt.1'
            index_path = os.path.join(index_root, index_name)

            paras = Performances(index_path).gen_output_performances_paras(
                out_dir)
            batch_paras.extend(paras)

    gen_batch_framework(batch_paras, output_performances_atom)
def print_optimal_performances(metrics=['map']):
    # with open('g.json') as f:
    #     methods = [m['name'] for m in json.load(f)['methods']]
    # if os.path.exists('microblog_funcs.json'):
    #     with open('microblog_funcs.json') as f:
    #         methods.extend([m['name'] for m in json.load(f)['methods']])

    with open('collections.json') as cf:
        for c in json.load(cf):
            collection_name = c['collection']
            this_output_root = os.path.join(output_root, collection_name)
            index_path = os.path.join(
                index_root, 'lucene-index.' + collection_name + '.cnt.1')
            print
            print collection_name
            print '=' * 30
            Performances(index_path).print_optimal_performance(
                this_output_root, metrics)
def output_performances_atom(para):
    """Run one performance-output job described by the sequence `para`.

    Layout of `para`:
        para[0]  -- index path used to construct ``Performances``
        para[1]  -- output file name
        para[2:] -- input file names

    The three pieces are forwarded to ``Performances.output_performances``.
    """
    # Pull all fields out before constructing Performances, as the
    # original did, so a short `para` fails before any work starts.
    index_path, output_fn, input_fns = para[0], para[1], para[2:]
    Performances(index_path).output_performances(output_fn, input_fns)