Ejemplo n.º 1
0
 def gen_ranking_list(self, method, _callback, paras, max_docs=1000):
     """
     Score documents for every single-term query, then write and
     evaluate the resulting run.

     For each query returned by ``get_queries_of_length(1)``, the rows
     yielded by ``GenSqaDocDetails.get_qid_details(qid)`` are consumed in
     the order they are supplied and at most ``max_docs`` of them are
     scored.  The original note on this method says the rows come from
     /collection_path/detailed_doc_stats/ and hold the top documents per
     query of a Dirichlet language model run; that cannot be confirmed
     from this method alone.  Note that the previous docstring spoke of
     10,000 documents while the code has always stopped after 1,000.

     Args:
         method: Passed unchanged to ``self.output_results`` and
             ``self.eval``.
         _callback: Called once per scored row with a single list
             argument: a deep copy of ``paras`` followed by
             ``[total_tf, doc_len, avdl, ctf, total_terms, idf]``.
             Its return value is stored as the document's score.
         paras: Leading arguments for ``_callback``.  Must support
             ``extend`` once copied (presumably a list -- confirm with
             callers).  It is deep-copied per row, so the caller's
             object is never modified here.
         max_docs: Maximum number of rows scored per query.  Defaults to
             1000, the cutoff that used to be hard-coded.  ``None``
             removes the limit.

     Returns:
         None.  Results are handed to ``self.output_results(res, method)``
         as ``{qid: [(docid, score), ...]}`` and then ``self.eval(method)``
         is invoked.
     """
     # Function-scope import so this method does not depend on the
     # module's import block, which is outside the reviewed section.
     from itertools import islice

     single_queries = Query(self.collection_path).get_queries_of_length(1)
     queries = {ele['num']: ele['title'] for ele in single_queries}
     doc_details = GenSqaDocDetails(self.collection_path)
     cs = CollectionStats(self.collection_path)
     avdl = cs.get_avdl()
     total_terms = cs.get_total_terms()
     res = {}
     for qid, title in queries.items():
         # Single-argument print(...) parses on both Python 2 and 3.
         print(title)
         # Term-level statistics are constant for the query, so they are
         # looked up once rather than per document.
         ctf = cs.get_term_collection_occur(title)
         idf = cs.get_term_logidf1(title)
         scored = []
         # islice stops pulling rows once max_docs have been taken.
         for row in islice(doc_details.get_qid_details(qid), max_docs):
             docid = row['docid']
             total_tf = float(row['total_tf'])
             doc_len = float(row['doc_len'])
             # Deep copy keeps the shared ``paras`` untouched even if the
             # callback mutates its argument.
             localpara = copy.deepcopy(paras)
             localpara.extend([total_tf, doc_len, avdl, ctf, total_terms, idf])
             scored.append((docid, _callback(localpara)))
         res[qid] = scored
     self.output_results(res, method)
     self.eval(method)
Ejemplo n.º 2
0
    def print_statistics(self, methods):
        """
        Print collection statistics and the tuned parameter of each
        optimal run for the single-term queries.

        Output, in order:
          1. the average document length from ``CollectionStats.get_avdl``;
          2. the mean, over all single-term queries, of
             ``get_term_collection_occur(title) / get_total_terms()``;
          3. for every element of
             ``Performances.gen_optimal_performances_queries(methods, qids)``:
             its label, and -- when the label contains ``'okapi'`` and/or
             ``'pivoted'`` -- the parameter together with two values
             derived from it (see the inline comments below).

        Args:
            methods: Passed unchanged to
                ``gen_optimal_performances_queries``.

        Returns:
            None.  Everything is written with ``print``.

        Each result element is indexed as ``ele[0]`` (label) and ``ele[2]``
        (a string whose second ``':'``-separated field parses as a float);
        the exact structure is defined by ``Performances`` and is not
        visible here.
        """
        single_queries = Query(self.collection_path).get_queries_of_length(1)
        queries = {ele['num']: ele['title'] for ele in single_queries}
        cs = CollectionStats(self.collection_path)
        performance = Performances(self.collection_path)
        res = performance.gen_optimal_performances_queries(methods, queries.keys())

        avdl = cs.get_avdl()
        total_terms = cs.get_total_terms()
        # Relative collection frequency of each query term; the 1.0 factor
        # forces float division on Python 2.
        collection_freq = [
            cs.get_term_collection_occur(title) * 1.0 / total_terms
            for title in queries.values()
        ]
        # Single-argument print(...) parses on both Python 2 and 3.
        print(avdl)
        print(np.mean(collection_freq))

        for ele in res:
            label = ele[0]
            para = float(ele[2].split(':')[1])
            print(label)
            # The two checks are independent on purpose: a label containing
            # both substrings prints both lines, as before.
            if 'okapi' in label:
                # b = para, beta = 1.2 * b / avdl, c2 = 1.2 * (1 - b)
                fields = ('b:', para, 'beta:', 1.2*para/avdl, 'c2:', 1.2*(1-para))
                print(' '.join(str(field) for field in fields))
            if 'pivoted' in label:
                # s = para, beta = s / avdl, c2 = 1 - s
                fields = ('s:', para, 'beta:', para/avdl, 'c2:', 1-para)
                print(' '.join(str(field) for field in fields))