Example #1
0
    def print_statistics(self, methods):
        """
        Print collection statistics and the optimal parameter of each
        method, restricted to the single-term queries of the collection.

        Output, in order:
          * the average document length (``cs.get_avdl()``);
          * the mean over all single-term queries of
            ``collection occurrences / total terms``;
          * for every entry returned by
            ``Performances.gen_optimal_performances_queries``: its label and,
            when the label contains ``'okapi'`` and/or ``'pivoted'``, the
            parameter together with the derived ``beta`` and ``c2`` values.

        :param methods: forwarded unchanged to
            ``gen_optimal_performances_queries``.
        :return: None; everything is written to stdout.
        """
        def _emit(*fields):
            # Same text the Python 2 ``print a, b, c`` statement produced:
            # str() of every field, separated by single spaces.
            print(' '.join(str(field) for field in fields))

        single_queries = Query(self.collection_path).get_queries_of_length(1)
        queries = {ele['num']: ele['title'] for ele in single_queries}
        cs = CollectionStats(self.collection_path)
        performance = Performances(self.collection_path)
        # list(queries) holds the same keys as queries.keys() and is a real
        # list on both Python 2 and Python 3.
        res = performance.gen_optimal_performances_queries(methods, list(queries))

        avdl = cs.get_avdl()
        total_terms = cs.get_total_terms()
        # Relative collection frequency of every query term.  The former
        # per-query log-idf lookup was dropped: its result was never used.
        collection_freq = [
            cs.get_term_collection_occur(term) * 1.0 / total_terms
            for term in queries.values()
        ]
        print(avdl)
        print(np.mean(collection_freq))

        for ele in res:
            label = ele[0]
            # ele[2] is a "<name>:<value>" string; only the value is needed.
            para = float(ele[2].split(':')[1])
            print(label)
            # Two independent checks: a label containing both substrings
            # gets both lines printed.
            if 'okapi' in label:
                # 1.2 is hard-coded here (presumably Okapi's k1 -- confirm).
                _emit('b:', para, 'beta:', 1.2 * para / avdl, 'c2:', 1.2 * (1 - para))
            if 'pivoted' in label:
                _emit('s:', para, 'beta:', para / avdl, 'c2:', 1 - para)
 def gen_ranking_list(self, method, _callback, paras):
     """
     Score documents for every single-term query with ``_callback`` and
     hand the resulting ranking lists to ``output_results`` / ``eval``.

     Per the original note, the rows come from
     /collection_path/detailed_doc_stats/ and describe the top 10,000
     documents of each query as ranked by the Dirichlet language model
     method.  Only the first 1000 rows of each query are scored here.

     For each row ``_callback`` receives a deep copy of ``paras`` extended
     with ``[total_tf, doc_len, avdl, ctf, total_terms, idf]``.

     :param method: passed on to ``self.output_results`` and ``self.eval``.
     :param _callback: callable taking the extended parameter list and
         returning the score of one document.
     :param paras: leading parameters for ``_callback``; never mutated,
         because a deep copy is made for every row.
     """
     one_term_queries = Query(self.collection_path).get_queries_of_length(1)
     queries = dict((q['num'], q['title']) for q in one_term_queries)
     doc_details = GenSqaDocDetails(self.collection_path)
     cs = CollectionStats(self.collection_path)
     avdl = cs.get_avdl()
     total_terms = cs.get_total_terms()

     rankings = {}
     for qid in queries:
         term = queries[qid]
         print(term)
         scored = rankings[qid] = []
         # Term-level statistics are the same for every row of this query.
         ctf = cs.get_term_collection_occur(term)
         idf = cs.get_term_logidf1(term)
         for seen, row in enumerate(doc_details.get_qid_details(qid), 1):
             docid = row['docid']
             total_tf = float(row['total_tf'])
             doc_len = float(row['doc_len'])
             call_args = copy.deepcopy(paras)
             call_args.extend([total_tf, doc_len, avdl, ctf, total_terms, idf])
             scored.append((docid, _callback(call_args)))
             # Stop once 1000 rows of this query have been scored.
             if seen >= 1000:
                 break
     self.output_results(rankings, method)
     self.eval(method)
    def process(self, qid, method_name, method_paras, output_fn):
        """
        Learn the best parameter of one method for a single query and dump
        the outcome to ``output_fn`` as JSON.

        The documents listed by ``CollectionStats.get_qid_details(qid)`` are
        split into relevant (``rel_score >= 1``) and non-relevant ones, then
        passed to ``self.learn`` together with the collection statistics.

        Side effect: ``self.rel_docs`` is replaced by the relevant documents
        of ``qid`` as returned by ``Judgment``.

        :param qid: id of the single-term query to process.
        :param method_name: forwarded to ``self.learn``.
        :param method_paras: comma separated ``key:value`` pairs; an ``eta``
            key is required because its value is written to the output.
        :param output_fn: path of the JSON file to write; it receives the
            keys ``map``, ``para`` and ``eta``.
        """
        cs = CollectionStats(self.collection_path)
        one_term_queries = Query(self.collection_path).get_queries_of_length(1)
        queries = dict((q['num'], q['title']) for q in one_term_queries)
        self.rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries([qid], 1, 'dict')

        avdl = cs.get_avdl()
        total_terms = cs.get_total_terms()
        # Documents bucketed by relevance: True -> relevant, False -> not.
        data = {True: [], False: []}

        collection_para = {
            'avdl': avdl,
            'total_terms': total_terms,
            'ctf': cs.get_term_collection_occur(queries[qid])
        }
        for row in cs.get_qid_details(qid):
            doc = {
                'docid': row['docid'],
                'tf': float(row['total_tf']),
                'ln': float(row['doc_len'])
            }
            is_relevant = int(row['rel_score']) >= 1
            data[is_relevant].append(doc)

        # "k1:v1,k2:v2" -> {'k1': 'v1', 'k2': 'v2'}; values stay strings.
        method_para_dict = {}
        for spec in method_paras.split(','):
            fields = spec.split(':')
            method_para_dict[fields[0]] = fields[1]

        best_map, best_para = self.learn(qid, data, collection_para, method_name, method_para_dict)
        # NOTE(review): binary mode works with json.dump on Python 2 only;
        # Python 3 needs text mode ('w') here.
        with open(output_fn, 'wb') as out:
            json.dump({'map': best_map, 'para': best_para, 'eta': method_para_dict['eta']}, out, indent=2)