    def process(self, qid, method_name, method_paras, output_fn):
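        """
        Tune `method_name` on a single-term query and record the best result.

        Collects per-document TF and document-length features for `qid`,
        splits them by relevance judgment, runs self.learn() with the
        parameters parsed from `method_paras`, and dumps the best MAP and
        parameter setting (plus the fixed 'eta' value) to `output_fn` as JSON.

        NOTE: this listing assumes module-level imports of json, math, os,
        numpy as np, and operator.itemgetter elsewhere in the file.
        """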
        cs = CollectionStats(self.collection_path)
        single_queries = Query(self.collection_path).get_queries_of_length(1)
        queries = {ele['num']:ele['title'] for ele in single_queries}
        self.rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries([qid], 1, 'dict')
        
        avdl = cs.get_avdl()
        total_terms = cs.get_total_terms()
        data = {True: [], False: []} # False: non-relevant  True: relevant

        ctf = cs.get_term_collection_occur(queries[qid])
        collection_para = {
            'avdl': avdl, 
            'total_terms': total_terms,
            'ctf': ctf
        }
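        # Walk the per-document index details for this query and bucket each
        # document's raw term frequency and length by its binary relevance label.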
        for row in cs.get_qid_details(qid):
            docid = row['docid']
            total_tf = float(row['total_tf'])
            doc_len = float(row['doc_len'])
            rel_score = int(row['rel_score'])
            rel = (rel_score>=1)
            data[rel].append( {
                'docid': docid,
                'tf': total_tf, 
                'ln': doc_len
            } )
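        # method_paras is a comma-separated "name:value" string, e.g. "eta:0.5"
        # (an 'eta' entry is expected when the result is written below).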
        method_para_dict = {ele.split(':')[0]:ele.split(':')[1] for ele in method_paras.split(',')}
        max_map, max_para = self.learn(qid, data, collection_para, method_name, method_para_dict)
        with open(output_fn, 'wb') as f:
            json.dump({'map':max_map, 'para':max_para, 'eta':method_para_dict['eta']}, f, indent=2)
    def output_data_file(self):
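        """
        Export per-query document features in SVM-light format.

        Writes one data file per single-term query under self.svm_data_root
        and an index file listing those paths.  Each data line is:

            <rel_score> qid:<qid> 1:<total_tf> 2:<doc_len>
        """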
        cs = CollectionStats(self.collection_path)
        single_queries = Query(self.collection_path).get_queries_of_length(1)
        queries = {ele['num']:ele['title'] for ele in single_queries}
        with open(os.path.join(self.collection_path, 'svm_data_index_file'), 'wb') as indexf:
            for qid in queries:
                data_fn = os.path.join(self.svm_data_root, qid)
                indexf.write('%s\n' % (data_fn))
                with open(data_fn, 'wb') as f:
                    for row in cs.get_qid_details(qid):
                        docid = row['docid']
                        total_tf = float(row['total_tf'])
                        doc_len = float(row['doc_len'])
                        rel_score = int(row['rel_score'])
                        f.write('%d qid:%s 1:%f 2:%f\n' % (rel_score, qid, total_tf, doc_len))
    def gen_perfect_ranking_list(self, plotbins=True, numbins=60):
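        """
        Build an upper-bound ("perfect") ranking list for each single-term
        query: score documents by a normalized log-TF formula, bucket the
        scores into `numbins` equal-width bins, reorder the bins by their
        fraction of relevant documents, and keep the top 1000 documents per
        query before writing and evaluating the run.

        Note: `plotbins` is currently unused; binning is always applied.
        """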
        cs = CollectionStats(self.collection_path)
        single_queries = Query(self.collection_path).get_queries_of_length(1)
        queries = {ele['num']:ele['title'] for ele in single_queries}
        rel_docs = Judgment(self.collection_path).get_relevant_docs_of_some_queries(queries.keys(), 1, 'dict')
        idfs = [(qid, math.log(cs.get_term_IDF1(queries[qid]))) for qid in rel_docs]
        idfs.sort(key=itemgetter(1))
        res = {}
        for qid,idf in idfs:
            x_dict = {}
            res[qid] = []
            score_mapping = {}
            maxScore = -99999999
            for row in cs.get_qid_details(qid):
                docid = row['docid']
                total_tf = float(row['total_tf'])
                doc_len = float(row['doc_len'])
                rel_score = int(row['rel_score'])
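                # Score by normalized log-TF: log(tf+1) / (log(tf+1) + log(doclen)).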
                score = math.log(total_tf+1.0)/(math.log(total_tf+1.0)+math.log(doc_len))
                score_mapping[docid] = score
                if score > maxScore:
                    maxScore = score
                rel = (rel_score>=1)
                if score not in x_dict:
                    x_dict[score] = [0, 0, [docid, score, rel, len(rel_docs[qid])]] # [rel_count, total_count, doc_info]
                if rel:
                    x_dict[score][0] += 1
                x_dict[score][1] += 1

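            # Re-bucket the raw scores into `numbins` equal-width bins over
            # [0, maxScore]; each bin accumulates [rel_count, total_count,
            # per-doc info] for the scores that fall into it.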
            interval = maxScore*1.0/numbins
            newxaxis = [i for i in np.arange(0, maxScore+1e-10, interval)]
            newyaxis = [[0.0, 0.0, []] for x in newxaxis]
            for x in x_dict:
                newx = int(x / interval)
                newyaxis[newx][0] += x_dict[x][0]
                newyaxis[newx][1] += x_dict[x][1]
                newyaxis[newx][2].append( x_dict[x][2] )
            yaxis = [(ele[0]*1.0/ele[1], ele[2]) if ele[1] != 0 else (0, []) for ele in newyaxis]
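            # Order the bins by precision (fraction of relevant documents),
            # descending: this gives the upper-bound "perfect" ranking.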
            yaxis.sort(key=itemgetter(0), reverse=True)
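            # Emit up to 1000 docs per query.  sbase decreases by 100 per
            # document, so output scores strictly follow bin order while the
            # raw score is kept as a small tie-breaking offset.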
            sbase = 1e9
            for ele in yaxis:
                for doc in ele[1]:
                    docid = doc[0]  
                    if len(res[qid]) < 1000:     
                        res[qid].append((docid, sbase+score_mapping[docid]))
                    sbase -= 100

        method = 'hypothesis_stq_tf_ln_upperbound'
        self.output_results(res, method)
        self.eval(method)
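# A minimal driver sketch.  The enclosing class and its constructor are not
# shown in this listing, so the names and values below are assumptions for
# illustration only:
#
#     runner = TfLnHypothesis(collection_path)      # hypothetical class name
#     runner.output_data_file()
#     runner.gen_perfect_ranking_list(numbins=60)
#     runner.process(qid='301', method_name='some_method',  # hypothetical values
#                    method_paras='eta:0.5', output_fn='301.json')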