Code example #1
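Trains a Forest estimator on per-day query NDCG values for a chosen year, expansion, and retrieval method, reports Kendall's tau between predicted and observed NDCG, and pickles the trained forest to dest_file. All four examples on this page are Python 2 code (print statements, cPickle).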
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--year","-y",choices=list(map(int, Year)),default=0,type=int,
        help="""
            Choose the year:
                0:2015
                1:2016
                2:2011
        """)
    parser.add_argument("--tree_estimator_directory","-td",default="/infolab/node4/lukuang/2015-RTS/src/my_code/post_analysis/predictor_analysis/predictor_data/post/tree_estimator")
    parser.add_argument("--number_of_iterations","-ni",type=int,default=50)
    parser.add_argument("--error_threshold","-et",type=int,default=50)
    parser.add_argument("--expansion","-e",choices=list(map(int, Expansion)),default=0,type=int,
        help="""
            Choose the expansion:
                0:raw
                1:static
                2:dynamic
        """)
    parser.add_argument("--retrieval_method","-rm",choices=list(map(int, RetrievalMethod)),default=0,type=int,
        help="""
            Choose the retrieval method:
                0:f2exp
                1:dirichlet
                2:pivoted
                3:bm25
        """)
    parser.add_argument("dest_file")
    args=parser.parse_args()

    # if args.error_threshold >= 50:
    #     raise ValueError("Threshold cannot be greater than 50!")

    args.year = Year(args.year)
    args.retrieval_method = RetrievalMethod(args.retrieval_method)
    args.expansion = Expansion(args.expansion)


    eval_data = EvalData(args.year)
    result_dir = R_DIR[args.year][args.expansion][args.retrieval_method]
    results = read_results(result_dir,eval_data)
    query_data_file = os.path.join(args.tree_estimator_directory,args.year.name,args.expansion.name,args.retrieval_method.name)
    query_data_file = os.path.join(query_data_file,"data")
    print "get value pair %s" %(query_data_file)
    values = json.load(open(query_data_file))

    # print results

    # create query_data
    query_data = []
    ndcgs = {}
    for qid in eval_data.days:
        for day in eval_data.days[qid]:
            day_qid = "%s_%s" %(day,qid)
            # print day_qid
            # print results[day]
            if qid in results[day]:
                day_results = {qid: results[day][qid]}
                day_query_ndcg = eval_data.ndcg(day,day_results)
            else:
                day_query_ndcg = .0
            ndcgs[day_qid] = day_query_ndcg
            
            single_data = {}
            single_data["day_qid"] = day_qid
            single_data["ndcg"] = day_query_ndcg
            single_data["values"] = values[day][qid]
            query_data.append(single_data)

    # print ndcgs

    forest = Forest(query_data,args.error_threshold,args.number_of_iterations)
    forest.start_training()

    predicted_values = forest.output_result(query_data)

    # print predicted_values
    kt = evaluate_kt(ndcgs,predicted_values)
    print kt
    # print "The predicted kendall's tau is %f" %(kt)

    with open(args.dest_file,'wb') as f:
        cPickle.dump(forest, f, protocol=cPickle.HIGHEST_PROTOCOL)
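The `Year`, `Expansion`, and `RetrievalMethod` types used above come from the surrounding project. The way they are used (`list(map(int, Year))` for the choices, `Year(args.year)` to convert back, `args.year.name` to build paths) implies they are `IntEnum` subclasses (under Python 2, which these examples target, this needs the enum34 backport). A minimal sketch of compatible definitions, assuming the numbering in the help strings; the member names are illustrative, and `Year` would be defined the same way, except that its members cannot be the bare digits shown in the help text, since Python identifiers cannot start with a digit:

from enum import IntEnum

# Hypothetical definitions inferred from the argument parser above;
# the project's actual member names may differ.
class Expansion(IntEnum):
    raw = 0
    static = 1
    dynamic = 2

class RetrievalMethod(IntEnum):
    f2exp = 0
    dirichlet = 1
    pivoted = 2
    bm25 = 3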
Code example #2
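Runs 10-fold stratified cross-validation of a Forest estimator for silent-query detection on the disk4-5 data, scoring each fold with either ROC AUC (--use_auc) or the best attainable F1.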
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--tree_estimator_directory","-td",default="/infolab/node4/lukuang/2015-RTS/src/my_code/post_analysis/predictor_analysis/disk4-5/predictor_data/post/tree_estimator")
    parser.add_argument("--number_of_iterations","-ni",type=int,default=50)
    parser.add_argument("--error_threshold","-et",type=int,default=30)
    parser.add_argument("--silent_query_info_file","-sf",default="/infolab/node4/lukuang/2015-RTS/disk4-5/eval/silent_query_info")
    parser.add_argument("--retrieval_method","-rm",choices=list(map(int, RetrievalMethod)),default=0,type=int,
        help="""
            Choose the retrieval method:
                0:f2exp
                1:dirichlet
                2:pivoted
                3:bm25
        """)
    parser.add_argument("--use_auc","-ua",action="store_true")
    parser.add_argument("--metric_string","-ms",default="P_10")
    args=parser.parse_args()

    index_type = IndexType.processed
    eval_data = EvalData(index_type,args.metric_string)
    args.retrieval_method = RetrievalMethod(args.retrieval_method)
    result_dir = R_DIR[index_type][args.retrieval_method]
    print "result dir %s" %(result_dir)
    result_files = get_result_files(result_dir)
    query_data_file = os.path.join(args.tree_estimator_directory,index_type.name,args.retrieval_method.name)
    query_data_file = os.path.join(query_data_file,"data")
    print "get value pair %s" %(query_data_file)
    values = json.load(open(query_data_file))

    all_metrics = {}
    for day in values:
        all_metrics[day] =  eval_data.get_metric(result_files[day])


    silent_query_info = json.load(open(args.silent_query_info_file))

    # print all_metrics
    query_data = []
    silent_judgments = []
    silent_days = {}
    day = "10"
    silent_list = {}
    for qid in values.values()[0].keys():
        # m = re.search("^(\d+)_",qid)
        # if m:
        #     q_num = int(m.group(1))
        #     if q_num > 650:
        #         continue
        # else:
        #     raise RuntimeError("Mal qid format %s" %(qid))
        day_qid = "10_%s" %(qid)
        # print day_qid
        # print results[day]

        if qid in all_metrics[day]:
            day_query_metric = all_metrics[day][qid]

            m = re.search("^(\d+)_",qid)
            if m:
                q_num = m.group(1)
            else:
                raise RuntimeError("Mal qid format %s" %(qid))
            
            if q_num in silent_query_info:
                silent_days[day_qid] = 1
                silent_judgments.append(1)
            else:
                if day_query_metric == .0:
                    silent_list[q_num] = 0
                silent_judgments.append(0)
                silent_days[day_qid] = 0
        
        else: 
            day_query_metric = .0
            silent_judgments.append(1)
            silent_days[day_qid] = 1

        single_data = {}
        single_data["day_qid"] = day_qid
        single_data["metric"] = day_query_metric
        single_data["values"] = values[day][qid]
        query_data.append(single_data)

    print "There are %d queries" %(len(query_data))
    print "%d of them are silent" %(sum(silent_judgments))
    print "There are %d queries with silent list" %(len(silent_list))
 
    skf = StratifiedKFold(n_splits=10)
    eval_metrics = []
    for training_index, test_index in skf.split(query_data, silent_judgments):
        training_data = []
        testing_data = []
        metrics = {}
        # print "%d training %d testing" %(len(training_index),len(test_index))
        for i in training_index:
            training_data.append( deepcopy(query_data[i]))

        for j in test_index:
            testing_data.append( deepcopy(query_data[j]))
            day_qid = query_data[j]["day_qid"]
            metrics[day_qid] = query_data[j]["metric"]

        # print training_data
        forest = Forest(training_data,args.error_threshold,args.number_of_iterations)
        forest.start_training()

        predicted_values = forest.output_result(testing_data)
        y_true, y_score = make_score_prediction_lists(predicted_values,silent_days)

        if args.use_auc:
            reversed_score = []
            for i in y_score:
                reversed_score.append(-1*i)
            score = roc_auc_score(y_true, reversed_score)
            print "the auc score is %f"  %(score)
            eval_metrics.append(score)
        else:
            best_f1_score = best_f1(y_true, y_score)
            print "the best f1 score is %f" %(best_f1_score)
            eval_metrics.append(best_f1_score)


    print "Average performance: %f" %(sum(eval_metrics)/(1.0*len(eval_metrics)))
Code example #3
File: new_two_stage.py  Project: lukuang/2016-rts
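Evaluates a two-stage silent-query detector: pre-trained per-query-part trees (loaded with load_tree) first filter out queries whose predicted score exceeds a tuned cut-off, then a Forest trained on the remaining queries scores the rest; positive and macro-averaged F1 are reported over a shuffled stratified 10-fold split.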
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--tree_estimator_directory","-td",default="/infolab/node4/lukuang/2015-RTS/src/my_code/post_analysis/predictor_analysis/disk4-5/predictor_data/post/tree_estimator")
    parser.add_argument("--number_of_iterations","-ni",type=int,default=50)
    parser.add_argument("--error_threshold","-et",type=int,default=30)
    parser.add_argument("--silent_query_info_file","-sf",default="/infolab/node4/lukuang/2015-RTS/disk4-5/eval/silent_query_info")
    parser.add_argument("--retrieval_method","-rm",choices=list(map(int, RetrievalMethod)),default=0,type=int,
        help="""
            Choose the retrieval method:
                0:f2exp
                1:dirichlet
                2:pivoted
                3:bm25
        """)
    parser.add_argument("--use_auc","-ua",action="store_true")
    parser.add_argument("--title_only","-to",action="store_true")
    parser.add_argument("--metric_string","-ms",default="P_10")
    parser.add_argument("tree_store_dir")
    args=parser.parse_args()

    index_type = IndexType.processed
    eval_data = EvalData(index_type,args.metric_string)
    args.retrieval_method = RetrievalMethod(args.retrieval_method)
    result_dir = R_DIR[index_type][args.retrieval_method]
    print "result dir %s" %(result_dir)
    result_files = get_result_files(result_dir)
    query_data_file = os.path.join(args.tree_estimator_directory,index_type.name,args.retrieval_method.name)
    query_data_file = os.path.join(query_data_file,"data")
    print "get value pair %s" %(query_data_file)
    values = json.load(open(query_data_file))

    all_metrics = {}
    for day in values:
        all_metrics[day] =  eval_data.get_metric(result_files[day])


    silent_query_info = json.load(open(args.silent_query_info_file))
    # print all_metrics
    title_query_data = []
    desc_query_data = []
    query_data = []
    silent_judgments = []

    silent_days = {}
    day = "10"
    for qid in values.values()[0].keys():
        # m = re.search("^(\d+)_",qid)
        # if m:
        #     q_num = int(m.group(1))
        #     if q_num > 650:
        #         continue
        # else:
        #     raise RuntimeError("Mal qid format %s" %(qid))
        day_qid = "10_%s" %(qid)
        # print day_qid
        # print results[day]
        if args.title_only:
            if "title" not in qid:
                continue
        if qid in all_metrics[day]:
            day_query_metric = all_metrics[day][qid]

            m = re.search("^(\d+)_",qid)
            if m:
                q_num = m.group(1)
            else:
                raise RuntimeError("Mal qid format %s" %(qid))
            
            if q_num in silent_query_info:
                silent_days[day_qid] = 1
            else:
                silent_days[day_qid] = 0
        
        else: 
            print "%s query has no metric!" %(qid)
            day_query_metric = .0
            silent_days[day_qid] = 1

        single_data = {}
        single_data["day_qid"] = day_qid
        single_data["metric"] = day_query_metric
        single_data["values"] = values[day][qid]

        if "title" in qid:
            title_query_data.append(single_data)
        else:
            desc_query_data.append(single_data)

        query_data.append(single_data)
        silent_judgments.append( silent_days[day_qid] )
        
    title_tree = load_tree(args.tree_store_dir,QueryPart.title,args.retrieval_method,args.metric_string)
    title_predicted = title_tree.output_result(title_query_data)
    if not args.title_only:
        desc_tree = load_tree(args.tree_store_dir,QueryPart.desc,args.retrieval_method,args.metric_string)
        desc_predicted = desc_tree.output_result(desc_query_data)


    # print "There are %d queries" %(len(query_data))
    # print "%d of them are silent" %(sum(silent_judgments))


    print "There are %d samples" %(len(query_data))
    # print thresholds
        
    num_of_split = 10
    f1_macro_average = .0
    f1_average = .0
    skf = StratifiedKFold(n_splits=num_of_split,shuffle=True)
    for training_index, test_index in skf.split(query_data, silent_judgments):
        all_training_data = []
        training_title_query_data = []
        training_desc_query_data = []


        # print "%d training %d testing" %(len(training_index),len(test_index))
        for i in training_index:
            single_data = deepcopy(query_data[i])
            day_qid = single_data["day_qid"]
                    
            all_training_data.append(single_data )
            if "title" in day_qid:
                training_title_query_data.append(single_data)
            else:
                if not args.title_only:
                    training_desc_query_data.append(single_data)
        
        train_title_predicted = title_tree.output_result(training_title_query_data)
        if not args.title_only:
            train_desc_predicted = desc_tree.output_result(training_desc_query_data)
        else:
            train_desc_predicted = {0:0}
        thresholds = get_threshold(train_title_predicted.values(),train_desc_predicted.values(),args.title_only)
        best_tree_threshold = {}
        best_f1_score = -1000
        best_f1_threshold = .0
        for threshold in thresholds:
            sub_training_data = []
            training_pre_y_true = []
            training_pre_y_score = []
            for single_data in all_training_data:
                day_qid = single_data["day_qid"]
                if "title" in day_qid:
                    if title_predicted[day_qid] <= threshold["title"]:
                        sub_training_data.append(single_data)
                    else:
                        training_pre_y_score.append(1000)
                        training_pre_y_true.append(silent_days[day_qid])
                else:
                    if not args.title_only:
                        if desc_predicted[day_qid] <= threshold["desc"]:
                            sub_training_data.append(single_data)
                        else:
                            training_pre_y_score.append(1000)
                            training_pre_y_true.append(silent_days[day_qid])


            forest = Forest(sub_training_data,args.error_threshold,args.number_of_iterations)
            forest.start_training()

            training_predicted_values = forest.output_result(sub_training_data)
            training_y_true, training_y_score = make_score_prediction_lists(training_predicted_values,silent_days)
            training_y_true  = training_pre_y_true + training_y_true
            training_y_score  = training_pre_y_score + training_y_score
            threshold_best_f1_threshold, threshold_best_f1_score = get_best_f1_threshold(training_y_true, training_y_score)
            if threshold_best_f1_score > best_f1_score:
                best_tree_threshold = threshold
                best_f1_score = threshold_best_f1_score
                best_f1_threshold = threshold_best_f1_threshold
        
        print "best f1 threshold:%f, best f1 %f:" %(best_f1_threshold,best_f1_score)
        print best_tree_threshold

        testing_data = []
        testing_pre_y_true = []
        testing_pre_y_score = []

        for j in test_index:
            single_data = deepcopy(query_data[j])
            day_qid = single_data["day_qid"]
                    

            if "title" in day_qid:
                if title_predicted[day_qid] <= best_tree_threshold["title"]:
                    testing_data.append(single_data)
                else:
                    testing_pre_y_score.append(1000)
                    testing_pre_y_true.append(silent_days[day_qid])
            else:
                if not args.title_only:
                    if desc_predicted[day_qid] <= best_tree_threshold["desc"]:
                        testing_data.append(single_data)
                    else:
                        testing_pre_y_score.append(1000)
                        testing_pre_y_true.append(silent_days[day_qid])

        # test_forest = Forest(testing_data,args.error_threshold,args.number_of_iterations)
        # test_forest.start_training()

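        # NOTE: `forest` below is the model left over from the threshold search
        # (trained at the last threshold tried, not necessarily the best one);
        # the commented-out lines above would instead retrain on the test split.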
        test_predicted_values = forest.output_result(testing_data)
        testing_y_true, testing_y_score = make_score_prediction_lists(test_predicted_values,silent_days)
        testing_y_true  = testing_pre_y_true + testing_y_true
        testing_y_score  = testing_pre_y_score + testing_y_score
        test_y_predict = []
        for single_score in testing_y_score:
            if single_score < best_f1_threshold:
                test_y_predict.append(1)
            else:
                test_y_predict.append(0)
        f1_macro_average += f1(testing_y_true, test_y_predict,average="macro")/(1.0*num_of_split)
        f1_average += f1(testing_y_true, test_y_predict)/(1.0*num_of_split)

    print "Positive f1: %f" %(f1_average)
    print "Average f1: %f" %(f1_macro_average)
    print "-"*20
Code example #4
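Cross-validates a Forest estimator with 4-fold KFold using Kendall's tau, then retrains on all of the query data and pickles the final forest under dest_dir.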
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--index_type","-it",choices=list(map(int, IndexType)),default=0,type=int,
        help="""
            Choose the index type:
                0:full
                1:processed
        """)
    parser.add_argument("--query_part","-qp",choices=list(map(int, QueryPart)),default=0,type=int,
        help="""
            Choose the query part:
                0:title
                1:desc
        """)
    parser.add_argument("--tree_estimator_directory","-td",default="/infolab/node4/lukuang/2015-RTS/src/my_code/post_analysis/predictor_analysis/disk4-5/predictor_data/post/tree_estimator")
    parser.add_argument("--number_of_iterations","-ni",type=int,default=50)
    parser.add_argument("--error_threshold","-et",type=int,default=30)
    parser.add_argument("--retrieval_method","-rm",choices=list(map(int, RetrievalMethod)),default=0,type=int,
        help="""
            Choose the retrieval method:
                0:f2exp
                1:dirichlet
                2:pivoted
                3:bm25
        """)
    parser.add_argument("dest_dir")
    parser.add_argument("--metric_string","-ms",default="P_10")
    args=parser.parse_args()

    # if args.error_threshold >= 50:
    #     raise ValueError("Threshold cannot be greater than 50!")

    args.index_type = IndexType(args.index_type)
    args.query_part = QueryPart(args.query_part)
    eval_data = EvalData(args.index_type,args.metric_string)
    args.retrieval_method = RetrievalMethod(args.retrieval_method)
    result_dir = R_DIR[args.index_type][args.retrieval_method]
    print "result dir %s" %(result_dir)
    result_files = get_result_files(result_dir)
    query_data_file = os.path.join(args.tree_estimator_directory,args.index_type.name,args.retrieval_method.name)
    query_data_file = os.path.join(query_data_file,"data")
    print "get value pair %s" %(query_data_file)
    values = json.load(open(query_data_file))

    all_metrics = {}
    for day in values:
        all_metrics[day] =  eval_data.get_metric(result_files[day])

    #load silent day

    # create query_data
    query_data = []
    day = "10"
    for qid in values.values()[0].keys():
        # m = re.search("^(\d+)_",qid)
        # if m:
        #     q_num = int(m.group(1))
        #     if q_num > 650:
        #         continue
        # else:
        #     raise RuntimeError("Mal qid format %s" %(qid))
        day_qid = "10_%s" %(qid)
        if args.query_part.name not in qid:
            continue
        # print day_qid
        # print results[day]

        if qid in all_metrics[day]:
            day_query_metric = all_metrics[day][qid]
        else:
            print "WARNING: %s metric not found!" %(qid)
            day_query_metric = .0
        
        single_data = {}
        single_data["day_qid"] = day_qid
        single_data["metric"] = day_query_metric
        single_data["values"] = values[day][qid]
        query_data.append(single_data)

    # print metrics
    print "There are %d queries" %(len(query_data))
    kf = KFold(n_splits=4,shuffle=True)
    kt = []

    for training_index, test_index in kf.split(query_data):

        training_data = []
        testing_data = []
        metrics = {}
        # print "%d training %d testing" %(len(training_index),len(test_index))
        for i in training_index:
            training_data.append( deepcopy(query_data[i]))

        for j in test_index:
            testing_data.append( deepcopy(query_data[j]))
            day_qid = query_data[j]["day_qid"]
            metrics[day_qid] = query_data[j]["metric"]

        # print training_data
        forest = Forest(training_data,args.error_threshold,args.number_of_iterations)
        forest.start_training()

        predicted_values = forest.output_result(testing_data)

        # print predicted_values
        # print metrics
        single_kt = evaluate_kt(metrics,predicted_values)
        print single_kt
        # print single_kt[0]
        kt.append(single_kt[0])

    print "The average kendall's tau is %f" %(sum(kt)/(1.0*len(kt)))


    forest = Forest(query_data,args.error_threshold,args.number_of_iterations)
    forest.start_training()

    dest_file = os.path.join(args.dest_dir,args.query_part.name,args.retrieval_method.name+"_"+args.metric_string)

    print "Store to %s" %(dest_file)
    with open(dest_file,'wb') as f:
        cPickle.dump(forest, f, protocol=cPickle.HIGHEST_PROTOCOL)
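The forest is stored under dest_dir/<query_part>/<retrieval_method>_<metric_string>, which matches the `load_tree` call in example #3. A plausible sketch of the loading side, assuming that same layout (hypothetical reconstruction, not the project's code):

import os
import cPickle

def load_tree(tree_store_dir, query_part, retrieval_method, metric_string):
    # Rebuild the path written by the dump in example #4 and unpickle the Forest.
    tree_file = os.path.join(tree_store_dir, query_part.name,
                             retrieval_method.name + "_" + metric_string)
    with open(tree_file, 'rb') as f:
        return cPickle.load(f)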