Example #1
def run_chosen_model_for_stats(chosen_models,
                               method,
                               qrels_file,
                               feature_file,
                               doc_name_index,
                               seo_scores,
                               base_features_file,
                               ref_index,
                               beta=""):
    chosen_model_parameter = chosen_models[method]
    svm = svm_handler()
    model_file = svm.learn_svm_rank_model(base_features_file, method,
                                          chosen_model_parameter)
    # evaluator configured with the standard TREC metrics: MAP, NDCG, P@2 and P@5
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, method)

    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results,
                                      method + "_" + ref_index)
    final_trec_file = evaluator.order_trec_file(trec_file)
    increase_stats = get_average_query_rank_promotion(seo_scores,
                                                      final_trec_file)
    similarities = read_similarity_file(
        "/home/greg/auto_seo/scripts/similarities_file")
    add = ""
    if beta:
        add = "_" "_" + str(beta)
    table_name = "summary_corr_" + method + str(ref_index) + add + ".text"
    create_correlation_for_different_ranks(similarities, increase_stats,
                                           table_name, ref_index)
    return table_name
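
A hypothetical call to the variant above, shown only to illustrate how the arguments fit together; every path, mapping and parameter value below is a placeholder assumed for illustration and is not taken from the original project.

# All values are illustrative placeholders (assumptions, not project data).
chosen_models = {"svm_rank": 0.01}        # method -> parameter picked by choose_model (assumed)
doc_name_index = {0: "ROUND-01-001-01",   # feature-row index -> document name (assumed)
                  1: "ROUND-01-001-02"}
seo_scores = {}                           # promotion scores in whatever form the helpers expect

table_name = run_chosen_model_for_stats(chosen_models,
                                        "svm_rank",        # method
                                        "qrels_seo",       # qrels_file (unused by this variant)
                                        "features_test",   # feature_file
                                        doc_name_index,
                                        seo_scores,
                                        "features_train",  # base_features_file
                                        "2",               # ref_index (a string; it is concatenated)
                                        beta=0.5)
print("correlation summary written to", table_name)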
Example #2
def run_chosen_model_for_stats(chosen_models,
                               method,
                               qrels_file,
                               feature_file,
                               doc_name_index,
                               seo_scores,
                               base_features_file,
                               ref_index,
                               beta=""):
    chosen_model_parameter = chosen_models[method]
    svm = svm_handler()
    model_file = svm.learn_svm_rank_model(base_features_file, method,
                                          chosen_model_parameter)
    # evaluator configured with the standard TREC metrics: MAP, NDCG, P@2 and P@5
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, method)

    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results,
                                      method + "_" + ref_index)
    final_trec_file = evaluator.order_trec_file(trec_file)
    increase_stats = get_average_score_increase(seo_scores, final_trec_file)
    add = ""
    if beta:
        add = "_" + str(beta)
    summary_file = method + "_" + str(ref_index) + add + ".tex"
    evaluator.run_trec_eval_on_test(qrels_file, summary_file,
                                    method + "_" + ref_index, None,
                                    increase_stats)
    return summary_file
Example #3
def choose_model(features_file, qrels_file, label_method, beta=""):
    number_of_folds = 5
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    metrics = ["map", "ndcg", "P.2", "P.5"]
    evaluator = e.eval(metrics)
    evaluator.create_index_to_doc_name_dict(features_file)
    evaluator.remove_score_file_from_last_run("svm_rank")
    folds = preprocess.create_folds(X, y, queries, number_of_folds)
    fold_number = 1
    C = [0.1, 0.01, 0.001]
    model_handler = s.svm_handler()
    evaluator.empty_validation_files("svm_rank")
    trecs = []
    for train, test in folds:
        # model_handler.set_queries_to_folds(queries, test, fold_number)
        train_file = preprocess.create_train_file(X[train], y[train],
                                                  queries[train], fold_number,
                                                  "svm_rank")
        test_file = preprocess.create_train_file(X[test], y[test],
                                                 queries[test], fold_number,
                                                 "svm_rank", True)
        for c_value in C:
            model_file = model_handler.learn_svm_rank_model(
                train_file, fold_number, c_value)
            model_name = os.path.basename(model_file).replace(".txt", "")
            scores_file = model_handler.run_svm_rank_model(
                test_file, model_file, fold_number)
            results = model_handler.retrieve_scores(test, scores_file)
            trec_file = evaluator.create_trec_eval_file(test_indices=test,
                                                        queries=queries,
                                                        results=results,
                                                        model=model_name,
                                                        method="svm_rank",
                                                        fold=0,
                                                        validation=True)
            trecs.append(trec_file)
            trecs = list(set(trecs))
        fold_number += 1
    scores = {}
    for trec_file in trecs:
        print("working on ", trec_file)
        score = evaluator.run_trec_eval(trec_file, qrels_file)
        model = os.path.basename(trec_file)
        scores[model] = score

    sorted_models = sorted(list(scores.keys()),
                           key=lambda x: scores[x],
                           reverse=True)
    for file in sorted_models:
        print(file, scores[file])
    f = open("chosen_models_" + label_method, "w")
    add = ""
    if beta:
        add = "_" + beta
    f.write(label_method + add + " " + sorted_models[0] + "\n")
    f.close()
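
The file written at the end of choose_model presumably feeds the chosen_models dictionary consumed by the run_chosen_model_for_stats variants. A possible reader is sketched below; it assumes one "key model_name" pair per line separated by a single space, which is an assumption since the original loader is not shown.

def load_chosen_models(path):
    # Hypothetical helper (not part of the project): parse "key model" lines
    # into the dict expected by run_chosen_model_for_stats.
    chosen = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            key, model_name = line.split(" ", 1)
            chosen[key] = model_name
    return chosen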
Example #4
def run_svm_model(feature_file, model_file, doc_name_index, query, ref_doc,
                  current_time):
    svm = svm_handler()
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file,
                                         query + "_" + ref_doc)
    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results,
                                      query + "_" + ref_doc, current_time,
                                      query)
    final_trec_file = evaluator.order_trec_file(trec_file)
    return final_trec_file
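
A hypothetical invocation of run_svm_model; the feature file, model path, document names and timestamp are placeholders assumed for illustration.

import datetime

doc_name_index = {0: "ROUND-01-001-01", 1: "ROUND-01-001-02"}  # assumed mapping
current_time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
final_trec_file = run_svm_model("features_001_ref",    # feature_file (placeholder)
                                "svm_model.txt",       # model_file (placeholder)
                                doc_name_index,
                                "001",                 # query id (placeholder)
                                "ROUND-01-001-01",     # ref_doc (placeholder)
                                current_time)
print("ordered TREC run written to", final_trec_file)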
Example #5
def cross_validation(features_file, qrels_file, summary_file, append_file=""):
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    print("there are ", number_of_queries, 'queries')
    evaluator = e.eval()
    evaluator.create_index_to_doc_name_dict(features_file)

    folds = preprocess.create_folds(X, y, queries, 5)
    fold_number = 1
    # C_array = [0.1, 0.01, 0.0001,1,10,100,10000]
    C_array = [0.1, 0.01, 0.0001]
    validated = set()
    scores = {}
    models = {}
    method = "svm_rank_own"
    s_handler = sv.svm_handler()
    evaluator.empty_validation_files(method)
    for train, test in folds:
        validated, validation_set, train_set = preprocess.create_validation_set(
            5, validated, set(train), number_of_queries, queries)
        print("transforming data", flush=True)
        transformed_X, transformed_y = s.RankSVM.transform_pairwise(
            X[train_set], y[train_set])
        for C in C_array:
            svm = s.RankSVM(C)
            model_file = svm.fit(transformed_X, transformed_y, fold_number, C)
            scores_file = svm.predict(X[validation_set], fold_number, C,
                                      model_file)
            results = svm.retrieve_scores(validation_set, scores_file)
            score_file = evaluator.create_trec_eval_file(
                validation_set, queries, results, str(C), method, fold_number,
                True)
            score = evaluator.run_trec_eval(score_file, qrels_file)
            scores[C] = score
            models[C] = svm
        max_C = max(scores.items(), key=operator.itemgetter(1))[0]
        print("on fold", fold_number, "chosen model:", max_C)
        chosen_model = models[max_C]
        test_scores_file = chosen_model.predict(X[test], chosen_model,
                                                fold_number)
        results = s_handler.retrieve_scores(test, test_scores_file)
        trec_file = evaluator.create_trec_eval_file(test, queries, results, "",
                                                    method, fold_number)

        fold_number += 1
    evaluator.order_trec_file(trec_file)
    run_bash_command("rm " + trec_file)
    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method)
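
Example #5 hands the training data to s.RankSVM.transform_pairwise before fitting. That implementation is not shown here; the sketch below is the classic pairwise-transform recipe for RankSVM (difference vectors labelled by relevance order, pairs restricted to the same query, classes balanced by sign flipping) and is only an assumption about what the project's version does.

import itertools
import numpy as np

def transform_pairwise_sketch(X, y):
    # Assumes y is a 1-D relevance array, or a 2-D array whose second column
    # identifies the query so that pairs are only formed within a query.
    X = np.asarray(X)
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.c_[y, np.ones(y.shape[0])]
    X_new, y_new = [], []
    for k, (i, j) in enumerate(itertools.combinations(range(X.shape[0]), 2)):
        if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
            continue  # skip tied labels and cross-query pairs
        X_new.append(X[i] - X[j])
        y_new.append(np.sign(y[i, 0] - y[j, 0]))
        if y_new[-1] != (-1) ** k:  # alternate signs to balance the classes
            y_new[-1] = -y_new[-1]
            X_new[-1] = -X_new[-1]
    return np.asarray(X_new), np.asarray(y_new).ravel()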
Example #6
def run_chosen_model_for_stats(chosen_models, method, feature_file,
                               doc_name_index, base_features_file, beta):
    key = method
    if beta:
        key += "_" + beta

    chosen_model_parameter = chosen_models[key]
    svm = svm_handler()
    model_file = svm.learn_svm_rank_model(base_features_file, method,
                                          chosen_model_parameter)
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, method)

    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results, method)
    final_trec_file = evaluator.order_trec_file(trec_file)
    return final_trec_file
Example #7
def cross_validation(features_file,
                     qrels_file,
                     summary_file,
                     method,
                     metrics,
                     append_file="",
                     seo_scores=False,
                     run_random_for_significance=None):
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    print("there are ", number_of_queries, 'queries')
    evaluator = e.eval(metrics)
    evaluator.create_index_to_doc_name_dict(features_file)

    folds = preprocess.create_folds(X, y, queries, 5)
    fold_number = 1
    # C_array = [0.1, 0.01, 0.0001,1,10,100,10000]
    C_array = [0.1, 0.01, 0.0001]
    validated = set()
    scores = {}
    total_models = {}
    svm = s.svm_handler()
    evaluator.empty_validation_files(method)
    for train, test in folds:
        models = {}
        validated, validation_set, train_set = preprocess.create_validation_set(
            5, validated, set(train), number_of_queries, queries)
        train_set = sorted(list(train_set))
        validation_set = sorted(list(validation_set))
        test_set = sorted(list(test))
        train_file = preprocess.create_train_file(X[train_set], y[train_set],
                                                  queries[train_set],
                                                  fold_number, method)
        validation_file = preprocess.create_train_file(X[validation_set],
                                                       y[validation_set],
                                                       queries[validation_set],
                                                       fold_number, method,
                                                       True)
        test_file = preprocess.create_train_file_cv(X[test_set], y[test_set],
                                                    queries[test_set],
                                                    fold_number, method, True)
        # if append_file:
        #     print("appending train features")
        #     run_bash_command("cat " + append_file + " >> " + train_file)
        for C in C_array:

            model_file = svm.learn_svm_rank_model(train_file, fold_number, C)
            weights = recover_model(model_file)

            svm.w = weights
            scores_file = svm.run_svm_rank_model(validation_file, model_file,
                                                 fold_number)
            results = svm.retrieve_scores(validation_set, scores_file)
            score_file = evaluator.create_trec_eval_file(
                validation_set, queries, results, str(C), method, fold_number,
                True)
            score = evaluator.run_trec_eval(score_file, qrels_file)
            scores[C] = score
            models[C] = model_file
        max_C = max(scores.items(), key=operator.itemgetter(1))[0]
        print("on fold", fold_number, "chosen model:", max_C)
        chosen_model = models[max_C]
        total_models[fold_number] = chosen_model
        test_scores_file = svm.run_svm_rank_model(test_file, chosen_model,
                                                  fold_number)
        results = svm.retrieve_scores(test_set, test_scores_file)
        trec_file = evaluator.create_trec_eval_file(test_set, queries, results,
                                                    "", method, fold_number)
        fold_number += 1
    final_trec_file = evaluator.order_trec_file(trec_file)
    run_bash_command("rm " + trec_file)
    # sum=[]
    # for i in total_models:
    #     w = recover_model(total_models[i])
    #     print(w)
    #     if sum==[]:
    #         sum=w
    #     else:
    #         sum+=w
    #     print(sum)
    #
    # average = sum/len(total_models)
    # print(average)
    # f = open(qrels_file+"_averaged_weights.pkl","wb")
    # pickle.dump(average,f)
    # f.close()
    if seo_scores:
        increase_rank_stats, cv_firsts = get_average_score_increase(
            seo_scores, final_trec_file)
        stats, significance_data_cv = evaluator.run_trec_eval_by_query(
            qrels_file, final_trec_file)
        random_significance_data, random_firsts = run_random_for_significance(
            features_file, qrels_file, "sig_test", seo_scores=seo_scores)
        sig_signs = discover_significance_relevance(significance_data_cv,
                                                    random_significance_data)
        sig_signs = discover_significance_rank_promotior(
            cv_firsts, random_firsts, sig_signs)
    else:
        increase_rank_stats = False
        sig_signs = None

    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method, None,
                                    increase_rank_stats, sig_signs)
    del X
    del y
    del queries
    return final_trec_file
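
Both this example and the next rely on recover_model to pull the learned weight vector out of an svm_rank model file. For a linear model the standard svm_light/svm_rank format keeps the weights in sparse index:value form on the last line, with the highest feature index in the header; a reader along those lines is sketched below as an assumption, since the project's recover_model is not shown.

import numpy as np

def recover_model_sketch(model_file):
    # Assumes the standard svm_light/svm_rank linear-model layout:
    # header line 8 holds the highest feature index, and the last line holds
    # "1 idx:val idx:val ... #".
    with open(model_file) as f:
        lines = f.read().splitlines()
    num_features = int(lines[7].split()[0])
    weights = np.zeros(num_features)
    for token in lines[-1].split()[1:]:
        if token.startswith("#"):
            break  # trailing comment marker
        idx, val = token.split(":")
        weights[int(idx) - 1] = float(val)  # feature indices are 1-based
    return weights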
Example #8
def cross_validation(features_file, qrels_file, summary_file, append_file=""):
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    print("there are ", number_of_queries, 'queries')
    evaluator = e.eval()
    evaluator.create_index_to_doc_name_dict(features_file)

    folds = preprocess.create_folds(X, y, queries, 5)
    fold_number = 1
    C_array = [0.1, 0.01, 0.0001]
    # C_array = [0.1, 0.01, 0.0001,1,10,100,10000]
    validated = set()
    scores = {}
    models = {}
    method = "svm_light"
    svm = s.svm_handler()
    for train, test in folds:

        evaluator.empty_validation_files(method)
        validated, validation_set, train_set = preprocess.create_validation_set(
            5, validated, set(train), number_of_queries, queries)
        number_of_queries_in_fold = len(set(queries[train_set]))
        train_set = sorted(list(train_set))
        validation_set = sorted(list(validation_set))
        test_set = sorted(list(test))
        train_file = preprocess.create_train_file(X[train_set], y[train_set],
                                                  queries[train_set],
                                                  fold_number, method)
        validation_file = preprocess.create_train_file(X[validation_set],
                                                       y[validation_set],
                                                       queries[validation_set],
                                                       fold_number, method,
                                                       True)
        test_file = preprocess.create_train_file_cv(X[test_set], y[test_set],
                                                    queries[test_set],
                                                    fold_number, method, True)
        if append_file:
            print("appending train features")
            run_bash_command("cat " + append_file + " >> " + train_file)
        for C in C_array:

            model_file = svm.learn_svm_light_model(train_file, fold_number, C,
                                                   number_of_queries_in_fold)
            weights = recover_model(model_file)

            svm.w = weights
            scores_file = svm.run_svm_light_model(validation_file, model_file,
                                                  fold_number)
            results = svm.retrieve_scores(validation_set, scores_file)
            score_file = evaluator.create_trec_eval_file(
                validation_set, queries, results, str(C), method, fold_number,
                True)
            score = evaluator.run_trec_eval(score_file, qrels_file)
            scores[C] = score
            models[C] = model_file
        max_C = max(scores.items(), key=operator.itemgetter(1))[0]
        print("on fold", fold_number, "chosen model:", max_C)
        chosen_model = models[max_C]
        test_scores_file = svm.run_svm_light_model(test_file, chosen_model,
                                                   fold_number)
        results = svm.retrieve_scores(test_set, test_scores_file)
        trec_file = evaluator.create_trec_eval_file(test_set, queries, results,
                                                    "", method, fold_number)
        fold_number += 1
    evaluator.order_trec_file(trec_file)
    run_bash_command("rm " + trec_file)
    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method)
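
Several of the examples shell out through run_bash_command (for instance to delete the intermediate TREC file before ordering the final one). Its definition is not included here; a minimal subprocess-based stand-in could look like the sketch below, which is an assumption rather than the project's actual helper.

import subprocess

def run_bash_command_sketch(command):
    # Run a shell command and return its combined stdout/stderr as text.
    result = subprocess.run(command, shell=True, check=False,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    return result.stdout.decode("utf-8", errors="replace")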