Example #1
def run_chosen_model_for_stats(chosen_models,
                               method,
                               qrels_file,
                               feature_file,
                               doc_name_index,
                               seo_scores,
                               base_features_file,
                               ref_index,
                               beta=""):
    chosen_model_parameter = chosen_models[method]
    svm = svm_handler()
    model_file = svm.learn_svm_rank_model(base_features_file, method,
                                          chosen_model_parameter)
    #
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, method)

    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results,
                                      method + "_" + ref_index)
    final_trec_file = evaluator.order_trec_file(trec_file)
    increase_stats = get_average_query_rank_promotion(seo_scores,
                                                      final_trec_file)
    similarities = read_similarity_file(
        "/home/greg/auto_seo/scripts/similarities_file")
    add = ""
    if beta:
        add = "_" "_" + str(beta)
    table_name = "summary_corr_" + method + str(ref_index) + add + ".text"
    create_correlation_for_different_ranks(similarities, increase_stats,
                                           table_name, ref_index)
    return table_name
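
A minimal invocation sketch for the function above; every value below is a hypothetical placeholder rather than something taken from the repository itself.

# Hypothetical invocation sketch; all values are placeholders.
chosen_models = {"svm_rank": 0.01}   # method -> chosen C parameter
doc_name_index = {}                  # placeholder: feature-row index -> doc name
seo_scores = {}                      # placeholder: per-query SEO scores
table_name = run_chosen_model_for_stats(chosen_models,
                                        "svm_rank",
                                        "/path/to/qrels",
                                        "/path/to/features",
                                        doc_name_index,
                                        seo_scores,
                                        "/path/to/base_features",
                                        "1",        # ref_index is concatenated into strings above
                                        beta=0.5)
print("correlation summary written to", table_name)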
Example #2
def run_chosen_model_for_stats(chosen_models,
                               method,
                               qrels_file,
                               feature_file,
                               doc_name_index,
                               seo_scores,
                               base_features_file,
                               ref_index,
                               beta=""):
    chosen_model_parameter = chosen_models[method]
    svm = svm_handler()
    model_file = svm.learn_svm_rank_model(base_features_file, method,
                                          chosen_model_parameter)
    #
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, method)

    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results,
                                      method + "_" + ref_index)
    final_trec_file = evaluator.order_trec_file(trec_file)
    increase_stats = get_average_score_increase(seo_scores, final_trec_file)
    add = ""
    if beta:
        add = "_" + str(beta)
    summary_file = method + "_" + str(ref_index) + add + ".tex"
    evaluator.run_trec_eval_on_test(qrels_file, summary_file,
                                    method + "_" + ref_index, None,
                                    increase_stats)
    return summary_file
Example #3
def crossvalidation(folds_folder, number_of_folds, combination_name_indexes,
                    qrels, summary_file):

    torch.multiprocessing.set_start_method("spawn")

    lrs = [0.01, 0.001]
    batch_sizes = [3]
    epochs = [5, 10, 17]
    # epochs = [1]
    momentums = [0.9]
    # dropouts = [0.2,0.5]
    scores = {}
    models = {}
    evaluator = eval(metrics=["map", "ndcg", "P.2", "P.5"])
    test_trec_file = "NN_test_trec_file.txt"
    for fold in range(1, number_of_folds + 1):
        print("in fold:", fold)
        models[fold] = {}
        scores[fold] = {}
        training_folder = folds_folder + str(fold) + "/train/"
        validation_folder = folds_folder + str(fold) + "/validation/"
        test_folder = folds_folder + str(fold) + "/test/"
        validation_results_folder = folds_folder + str(
            fold) + "/validation_results/"
        if not os.path.exists(validation_results_folder):
            os.makedirs(validation_results_folder)
        current_labels_file = "labels_fold_" + str(fold) + ".pkl"
        for lr in lrs:
            for epoch in epochs:
                for momentum in momentums:
                    for batch_size in batch_sizes:
                        model_name = "_".join((str(lr), str(epoch),
                                               str(momentum), str(batch_size)))
                        model, model_file = train_model(
                            lr, momentum, current_labels_file, training_folder,
                            batch_size, epoch, fold)
                        results = predict_folder_content(
                            validation_folder, model)
                        trec_file_name = validation_results_folder + "NN_" + model_name + ".txt"
                        evaluator.create_trec_eval_file_nn(
                            results, combination_name_indexes["val"][fold],
                            trec_file_name)
                        score = evaluator.run_trec_eval(trec_file_name, qrels)
                        scores[fold][model_name] = float(score)
                        models[fold][model_name] = model_file
        best_model = max(scores[fold].items(), key=operator.itemgetter(1))[0]
        print("chosen model on fold", fold, ":", best_model)
        test_model = torch.load(models[fold][best_model])
        results = predict_folder_content(test_folder, test_model)
        evaluator.create_trec_eval_file_nn(
            results, combination_name_indexes["test"][fold], test_trec_file,
            True)
    final_trec_file = evaluator.order_trec_file(test_trec_file)
    run_bash_command("rm " + test_trec_file)
    evaluator.run_trec_eval_on_test(summary_file=summary_file,
                                    qrels=qrels,
                                    method="NN",
                                    trec_file=final_trec_file)
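
One caveat about the torch.multiprocessing.set_start_method("spawn") call at the top of this example: it raises RuntimeError if the start method has already been set in the current process (torch.multiprocessing mirrors the standard multiprocessing semantics here). A guarded variant, shown as a sketch, lets crossvalidation be called more than once:

import torch.multiprocessing

# Sketch: tolerate an already-set start method instead of crashing when
# crossvalidation() runs a second time in the same process.
try:
    torch.multiprocessing.set_start_method("spawn")
except RuntimeError:
    pass  # the start method was set earlier; keep whatever is in place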
Example #4
def choose_model(features_file, qrels_file, label_method, beta=""):
    number_of_folds = 5
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    metrics = ["map", "ndcg", "P.2", "P.5"]
    evaluator = e.eval(metrics)
    evaluator.create_index_to_doc_name_dict(features_file)
    evaluator.remove_score_file_from_last_run("svm_rank")
    folds = preprocess.create_folds(X, y, queries, number_of_folds)
    fold_number = 1
    C = [0.1, 0.01, 0.001]
    model_handler = s.svm_handler()
    evaluator.empty_validation_files("svm_rank")
    trecs = []
    for train, test in folds:
        # model_handler.set_queries_to_folds(queries, test, fold_number)
        train_file = preprocess.create_train_file(X[train], y[train],
                                                  queries[train], fold_number,
                                                  "svm_rank")
        test_file = preprocess.create_train_file(X[test], y[test],
                                                 queries[test], fold_number,
                                                 "svm_rank", True)
        for c_value in C:
            model_file = model_handler.learn_svm_rank_model(
                train_file, fold_number, c_value)
            model_name = os.path.basename(model_file).replace(".txt", "")
            scores_file = model_handler.run_svm_rank_model(
                test_file, model_file, fold_number)
            results = model_handler.retrieve_scores(test, scores_file)
            trec_file = evaluator.create_trec_eval_file(test_indices=test,
                                                        queries=queries,
                                                        results=results,
                                                        model=model_name,
                                                        method="svm_rank",
                                                        fold=0,
                                                        validation=True)
            trecs.append(trec_file)
            trecs = list(set(trecs))
        fold_number += 1
    scores = {}
    for trec_file in trecs:
        print("working on ", trec_file)
        score = evaluator.run_trec_eval(trec_file, qrels_file)
        model = os.path.basename(trec_file)
        scores[model] = score

    sorted_models = sorted(list(scores.keys()),
                           key=lambda x: scores[x],
                           reverse=True)
    for file in sorted_models:
        print(file, scores[file])
    f = open("chosen_models_" + label_method, "w")
    add = ""
    if beta:
        add = "_" + beta
    f.write(label_method + add + " " + sorted_models[0] + "\n")
    f.close()
Example #5
def analyze_significance(qrels, score_file1, score_file2):
    evaluator = eval()
    score_data1 = evaluator.run_trec_eval_by_query(qrels, score_file1)
    score_data2 = evaluator.run_trec_eval_by_query(qrels, score_file2)
    for metric in score_data1:
        x = score_data1[metric]
        y = score_data2[metric]
        ttest_val = ttest_rel(x, y)
        print("metric =", metric, ttest_val)
Example #6
def run_svm_model(feature_file, model_file, doc_name_index, query, ref_doc,
                  current_time):
    svm = svm_handler()
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file,
                                         query + "_" + ref_doc)
    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results,
                                      query + "_" + ref_doc, current_time,
                                      query)
    final_trec_file = evaluator.order_trec_file(trec_file)
    return final_trec_file
Example #7
def cross_validation(features_file, qrels_file, summary_file, append_file=""):
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    print("there are ", number_of_queries, 'queries')
    evaluator = e.eval()
    evaluator.create_index_to_doc_name_dict(features_file)

    folds = preprocess.create_folds(X, y, queries, 5)
    fold_number = 1
    # C_array = [0.1, 0.01, 0.0001,1,10,100,10000]
    C_array = [0.1, 0.01, 0.0001]
    validated = set()
    scores = {}
    models = {}
    method = "svm_rank_own"
    s_handler = sv.svm_handler()
    evaluator.empty_validation_files(method)
    for train, test in folds:
        validated, validation_set, train_set = preprocess.create_validation_set(
            5, validated, set(train), number_of_queries, queries)
        print("transforming data", flush=True)
        transformed_X, transformed_y = s.RankSVM.transform_pairwise(
            X[train_set], y[train_set])
        for C in C_array:
            svm = s.RankSVM(C)
            model_file = svm.fit(transformed_X, transformed_y, fold_number, C)
            scores_file = svm.predict(X[validation_set], fold_number, C,
                                      model_file)
            results = svm.retrieve_scores(validation_set, scores_file)
            score_file = evaluator.create_trec_eval_file(
                validation_set, queries, results, str(C), method, fold_number,
                True)
            score = evaluator.run_trec_eval(score_file, qrels_file)
            scores[C] = score
            models[C] = svm
        max_C = max(scores.items(), key=operator.itemgetter(1))[0]
        print("on fold", fold_number, "chosen model:", max_C)
        chosen_model = models[max_C]
        test_scores_file = chosen_model.predict(X[test], chosen_model,
                                                fold_number)
        results = s_handler.retrieve_scores(test, test_scores_file)
        trec_file = evaluator.create_trec_eval_file(test, queries, results, "",
                                                    method, fold_number)

        fold_number += 1
    evaluator.order_trec_file(trec_file)
    run_bash_command("rm " + trec_file)
    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method)
Example #8
def run_chosen_model_for_stats(chosen_models, method, feature_file,
                               doc_name_index, base_features_file, beta):
    key = method
    if beta:
        key += "_" + beta

    chosen_model_parameter = chosen_models[key]
    svm = svm_handler()
    model_file = svm.learn_svm_rank_model(base_features_file, method,
                                          chosen_model_parameter)
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, method)

    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results, method)
    final_trec_file = evaluator.order_trec_file(trec_file)
    return final_trec_file
Example #9
def cross_validation(features_file,
                     qrels_file,
                     summary_file,
                     method,
                     metrics,
                     append_file="",
                     seo_scores=False,
                     run_random_for_significance=None):
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    print("there are ", number_of_queries, 'queries')
    evaluator = e.eval(metrics)
    evaluator.create_index_to_doc_name_dict(features_file)

    folds = preprocess.create_folds(X, y, queries, 5)
    fold_number = 1
    # C_array = [0.1, 0.01, 0.0001,1,10,100,10000]
    C_array = [0.1, 0.01, 0.0001]
    validated = set()
    scores = {}
    total_models = {}
    svm = s.svm_handler()
    evaluator.empty_validation_files(method)
    for train, test in folds:
        models = {}
        validated, validation_set, train_set = preprocess.create_validation_set(
            5, validated, set(train), number_of_queries, queries)
        train_set = sorted(list(train_set))
        validation_set = sorted(list(validation_set))
        test_set = sorted(list(test))
        train_file = preprocess.create_train_file(X[train_set], y[train_set],
                                                  queries[train_set],
                                                  fold_number, method)
        validation_file = preprocess.create_train_file(X[validation_set],
                                                       y[validation_set],
                                                       queries[validation_set],
                                                       fold_number, method,
                                                       True)
        test_file = preprocess.create_train_file_cv(X[test_set], y[test_set],
                                                    queries[test_set],
                                                    fold_number, method, True)
        # if append_file:
        #     print("appending train features")
        #     run_bash_command("cat " + append_file + " >> " + train_file)
        for C in C_array:

            model_file = svm.learn_svm_rank_model(train_file, fold_number, C)
            weights = recover_model(model_file)

            svm.w = weights
            scores_file = svm.run_svm_rank_model(validation_file, model_file,
                                                 fold_number)
            results = svm.retrieve_scores(validation_set, scores_file)
            score_file = evaluator.create_trec_eval_file(
                validation_set, queries, results, str(C), method, fold_number,
                True)
            score = evaluator.run_trec_eval(score_file, qrels_file)
            scores[C] = score
            models[C] = model_file
        max_C = max(scores.items(), key=operator.itemgetter(1))[0]
        print("on fold", fold_number, "chosen model:", max_C)
        chosen_model = models[max_C]
        total_models[fold_number] = chosen_model
        test_scores_file = svm.run_svm_rank_model(test_file, chosen_model,
                                                  fold_number)
        results = svm.retrieve_scores(test_set, test_scores_file)
        trec_file = evaluator.create_trec_eval_file(test_set, queries, results,
                                                    "", method, fold_number)
        fold_number += 1
    final_trec_file = evaluator.order_trec_file(trec_file)
    run_bash_command("rm " + trec_file)
    # sum=[]
    # for i in total_models:
    #     w = recover_model(total_models[i])
    #     print(w)
    #     if sum==[]:
    #         sum=w
    #     else:
    #         sum+=w
    #     print(sum)
    #
    # average = sum/len(total_models)
    # print(average)
    # f = open(qrels_file+"_averaged_weights.pkl","wb")
    # pickle.dump(average,f)
    # f.close()
    if seo_scores:
        increase_rank_stats, cv_firsts = get_average_score_increase(
            seo_scores, final_trec_file)
        stats, significance_data_cv = evaluator.run_trec_eval_by_query(
            qrels_file, final_trec_file)
        random_significance_data, random_firsts = run_random_for_significance(
            features_file, qrels_file, "sig_test", seo_scores=seo_scores)
        sig_signs = discover_significance_relevance(significance_data_cv,
                                                    random_significance_data)
        sig_signs = discover_significance_rank_promotior(
            cv_firsts, random_firsts, sig_signs)
    else:
        increase_rank_stats = False
        sig_signs = None

    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method, None,
                                    increase_rank_stats, sig_signs)
    del X
    del y
    del queries
    return final_trec_file
Example #10
from CrossValidationUtils.evaluator import eval
trees = [500, 250]
leaves = [5, 10, 25, 50]
evaluator = eval(metrics=[""])
qrels = "/home/greg/auto_seo/CrossValidationUtils/mq_track_qrels"
base_folder = "/home/greg/auto_seo/CrossValidationUtils/lm_validation/0/"
for tree in trees:
    for leaf in leaves:
        file_name = base_folder + "trec_file_model_" + str(tree) + "_" + str(
            leaf) + ".txt"
        tmp_file = file_name + "_tmp"
        f = open(tmp_file, "w")
        with open(file_name) as file:
            for line in file:
                new_line_splits = line.split()
                new_line = " ".join([
                    new_line_splits[0], new_line_splits[1], new_line_splits[2],
                    new_line_splits[3], new_line_splits[6], new_line_splits[7]
                ]) + '\n'
                f.write(new_line)
        f.close()
        print("on ", tmp_file)
        final_trec_score = evaluator.run_trec_eval(tmp_file, qrels)
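
The inner loop above keeps only six whitespace-separated fields per line (indices 0-3, 6, and 7), dropping two extra columns from the lm_validation run files; the six surviving fields presumably line up with the standard TREC run format that trec_eval expects (query id, the literal "Q0", document id, rank, score, run tag). A hypothetical helper doing the same trimming, kept as a sketch:

def trim_trec_columns(in_path, out_path, keep=(0, 1, 2, 3, 6, 7)):
    # Keep only the listed whitespace-separated fields of every line, using
    # the same field indices as the loop above, and write the result out.
    with open(in_path) as src, open(out_path, "w") as dst:
        for line in src:
            splits = line.split()
            dst.write(" ".join(splits[i] for i in keep) + "\n")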
Example #11
    qrels_file = sys.argv[2]
    print("qrels file=", qrels_file)
    if len(sys.argv) < 4:
        summary_file = "summary_lm.tex"
    else:
        summary_file = sys.argv[3]
    if len(sys.argv) < 5:
        append_features = ""
    else:
        append_features = sys.argv[4]
    number_of_folds = 5
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    metrics = ["map", "ndcg_cut.20", "P.10", "P.5"]
    evaluator = e.eval(metrics)
    evaluator.create_index_to_doc_name_dict(features_file)
    evaluator.remove_score_file_from_last_run("lm")

    folds = preprocess.create_folds(X, y, queries, number_of_folds)
    fold_number = 1
    trees = [250, 500]
    # trees = [250,]
    leaves = [5, 10, 25, 50]
    # leaves=[5,]
    model_handler = mh.model_handler_LambdaMart(trees, leaves)
    validated = set()
    evaluator.empty_validation_files("lm")
    for train, test in folds:
        validated, validation_set, train_set = preprocess.create_validation_set(
            number_of_folds, validated, set(train), number_of_queries, queries)
Example #12
def cross_validation(features_file, qrels_file, summary_file, append_file=""):
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    print("there are ", number_of_queries, 'queries')
    evaluator = e.eval()
    evaluator.create_index_to_doc_name_dict(features_file)

    folds = preprocess.create_folds(X, y, queries, 5)
    fold_number = 1
    C_array = [0.1, 0.01, 0.0001]
    # C_array = [0.1, 0.01, 0.0001,1,10,100,10000]
    validated = set()
    scores = {}
    models = {}
    method = "svm_light"
    svm = s.svm_handler()
    for train, test in folds:

        evaluator.empty_validation_files(method)
        validated, validation_set, train_set = preprocess.create_validation_set(
            5, validated, set(train), number_of_queries, queries)
        number_of_queries_in_fold = len(set(queries[train_set]))
        train_set = sorted(list(train_set))
        validation_set = sorted(list(validation_set))
        test_set = sorted(list(test))
        train_file = preprocess.create_train_file(X[train_set], y[train_set],
                                                  queries[train_set],
                                                  fold_number, method)
        validation_file = preprocess.create_train_file(X[validation_set],
                                                       y[validation_set],
                                                       queries[validation_set],
                                                       fold_number, method,
                                                       True)
        test_file = preprocess.create_train_file_cv(X[test_set], y[test_set],
                                                    queries[test_set],
                                                    fold_number, method, True)
        if append_file:
            print("appending train features")
            run_bash_command("cat " + append_file + " >> " + train_file)
        for C in C_array:

            model_file = svm.learn_svm_light_model(train_file, fold_number, C,
                                                   number_of_queries_in_fold)
            weights = recover_model(model_file)

            svm.w = weights
            scores_file = svm.run_svm_light_model(validation_file, model_file,
                                                  fold_number)
            results = svm.retrieve_scores(validation_set, scores_file)
            score_file = evaluator.create_trec_eval_file(
                validation_set, queries, results, str(C), method, fold_number,
                True)
            score = evaluator.run_trec_eval(score_file, qrels_file)
            scores[C] = score
            models[C] = model_file
        max_C = max(scores.items(), key=operator.itemgetter(1))[0]
        print("on fold", fold_number, "chosen model:", max_C)
        chosen_model = models[max_C]
        test_scores_file = svm.run_svm_light_model(test_file, chosen_model,
                                                   fold_number)
        results = svm.retrieve_scores(test_set, test_scores_file)
        trec_file = evaluator.create_trec_eval_file(test_set, queries, results,
                                                    "", method, fold_number)
        fold_number += 1
    evaluator.order_trec_file(trec_file)
    run_bash_command("rm " + trec_file)
    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method)
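
This function shares its signature with the svm_rank_own variant in Example #7: a feature file, a qrels file, and a summary file name. A minimal invocation sketch with hypothetical paths:

# Hypothetical paths; append_file is left at its default, so no extra
# training features are concatenated onto the per-fold train files.
cross_validation("/path/to/features_file",
                 "/path/to/qrels_file",
                 "summary_svm_light.tex")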