def run_chosen_model_for_stats(chosen_models, method, qrels_file, feature_file, doc_name_index, seo_scores,
                               base_features_file, ref_index, beta=""):
    """Train the chosen SVMrank configuration on the base features, score feature_file, and write a
    correlation table relating document similarities to average query rank promotion."""
    chosen_model_parameter = chosen_models[method]
    svm = svm_handler()
    model_file = svm.learn_svm_rank_model(base_features_file, method, chosen_model_parameter)
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, method)
    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results, method + "_" + ref_index)
    final_trec_file = evaluator.order_trec_file(trec_file)
    increase_stats = get_average_query_rank_promotion(seo_scores, final_trec_file)
    similarities = read_similarity_file("/home/greg/auto_seo/scripts/similarities_file")
    add = ""
    if beta:
        add = "_" + str(beta)
    table_name = "summary_corr_" + method + str(ref_index) + add + ".text"
    create_correlation_for_different_ranks(similarities, increase_stats, table_name, ref_index)
    return table_name

def run_chosen_model_for_stats(chosen_models, method, qrels_file, feature_file, doc_name_index, seo_scores,
                               base_features_file, ref_index, beta=""):
    """Train the chosen SVMrank configuration on the base features, score feature_file, and run
    trec_eval on the resulting run together with the average score-increase statistics."""
    chosen_model_parameter = chosen_models[method]
    svm = svm_handler()
    model_file = svm.learn_svm_rank_model(base_features_file, method, chosen_model_parameter)
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, method)
    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results, method + "_" + ref_index)
    final_trec_file = evaluator.order_trec_file(trec_file)
    increase_stats = get_average_score_increase(seo_scores, final_trec_file)
    add = ""
    if beta:
        add = "_" + str(beta)
    summary_file = method + "_" + str(ref_index) + add + ".tex"
    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method + "_" + ref_index, None, increase_stats)
    return summary_file

def choose_model(features_file, qrels_file, label_method, beta=""):
    """Grid-search the SVMrank C parameter over cross-validation folds and write the name of the
    best-scoring model to chosen_models_<label_method>."""
    number_of_folds = 5
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    metrics = ["map", "ndcg", "P.2", "P.5"]
    evaluator = e.eval(metrics)
    evaluator.create_index_to_doc_name_dict(features_file)
    evaluator.remove_score_file_from_last_run("svm_rank")
    folds = preprocess.create_folds(X, y, queries, number_of_folds)
    fold_number = 1
    C = [0.1, 0.01, 0.001]
    model_handler = s.svm_handler()
    evaluator.empty_validation_files("svm_rank")
    trecs = []
    for train, test in folds:
        # model_handler.set_queries_to_folds(queries, test, fold_number)
        train_file = preprocess.create_train_file(X[train], y[train], queries[train], fold_number, "svm_rank")
        test_file = preprocess.create_train_file(X[test], y[test], queries[test], fold_number, "svm_rank", True)
        for c_value in C:
            model_file = model_handler.learn_svm_rank_model(train_file, fold_number, c_value)
            model_name = os.path.basename(model_file).replace(".txt", "")
            scores_file = model_handler.run_svm_rank_model(test_file, model_file, fold_number)
            results = model_handler.retrieve_scores(test, scores_file)
            trec_file = evaluator.create_trec_eval_file(test_indices=test, queries=queries, results=results,
                                                        model=model_name, method="svm_rank", fold=0, validation=True)
            trecs.append(trec_file)
        trecs = list(set(trecs))
        fold_number += 1
    # Evaluate every candidate run and rank the models by their trec_eval score.
    scores = {}
    for trec_file in trecs:
        print("working on ", trec_file)
        score = evaluator.run_trec_eval(trec_file, qrels_file)
        model = os.path.basename(trec_file)
        scores[model] = score
    sorted_models = sorted(list(scores.keys()), key=lambda x: scores[x], reverse=True)
    for file in sorted_models:
        print(file, scores[file])
    f = open("chosen_models_" + label_method, "w")
    add = ""
    if beta:
        add = "_" + beta
    f.write(label_method + add + " " + sorted_models[0] + "\n")
    f.close()

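# A hedged sketch of how the file written by choose_model() above might be read back into the
# chosen_models dict that run_chosen_model_for_stats() expects. The helper name load_chosen_models,
# the "chosen_models_<label_method>" path convention, and the idea that the chosen C value can be
# parsed from the trailing token of the stored model name are illustrative assumptions, not part of
# the original module.
def load_chosen_models(label_method):
    chosen_models = {}
    with open("chosen_models_" + label_method) as f:
        for line in f:
            key, model_name = line.strip().split(" ", 1)
            # Hypothetical parsing: treat the last "_"-separated token as the chosen C value.
            chosen_models[key] = model_name.split("_")[-1]
    return chosen_models
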
def run_svm_model(feature_file, model_file, doc_name_index, query, ref_doc, current_time):
    svm = svm_handler()
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, query + "_" + ref_doc)
    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results, query + "_" + ref_doc, current_time, query)
    final_trec_file = evaluator.order_trec_file(trec_file)
    return final_trec_file

def cross_validation(features_file, qrels_file, summary_file, append_file=""):
    """5-fold cross-validation for the in-house pairwise RankSVM ("svm_rank_own"): pick C on a
    per-fold validation split, score the held-out test fold, and report trec_eval results."""
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    print("there are ", number_of_queries, 'queries')
    evaluator = e.eval()
    evaluator.create_index_to_doc_name_dict(features_file)
    folds = preprocess.create_folds(X, y, queries, 5)
    fold_number = 1
    # C_array = [0.1, 0.01, 0.0001, 1, 10, 100, 10000]
    C_array = [0.1, 0.01, 0.0001]
    validated = set()
    scores = {}
    models = {}
    method = "svm_rank_own"
    s_handler = sv.svm_handler()
    evaluator.empty_validation_files(method)
    for train, test in folds:
        validated, validation_set, train_set = preprocess.create_validation_set(
            5, validated, set(train), number_of_queries, queries)
        print("transforming data", flush=True)
        transformed_X, transformed_y = s.RankSVM.transform_pairwise(X[train_set], y[train_set])
        for C in C_array:
            svm = s.RankSVM(C)
            model_file = svm.fit(transformed_X, transformed_y, fold_number, C)
            scores_file = svm.predict(X[validation_set], fold_number, C, model_file)
            results = svm.retrieve_scores(validation_set, scores_file)
            score_file = evaluator.create_trec_eval_file(validation_set, queries, results, str(C), method,
                                                         fold_number, True)
            score = evaluator.run_trec_eval(score_file, qrels_file)
            scores[C] = score
            models[C] = svm
        max_C = max(scores.items(), key=operator.itemgetter(1))[0]
        print("on fold", fold_number, "chosen model:", max_C)
        chosen_model = models[max_C]
        test_scores_file = chosen_model.predict(X[test], chosen_model, fold_number)
        results = s_handler.retrieve_scores(test, test_scores_file)
        trec_file = evaluator.create_trec_eval_file(test, queries, results, "", method, fold_number)
        fold_number += 1
        evaluator.order_trec_file(trec_file)
        run_bash_command("rm " + trec_file)
    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method)

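# For context on the transform_pairwise call above: RankSVM reduces ranking to binary
# classification over document pairs (feature differences labelled +1/-1). The sketch below is a
# generic, query-agnostic version of that reduction, following the common scikit-learn RankSVM
# recipe; it is an assumption, not necessarily what s.RankSVM.transform_pairwise implements
# (a query-aware variant would only pair documents belonging to the same query).
def transform_pairwise_sketch(X, y):
    import itertools
    import numpy as np
    X = np.asarray(X)
    y = np.asarray(y)
    X_new, y_new = [], []
    for i, j in itertools.combinations(range(len(y)), 2):
        if y[i] == y[j]:
            continue  # equal labels carry no preference information
        diff = X[i] - X[j]
        sign = np.sign(y[i] - y[j])
        # Flip every other pair so the +1 and -1 classes stay roughly balanced.
        if len(y_new) % 2 == 1:
            diff, sign = -diff, -sign
        X_new.append(diff)
        y_new.append(sign)
    return np.asarray(X_new), np.asarray(y_new)
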
def run_chosen_model_for_stats(chosen_models, method, feature_file, doc_name_index, base_features_file, beta):
    key = method
    if beta:
        key += "_" + beta
    chosen_model_parameter = chosen_models[key]
    svm = svm_handler()
    model_file = svm.learn_svm_rank_model(base_features_file, method, chosen_model_parameter)
    evaluator = eval(["map", "ndcg", "P.2", "P.5"])
    scores_file = svm.run_svm_rank_model(feature_file, model_file, method)
    results = retrieve_scores(scores_file)
    trec_file = create_trec_eval_file(doc_name_index, results, method)
    final_trec_file = evaluator.order_trec_file(trec_file)
    return final_trec_file

def cross_validation(features_file, qrels_file, summary_file, method, metrics, append_file="", seo_scores=False,
                     run_random_for_significance=None):
    """5-fold cross-validation for SVMrank: pick C on a per-fold validation split, score the held-out
    test fold, and optionally add rank-promotion and significance statistics to the trec_eval summary."""
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    print("there are ", number_of_queries, 'queries')
    evaluator = e.eval(metrics)
    evaluator.create_index_to_doc_name_dict(features_file)
    folds = preprocess.create_folds(X, y, queries, 5)
    fold_number = 1
    # C_array = [0.1, 0.01, 0.0001, 1, 10, 100, 10000]
    C_array = [0.1, 0.01, 0.0001]
    validated = set()
    scores = {}
    total_models = {}
    svm = s.svm_handler()
    evaluator.empty_validation_files(method)
    for train, test in folds:
        models = {}
        validated, validation_set, train_set = preprocess.create_validation_set(
            5, validated, set(train), number_of_queries, queries)
        train_set = sorted(list(train_set))
        validation_set = sorted(list(validation_set))
        test_set = sorted(list(test))
        train_file = preprocess.create_train_file(X[train_set], y[train_set], queries[train_set], fold_number, method)
        validation_file = preprocess.create_train_file(X[validation_set], y[validation_set], queries[validation_set],
                                                       fold_number, method, True)
        test_file = preprocess.create_train_file_cv(X[test_set], y[test_set], queries[test_set], fold_number, method,
                                                    True)
        # if append_file:
        #     print("appending train features")
        #     run_bash_command("cat " + append_file + " >> " + train_file)
        for C in C_array:
            model_file = svm.learn_svm_rank_model(train_file, fold_number, C)
            weights = recover_model(model_file)
            svm.w = weights
            scores_file = svm.run_svm_rank_model(validation_file, model_file, fold_number)
            results = svm.retrieve_scores(validation_set, scores_file)
            score_file = evaluator.create_trec_eval_file(validation_set, queries, results, str(C), method,
                                                         fold_number, True)
            score = evaluator.run_trec_eval(score_file, qrels_file)
            scores[C] = score
            models[C] = model_file
        max_C = max(scores.items(), key=operator.itemgetter(1))[0]
        print("on fold", fold_number, "chosen model:", max_C)
        chosen_model = models[max_C]
        total_models[fold_number] = chosen_model
        test_scores_file = svm.run_svm_rank_model(test_file, chosen_model, fold_number)
        results = svm.retrieve_scores(test_set, test_scores_file)
        trec_file = evaluator.create_trec_eval_file(test_set, queries, results, "", method, fold_number)
        fold_number += 1
        final_trec_file = evaluator.order_trec_file(trec_file)
        run_bash_command("rm " + trec_file)
    # sum = []
    # for i in total_models:
    #     w = recover_model(total_models[i])
    #     print(w)
    #     if sum == []:
    #         sum = w
    #     else:
    #         sum += w
    # print(sum)
    #
    # average = sum / len(total_models)
    # print(average)
    # f = open(qrels_file + "_averaged_weights.pkl", "wb")
    # pickle.dump(average, f)
    # f.close()
    if seo_scores:
        # Compare the cross-validated run against a random baseline to mark significant differences.
        increase_rank_stats, cv_firsts = get_average_score_increase(seo_scores, final_trec_file)
        stats, significance_data_cv = evaluator.run_trec_eval_by_query(qrels_file, final_trec_file)
        random_significance_data, random_firsts = run_random_for_significance(features_file, qrels_file, "sig_test",
                                                                              seo_scores=seo_scores)
        sig_signs = discover_significance_relevance(significance_data_cv, random_significance_data)
        sig_signs = discover_significance_rank_promotior(cv_firsts, random_firsts, sig_signs)
    else:
        increase_rank_stats = False
        sig_signs = None
    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method, None, increase_rank_stats, sig_signs)
    del X
    del y
    del queries
    return final_trec_file

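# Usage sketch (illustrative only): how the cross-validation entry point above might be invoked for
# a single SVMrank run. The feature/qrels paths and the summary file name below are placeholders,
# not values from the original repository; seo_scores is left at its default (False) so the
# rank-promotion and significance branch is skipped.
def example_cross_validation_run():
    metrics = ["map", "ndcg", "P.2", "P.5"]
    final_trec = cross_validation("data/features_svm_rank",  # placeholder feature file
                                  "data/qrels",              # placeholder qrels file
                                  "summary_svm_rank.tex",    # placeholder summary name
                                  "svm_rank",
                                  metrics)
    print("ordered TREC run written to", final_trec)
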
def cross_validation(features_file, qrels_file, summary_file, append_file=""):
    """5-fold cross-validation for SVMlight ("svm_light"): pick C on a per-fold validation split,
    score the held-out test fold, and run trec_eval on the combined test runs."""
    preprocess = p.preprocess()
    X, y, queries = preprocess.retrieve_data_from_file(features_file, True)
    number_of_queries = len(set(queries))
    print("there are ", number_of_queries, 'queries')
    evaluator = e.eval()
    evaluator.create_index_to_doc_name_dict(features_file)
    folds = preprocess.create_folds(X, y, queries, 5)
    fold_number = 1
    C_array = [0.1, 0.01, 0.0001]
    # C_array = [0.1, 0.01, 0.0001, 1, 10, 100, 10000]
    validated = set()
    scores = {}
    models = {}
    method = "svm_light"
    svm = s.svm_handler()
    for train, test in folds:
        evaluator.empty_validation_files(method)
        validated, validation_set, train_set = preprocess.create_validation_set(
            5, validated, set(train), number_of_queries, queries)
        number_of_queries_in_fold = len(set(queries[train_set]))
        train_set = sorted(list(train_set))
        validation_set = sorted(list(validation_set))
        test_set = sorted(list(test))
        train_file = preprocess.create_train_file(X[train_set], y[train_set], queries[train_set], fold_number, method)
        validation_file = preprocess.create_train_file(X[validation_set], y[validation_set], queries[validation_set],
                                                       fold_number, method, True)
        test_file = preprocess.create_train_file_cv(X[test_set], y[test_set], queries[test_set], fold_number, method,
                                                    True)
        if append_file:
            print("appending train features")
            run_bash_command("cat " + append_file + " >> " + train_file)
        for C in C_array:
            model_file = svm.learn_svm_light_model(train_file, fold_number, C, number_of_queries_in_fold)
            weights = recover_model(model_file)
            svm.w = weights
            scores_file = svm.run_svm_light_model(validation_file, model_file, fold_number)
            results = svm.retrieve_scores(validation_set, scores_file)
            score_file = evaluator.create_trec_eval_file(validation_set, queries, results, str(C), method,
                                                         fold_number, True)
            score = evaluator.run_trec_eval(score_file, qrels_file)
            scores[C] = score
            models[C] = model_file
        max_C = max(scores.items(), key=operator.itemgetter(1))[0]
        print("on fold", fold_number, "chosen model:", max_C)
        chosen_model = models[max_C]
        test_scores_file = svm.run_svm_light_model(test_file, chosen_model, fold_number)
        results = svm.retrieve_scores(test_set, test_scores_file)
        trec_file = evaluator.create_trec_eval_file(test_set, queries, results, "", method, fold_number)
        fold_number += 1
        evaluator.order_trec_file(trec_file)
        run_bash_command("rm " + trec_file)
    evaluator.run_trec_eval_on_test(qrels_file, summary_file, method)