def run_bots_and_rerank(method, doc_texts, new_features_file, new_qrels_file, sentences, reference_docs,
                        seo_features_file, dummy_scores, labels_file, beta="-"):
    chosen_models_file_name = "chosen_models_" + method
    chosen_models = read_chosen_model_file(chosen_models_file_name)
    # final_trec_file = run_chosen_model_for_stats(chosen_models, method, new_features_file, doc_name_index,
    #                                              new_features_file)
    final_trec_file = cross_validation(new_features_file, new_qrels_file, "summary_labels_" + method + ".tex",
                                       "svm_rank", ["map", "ndcg", "P.2", "P.5"], "")
    best_sentences = pick_best_sentences(final_trec_file)
    # Apply the chosen sentence combination to each reference document.
    for query in reference_docs:
        if query in banned_queries or query not in best_sentences:
            continue
        doc = reference_docs[query]
        if query == "180":
            print(doc_texts[doc], flush=True)
        chosen_comb = best_sentences[query]
        doc_texts = save_modified_file(doc_texts, sentences, chosen_comb, doc)
        if query == "180":
            print(doc_texts[doc], flush=True)
    # Recompute coherency features on the modified documents and re-run the chosen model.
    new_coherence_features_set, max_min_stats = create_coherency_features(
        ref_index=-1, ranked_list_new_file="ranked_lists/trec_file04", doc_text_modified=doc_texts)
    rewrite_fetures(dummy_scores, new_coherence_features_set, seo_features_file, new_features_file + "_exp",
                    coherency_features, "dummy_q", max_min_stats)
    doc_name_index = create_index_to_doc_name_dict(new_features_file + "_exp")
    final_trec_file = run_chosen_model_for_stats(chosen_models, method, new_features_file + "_exp", doc_name_index,
                                                 new_features_file, str(beta))
    new_best_sentences = pick_best_sentences(final_trec_file, best_sentences)
    print(new_best_sentences, flush=True)
    # Write the add/remove decisions and rerank each reference document.
    for query in reference_docs:
        if query in banned_queries or query not in best_sentences:
            continue
        reference_doc = reference_docs[query]
        write_add_remove_file(add_remove_file, new_best_sentences, query, sentences, reference_doc)
        run_reranking(reference_doc, query, labels_file, add_remove_file, beta)
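# Illustration only: a rough sketch of the data shapes run_bots_and_rerank appears to assume,
# inferred from how the arguments are indexed above. All names and values here are hypothetical
# and are not defined anywhere in this repository.
_example_reference_docs = {"180": "EXAMPLE-DOC-ID"}             # query id -> reference document id
_example_best_sentences = {"180": "3_7"}                        # query id -> chosen sentence-combination key
_example_doc_texts = {"EXAMPLE-DOC-ID": "full document text"}   # document id -> raw document text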
def create_features_for_doc_and_run_model(reference_docs, current_time, past_winners_file, doc_ids_file,
                                          model_index, index_path, method_index):
    print("loading w2v model")
    model = load_model()
    print("loading done")
    for query in reference_docs:
        print("working on", query)
        for doc in reference_docs[query]:
            print("working on", doc)
            top_docs_file, first = create_top_docs_per_ref_doc(current_time, doc, query)
            if first:
                continue
            print("top_doc_file is created")
            sentence_file_name, sentences_index = create_sentence_file(top_docs_file, doc, query, current_time)
            print("sentence_file is created")
            working_set_file = create_sentence_working_set(doc, current_time, sentence_file_name, query)
            print("sentence working-set is created")
            create_w2v_features(sentence_file_name, top_docs_file, doc_ids_file, past_winners_file, model, query)
            print("created seo w2v features")
            create_coherency_features(sentences_index, doc, query, model)
            print("created coherency features")
            final_features_dir = "sentence_feature_files/" + current_time + "/"
            features_file = final_features_dir + query + "_" + doc + "_" + current_time
            features_dir = "sentence_feature_values/" + current_time + "/" + query + "_" + doc + "/"
            if not os.path.exists(features_dir):
                os.makedirs(features_dir)
            if not os.path.exists(final_features_dir):
                os.makedirs(final_features_dir)
            create_tfidf_features_and_features_file(working_set_file, features_file, features_dir, index_path,
                                                    sentence_file_name, top_docs_file, query, past_winners_file)
            print("created tf-idf features")
            model_file = model_index[method_index[query + "-" + doc]]
            doc_name_index = create_index_to_doc_name_dict(features_file)
            print("created doc name index")
            trec_file = run_svm_model(features_file, model_file, doc_name_index, query, doc, current_time)
            print("ran seo model")
            best_comb = pick_best_sentence_pair(trec_file)
            print(best_comb)
            sentence_in, sentence_out = get_sentences_for_replacement(best_comb, sentences_index, doc, query)
            replace_sentences_and_save_doc(doc, query, sentence_in, sentence_out)
            print("replaced sentences")
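# Illustration only: the indirection model_index[method_index[query + "-" + doc]] above suggests
# the following (hypothetical) shapes; the real keys, method names, and model paths live elsewhere
# in the repository and are assumed here purely for readability.
_example_method_index = {"180-EXAMPLE-DOC-ID": "harmonic"}          # "query-doc" pair -> chosen method
_example_model_index = {"harmonic": "svm_models/model_harmonic"}    # method -> trained ranking-model file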
def run_reranking(reference_doc, query, labels_file, add_remove_file, beta="-"):
    features_dir = "Features"
    feature_file = "features_" + query
    create_features_file(features_dir, params.path_to_index, params.queries_xml, feature_file, add_remove_file, "")
    index_doc_name = create_index_to_doc_name_dict(feature_file)
    scores_file = run_model(feature_file)
    results = retrieve_scores(index_doc_name, scores_file)
    lists = create_lists(results)
    addition = abs(lists[query].index(reference_doc))
    labels_file.write(query + " 1 " + beta + " " + str(addition) + "\n")
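# Illustration only: run_reranking assumes create_lists returns, per query, a ranked list of
# document ids, so `addition` is the 0-based rank of the reference document in that list. The
# written label line then has the form "<query> 1 <beta> <rank>". The ids and values below are
# hypothetical.
_example_lists = {"180": ["DOC-A", "DOC-B", "DOC-C", "EXAMPLE-REF-DOC"]}
_example_rank = _example_lists["180"].index("EXAMPLE-REF-DOC")  # -> 3, written as "180 1 0.5 3"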
doc_texts = load_file(params.trec_text_file)
merged_index = ""
for index in range(1, 6):
    print("in epoch", index)
    doc_text_for_round = get_docs(doc_texts, round=index)
    trec_text_file = create_trectext_original(document_text=doc_text_for_round, summaries=[], run_name=str(index),
                                              avoid=[])
    new_index = create_index(trec_text_file, str(index))
    # Drop the previous merged index before building the one for this round.
    if merged_index:
        run_bash_command("rm -r " + merged_index)
    merged_index = merge_indices(new_index=new_index, run_name=str(index), new_index_name="merged_index")
    feature_file = "features" + "_" + str(index)
    features_dir = "Features"
    queries_file = "/home/greg/auto_seo/data/queries.xml"
    create_features_file_original(features_dir=features_dir, index_path=merged_index, new_features_file=feature_file,
                                  run_name=str(index), queries_file=queries_file)
    index_doc_name = create_index_to_doc_name_dict(feature_file)
    scores_file = run_model(feature_file, str(index))
    results = retrieve_scores(index_doc_name, scores_file)
    trec_file = create_trec_eval_file(results, str(index))
    order_trec_file(trec_file)
doc_texts[doc] = a_doc_texts[doc]
# summaries = {}
# print("starting summarization")
# for query in reference_docs:
#     print("in", query)
#     sys.stdout.flush()
#     reference_doc = reference_docs[query]
#     summaries[query] = create_multi_document_summarization(ranked_lists, query, queries[query], reference_doc,
#                                                            params.number_of_documents_above, doc_texts, index,
#                                                            token2id, dic, id2df)
# print("finished summarization")
# summary_file = open("summaries", "wb")
# pickle.dump(summaries, summary_file)
# summary_file.close()
# reference_docs_list = list(reference_docs.values())
# create_trectext(doc_texts, reference_docs_list, summaries)
# index_path = create_index()
# print("merging indices")
# sys.stdout.flush()
# merge_indices(index_path)
# positional arguments: features_dir, index_path, queries_file
create_features_file("Features", "/lv_local/home/sgregory/auto_seo/new_merged_index",
                     "/lv_local/home/sgregory/auto_seo/data/queries.xml")
index_doc_name = create_index_to_doc_name_dict("features")
scores_file = run_model("features")
results = retrieve_scores(index_doc_name, scores_file)
results_file = open("scores_of_model", "wb")
pickle.dump(results, results_file)
results_file.close()
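# Illustration only: the pickled scores written above can be reloaded later with the standard
# library (assuming the same working directory). This is not part of the original pipeline.
with open("scores_of_model", "rb") as reloaded_scores_file:
    reloaded_results = pickle.load(reloaded_scores_file)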