# Weaving-summarization pipeline fragment: rewrite each query's reference
# document by weaving in sentences, persist the rewritten texts, rebuild and
# merge the index, extract features, and score the documents with the model.
# NOTE(review): relies on names bound earlier in the file (reference_docs,
# doc_texts, queries, threshold, run_name, params, sys, pickle and the
# project helper functions) -- confirm against the full script.
new_texts = {}
print("starting summarization")
for query in reference_docs:
    print("in", query)
    sys.stdout.flush()
    reference_doc = reference_docs[query]
    new_texts[reference_doc] = create_new_document_by_weaving(
        doc_texts[reference_doc], queries[query], threshold)
print("finished summarization")

# `with` guarantees the handle is closed even if pickling raises.
with open("new_texts" + run_name, "wb") as summary_file:
    pickle.dump(new_texts, summary_file)

reference_docs_list = list(reference_docs.values())
create_trectext(doc_texts, reference_docs_list, new_texts, run_name)
index_path = create_index(run_name)
print("merging indices")
sys.stdout.flush()
new_index_name = merge_indices(index_path, run_name)

features_dir = "Features" + run_name
feature_file = "features"
# The feature file name is shared across runs; wait for any previous run's
# file to disappear before generating a new one.
wait_for_feature_file_to_be_deleted(feature_file)
create_features_file(features_dir, new_index_name, params.queries_xml, run_name)
move_feature_file(feature_file, run_name)
index_doc_name = create_index_to_doc_name_dict(feature_file + run_name)
scores_file = run_model(feature_file + run_name, run_name)
results = retrieve_scores(index_doc_name, scores_file)

# Bug fix: the original never closed this handle, so the pickled scores
# could remain partially buffered; `with` flushes and closes deterministically.
with open("scores_of_model" + run_name, "wb") as results_file:
    pickle.dump(results, results_file)
# Loop-body fragment (the enclosing loop header is outside this view):
# for one (reference_doc, new_sentence, reference_sentence) candidate,
# append the new sentence to the reference document, extract features for
# the modified document, re-rank it, and emit a relevance label reflecting
# the rank improvement.
# NOTE(review): `query` is read (lists[query]) before the visible assignment
# `query = sentence.split("-")[2]` below -- presumably bound earlier in the
# enclosing loop; verify the intended ordering against the full script.
reference_sentence = reference_sentence.replace("\n", "")
# Skip candidates whose reference sentence is empty after newline stripping.
if not reference_sentence: continue
modified_doc = reference_doc + "\n" + new_sentence
summaries[reference_doc] = modified_doc
# Write the (doc, added sentence, replaced sentence) triple for the feature
# extractor; "@@@" is the field separator expected downstream.
add = open("/home/greg/auto_seo/scripts/add_remove_4_test", 'w', encoding="utf8")
add.write(reference_doc + "@@@" + new_sentence.rstrip() + "@@@" + reference_sentence.rstrip() + "\n")
sentence_data_file.write(run_name + "@@@" + new_sentence.rstrip() + "@@@" + reference_sentence.rstrip() + "\n")
add.close()
# Presumably gives the filesystem/extractor time to observe the new file --
# TODO confirm whether this sleep is actually required.
time.sleep(1)
trec_text_file = create_trectext(doc_texts, summaries, "", [])
features_dir = "Features_4"
feature_file = "features_4_" + run_name
create_features_file(features_dir, params.path_to_index, params.queries_xml, feature_file, "/home/greg/auto_seo/scripts/add_remove_4_test", "")
index_doc_name = create_index_to_doc_name_dict(feature_file)
scores_file = run_model(feature_file)
results = retrieve_scores(index_doc_name, scores_file)
lists = create_lists(results)
# Label is the rank improvement above position 3, floored at 0: a document
# ranked 0/1/2 gets 3/2/1, anything at rank >= 3 gets 0.
addition = max(3 - lists[query].index(reference_doc), 0)
query = sentence.split("-")[2]
# qrels-style line: "<query> 1 <run> <label> seo".
labels_file.write(query + " 1 " + run_name + " " + str(addition) + " seo" + "\n")
r_index += 1
from Preprocess.preprocess import load_file
from Experiments.experiment_data_processor import create_trectext, merge_indexes_for_experiments, create_index, create_working_sets_by_round
import params

# Build a trectext corpus restricted to ROUND-04/ROUND-06 documents, create
# per-round working sets, index the corpus, and merge the new index with the
# base ClueWeb index for the sentence experiments.
a_doc_texts = load_file(params.trec_text_file)
# Idiom fix: membership test via `in` instead of the doc.__contains__() dunder
# call, and a dict comprehension instead of the manual filter loop.
doc_texts = {doc: text for doc, text in a_doc_texts.items()
             if "ROUND-04" in doc or "ROUND-06" in doc}
# NOTE(review): "sentnece" typo preserved -- downstream paths may depend on it.
trec_text_file = "trec_text_sentnece_experiments"
create_trectext(doc_texts, [], trec_text_file, "dummy")
working_set_file_basename = "working_set_sentence_experiments"
create_working_sets_by_round(doc_texts, working_set_file_basename)
current_index = create_index(trec_text_file, "sentence_experiment")
base_index = "/home/greg/cluewebindex"
merge_indexes_for_experiments(merged_index="/home/greg/mergedindex",
                              index1=base_index, index2=current_index)
import pickle


def retrieve_query_names():
    """Return a mapping of query id -> human-readable query description.

    Parses params.query_description_file, where each line has the form
    "<query_id>:<description>".
    """
    query_mapper = {}
    with open(params.query_description_file, 'r') as file:
        for line in file:
            # Robustness fix: split on the FIRST colon only, so descriptions
            # that themselves contain ':' are not truncated (the original
            # kept only the text between the first and second colon).
            data = line.split(":", 1)
            query_mapper[data[0]] = data[1].rstrip()
    return query_mapper


if __name__ == "__main__":
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    # The last-ranked document per query is the "reference" doc to improve.
    reference_docs = {
        q: ranked_lists[q][-1].replace("EPOCH", "ROUND") for q in ranked_lists
    }
    print(reference_docs)
    winner_docs = {q: ranked_lists[q][0] for q in ranked_lists}
    a_doc_texts = load_file(params.trec_text_file)
    # Idiom fix: `in` membership tests instead of doc.__contains__() calls.
    doc_texts = {doc: text for doc, text in a_doc_texts.items()
                 if "ROUND-04" in doc or "ROUND-06" in doc}
    trec_text_file = create_trectext(doc_texts, [], "trec_text_round_4_6", "ws4_6")
    added_index = create_index(trec_text_file, "4_6")
    merged_index = merge_indices(added_index, "", "/home/greg/mergedindex")
from Preprocess.preprocess import load_file
from Experiments.experiment_data_processor import create_trectext
import params

# Regenerate a trectext file containing only ROUND-04 documents, with no
# summaries substituted (empty dict) and no documents avoided.
a_doc_texts = load_file(params.trec_text_file)
# Idiom fix: `in` membership test instead of the doc.__contains__() dunder
# call, and a dict comprehension instead of the manual filter loop.
doc_texts = {doc: text for doc, text in a_doc_texts.items()
             if "ROUND-04" in doc}
summaries = {}
create_trectext(document_text=doc_texts, avoid=[], summaries=summaries,
                run_name="")
# Multi-document summarization pipeline fragment: summarize the documents
# ranked above each query's reference document into a replacement text,
# persist the summaries, rebuild and merge the index, extract features, and
# score the modified documents with the ranking model.
# NOTE(review): relies on names bound earlier in the file (ranked_lists,
# reference_docs, queries, number_of_documents_above, gamma, doc_texts,
# index, token2id, dic, id2df, run_name, params, sys, pickle and the
# project helper functions) -- confirm against the full script.
summaries = {}
print("starting summarization")
for query in reference_docs:
    print("in", query)
    sys.stdout.flush()
    reference_doc = reference_docs[query]
    summaries[reference_doc] = create_multi_document_summarization(
        ranked_lists, query, queries[query], reference_doc,
        number_of_documents_above, gamma, doc_texts, index, token2id,
        dic, id2df, run_name)
print("finished summarization")

# Idiom fix: `with` replaces the manual open/close pair so the handle is
# closed (and the pickle flushed) even if dumping raises.
with open("summaries" + run_name, "wb") as summary_file:
    pickle.dump(summaries, summary_file)

# Release the large retrieval structures before the memory-heavy indexing
# and feature-extraction steps below.
del index
del token2id
del id2df
del dic

create_trectext(doc_texts, summaries, run_name)
index_path = create_index(run_name)
print("merging indices")
sys.stdout.flush()
new_index_name = merge_indices(index_path, run_name)

features_dir = "Features" + run_name
feature_file = "features"
# The feature file name is shared across runs; wait for any previous run's
# file to disappear before generating a new one.
wait_for_feature_file_to_be_deleted(feature_file)
create_features_file(features_dir, new_index_name, params.queries_xml, run_name)
move_feature_file(feature_file, run_name)
index_doc_name = create_index_to_doc_name_dict(feature_file + run_name)
scores_file = run_model(feature_file + run_name, run_name)
results = retrieve_scores(index_doc_name, scores_file)

with open("scores_of_model" + run_name, "wb") as results_file:
    pickle.dump(results, results_file)