def summarization_ds(options):
    global model
    sum_model = options.sum_model
    logger.info("reading queries file")
    raw_queries = read_queries_file(options.queries_file)
    logger.info("reading trec file")
    ranked_lists = rtf(options.trec_file)  # rtf: presumably an alias of read_trec_file
    logger.info("transforming queries")
    queries = transform_query_text(raw_queries)
    logger.info("reading trectext file")
    doc_texts = load_file(options.trectext_file)
    logger.info("calculating reference docs")
    reference_docs = get_reference_docs(options.trec_file, int(options.ref_index))
    logger.info("calculating top docs")
    top_docs = get_top_docs(options.trec_file, int(options.number_of_top_docs))
    logger.info("calculating sentences for replacement")
    sentences_for_replacement = get_sentences_for_replacement(
        doc_texts, reference_docs, queries, options.sentences_vectors_dir,
        options.documents_vectors_dir, top_docs, ranked_lists,
        int(options.starting_epoch), model)
    logger.info("writing input sentences file")
    input_file = write_input_dataset_file(sentences_for_replacement, reference_docs,
                                          doc_texts, options.suffix)
    logger.info("writing all files")
    return parrallel_create_summarization_task(
        input_file, queries, sum_model, doc_texts, reference_docs, top_docs,
        options.documents_vectors_dir, ranked_lists, options.suffix)
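# Hypothetical usage sketch (not from the repo): summarization_ds reads all of its
# configuration from an optparse-style options object. The flags below mirror the
# attributes the function accesses; destinations are derived from the flag names,
# and summarization_ds is assumed to be importable from this module.
from optparse import OptionParser

parser = OptionParser()
for flag in ("--sum_model", "--queries_file", "--trec_file", "--trectext_file",
             "--ref_index", "--number_of_top_docs", "--sentences_vectors_dir",
             "--documents_vectors_dir", "--starting_epoch", "--suffix"):
    parser.add_option(flag, dest=flag.lstrip("-"))
(options, args) = parser.parse_args()
summarization_ds(options)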
parser.add_option("--queries_file", dest="queries_file") parser.add_option("--scores_dir", dest="scores_dir") parser.add_option("--trec_file", dest="trec_file") parser.add_option("--sentence_trec_file", dest="sentence_trec_file") parser.add_option("--output_feature_files_dir", dest="output_feature_files_dir") parser.add_option("--output_final_feature_file_dir", dest="output_final_feature_file_dir") parser.add_option("--trectext_file", dest="trectext_file") parser.add_option("--new_trectext_file", dest="new_trectext_file") parser.add_option("--model_file", dest="model_file") parser.add_option("--workingset_file", dest="workingset_file") parser.add_option("--svm_model_file", dest="svm_model_file") (options, args) = parser.parse_args() ranked_lists = read_trec_file(options.trec_file) doc_texts = load_file(options.trectext_file) mode = options.mode if mode == "qrels": create_raw_dataset(ranked_lists, doc_texts, options.raw_ds_out, int(options.ref_index), int(options.top_docs_index)) create_sentence_vector_files(options.sentences_tfidf_dir, options.raw_ds_out, options.index_path) # raw_ds = read_raw_ds(options.raw_ds_out) create_qrels(options.raw_ds_out, options.trec_file, "qrels_seo_bot" + options.ref_index + ".txt", int(options.ref_index), "qrels_indices/", doc_texts, options) if mode == "features": queries = read_queries_file(options.queries_file) queries = transform_query_text(queries)
"reference"] legends = ["reference", "next", "top", "post"] colors = ['b', 'r', "k", "y"] for f in feature_list: ys = [[ original_features_stats[f][k][e] for e in sorted(list(original_features_stats[f][k].keys())) ] for k in legends] x = sorted(list(original_features_stats[f]["next"].keys())) plot_metric(ys, x, f.lower().replace('.', ''), f.replace('.', ''), "Epochs", legends, colors) scores = read_trec_scores(trec) post_scores = read_trec_scores(post_trec) doc_texts = load_file("../data/documents.trectext") updated_doc_texts = load_file("../data/updated_documents.trectext") stats = doc_frequency_eval(ranked_lists, queries, doc_texts) updated_stats = doc_frequency_eval(ranked_lists, queries, updated_doc_texts) stats["post"] = updated_stats["reference"] legends = ["reference", "next", "top", "post"] colors = ['b', 'r', "k", "y"] ys = [[stats[k][e] for e in sorted(list(stats[k].keys()))] for k in legends] x = sorted(list(stats["reference"].keys())) plot_metric(ys, x, "plt/qtf_comp_docs", "Avg QTF", "Epochs", legends, colors) stats = compare_scores(scores, ranked_lists)
from summarization.seo_experiment.utils import load_file

data = load_file("data/documents.trectext")
queries = set()
for doc in data:
    query = doc.split("-")[2]
    queries.add(query)
with open("data/queries.txt") as f1:
    with open("data/queries_comp.txt", 'w') as f2:
        for line in f1:
            if line.split(":")[0] in queries:
                f2.write(line)
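# Illustrative only: given a doc id of the assumed form "ROUND-<epoch>-<qid>-<author>",
# doc.split("-")[2] picks out the query id, e.g.:
#   "ROUND-04-195-author1".split("-")[2]  ->  "195"
# and each line of data/queries.txt is assumed to look like "195:some query text",
# so line.split(":")[0] is the query id being matched.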
if __name__ == "__main__": trectext_file_prefix = sys.argv[1] trec_file = sys.argv[2] fname_addition = sys.argv[3] starting_epoch = int(sys.argv[4]) for ref_index in ["1", "2", "3", "4"]: final_trec_name = "trecs/trec_file_" + fname_addition + "_post_" + str( ref_index) if os.path.exists(final_trec_name): os.remove(final_trec_name) for r in range(starting_epoch, 8): trectext_fname = trectext_file_prefix + "_" + ref_index + ".trectext" trectext_fname_new = trectext_file_prefix + "_" + ref_index + "_" + str( r) + "_new.trectext" trectext_file_for_read = fix_xml_file(trectext_fname) texts = load_file(trectext_file_for_read) original_texts = load_file("data/documents.trectext") ranked_lists = read_trec_file(trec_file) ref_docs = get_ref_docs(ranked_lists, int(ref_index)) workingset_fname = "data/dynamic_experiment_workingset_" + ref_index + "_" + str( r) + ".txt" workingset_docs = create_working_set(ref_docs, texts, r, r + 1, workingset_fname) create_trectext_dynamic(texts, original_texts, workingset_docs, trectext_fname_new) tmp_trec_file = run_reranking(workingset_fname, fname_addition, r, trectext_fname_new) append_to_file(tmp_trec_file, final_trec_name) os.remove(tmp_trec_file) order_trec_file(final_trec_name) os.remove(final_trec_name)
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))
parser = OptionParser()
parser.add_option("--doc_tfidf_dir", dest="doc_tfidf_dir")
parser.add_option("--summaries_tfidf_dir", dest="summaries_tfidf_dir")
parser.add_option("--queries_file", dest="queries_file")
parser.add_option("--summaries_file", dest="summaries_file")
parser.add_option("--input_data_file", dest="input_data_file")
parser.add_option("--trec_file", dest="trec_file")
parser.add_option("--number_of_top_docs", dest="number_of_top_docs")
parser.add_option("--trectext_file", dest="trectext_file")
parser.add_option("--new_trectext_file", dest="new_trectext_file")
parser.add_option("--new_ws_file", dest="new_ws_file")
parser.add_option("--model_file", dest="model_file")
(options, args) = parser.parse_args()
summary_stats, summary_tfidf_fname_index, replacement_indexes, queries_text, reference_docs = \
    read_summaries_data(options.summaries_file, options.input_data_file,
                        options.summaries_tfidf_dir, options.queries_file)
document_texts = load_file(options.trectext_file)
ranked_lists = read_trec_file(options.trec_file)
# model = gensim.models.FastText.load_fasttext_format(options.model_file)
model = gensim.models.KeyedVectors.load_word2vec_format(options.model_file,
                                                        binary=True, limit=700000)
updated_texts = update_texts_with_replacement_summary(
    replacement_indexes, summary_stats, ranked_lists, options.doc_tfidf_dir,
    queries_text, document_texts, options.trec_file,
    int(options.number_of_top_docs), summary_tfidf_fname_index,
    reference_docs, model)
create_trectext(updated_texts, options.new_trectext_file, options.new_ws_file)
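# Note: limit=700000 in load_word2vec_format loads only the first 700k vectors of
# the binary word2vec model, trading vocabulary coverage for memory. A hypothetical
# invocation (script name and paths are placeholders):
#   python update_docs_with_summaries.py \
#       --model_file GoogleNews-vectors-negative300.bin \
#       --trec_file trecs/trec_file_original_sorted.txt \
#       --trectext_file data/documents.trectext --number_of_top_docs 3 \
#       --new_trectext_file data/updated_documents.trectext ...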
from summarization.seo_experiment.utils import load_file, read_trec_file, run_summarization_model
from summarization.seo_experiment.borda_mechanism import read_queries
import nltk


def create_summarization_ds(ranked_lists, texts):
    with open("summarization_data.txt", 'w', encoding="utf-8") as sum_data:
        with open("texts_for_summary.txt", "w", encoding="utf-8") as text_data:
            for r in range(1, 7):
                epoch = str(r).zfill(2)
                for query in ranked_lists[epoch]:
                    winner = ranked_lists[epoch][query][0]
                    text = texts[winner]
                    sentences = nltk.sent_tokenize(text)
                    line = " ".join(["<t> " + s.replace("\n", "") + " </t>" for s in sentences]) + "\n"
                    text_data.write(line)
                    sum_data.write(epoch + "\t" + query + "\t" + winner + "\n")


if __name__ == "__main__":
    trectext_file = "../data/documents.trectext"
    # queries_file = "../data/queries.txt"
    trec_file = "../trecs/trec_file_original_sorted.txt"
    summary_kwargs = {"lstm": {"min_length": "10", "block_ngram_repeat": "2"},
                      "transformer": {"min_length": "3"}}
    ranked_lists = read_trec_file(trec_file)
    texts = load_file(trectext_file)
    create_summarization_ds(ranked_lists, texts)
    run_summarization_model("~/OpenNMT-py/translate.py",
                            "../summarization_models/sum_transformer_model_acc_57.25_ppl_9.22_e16.pt",
                            "texts_for_summary.txt", "top_docs_summaries.txt",
                            **summary_kwargs["transformer"])
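# run_summarization_model is assumed to shell out to OpenNMT-py; a roughly
# equivalent command (the flags exist in OpenNMT-py's translate.py, but the exact
# wiring inside the helper is an assumption):
#   python ~/OpenNMT-py/translate.py \
#       -model ../summarization_models/sum_transformer_model_acc_57.25_ppl_9.22_e16.pt \
#       -src texts_for_summary.txt -output top_docs_summaries.txt -min_length 3
# The "<t> ... </t>" sentence markers follow the sentence-tag convention used by
# OpenNMT-based summarization models trained on CNN/DailyMail-style data.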
from summarization.seo_experiment.utils import load_file
import nltk
import numpy as np

texts = load_file("../data/documents.trectext")
stats = {}
for doc in texts:
    r = doc.split("-")[1]
    if r not in ["06", "07"]:
        continue
    if r not in stats:
        stats[r] = []
    stats[r].append(len(nltk.sent_tokenize(texts[doc])))
for r in stats:
    stats[r] = np.mean(stats[r])
print(stats)
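# Output is a dict mapping round id to the mean sentence count of that round's
# documents, e.g. (illustrative values only): {'06': 24.7, '07': 25.1}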