def run_model(test_file, home_path, java_path, jar_path, score_file, model_path):
    """Score a feature file with a saved ranking model by running a Java jar.

    The executed command has the form::

        <home_path>/<java_path>/bin/java -jar <jar_path> -load <model_path>
            -rank <test_file> -score <score_file>

    Args:
        test_file: Path of the feature file passed to ``-rank``.
        home_path: Prefix under which the Java installation lives.
        java_path: Java installation directory, relative to ``home_path``.
        jar_path: Jar passed to ``java -jar``.
        score_file: File that receives the scores; it is created with
            ``touch`` before the jar runs.
        model_path: Saved model passed to ``-load``.

    Returns:
        ``score_file``, unchanged.
    """
    java_binary = home_path + "/" + java_path + "/bin/java"

    # dirname() is "" for a bare file name and os.makedirs("") raises, so only
    # create the directory when there is one. exist_ok avoids the race between
    # an existence check and the creation.
    score_dir = os.path.dirname(score_file)
    if score_dir:
        os.makedirs(score_dir, exist_ok=True)

    # NOTE: arguments are deliberately left unquoted; quoting would stop the
    # shell from expanding a leading "~" in any of the paths.
    run_bash_command('touch ' + score_file)
    command = (java_binary + " -jar " + jar_path + " -load " + model_path
               + " -rank " + test_file + " -score " + score_file)
    out = run_bash_command(command)
    print(str(out))
    return score_file
def create_features_file(features_dir, index_path, queries_file, new_features_file, working_set_file, scripts_path):
    """Build a feature file for a working set from a single index.

    Steps, all executed through ``run_bash_command``:

    1. ``rm -r`` the old ``features_dir`` and recreate it (the directory that
       will hold ``new_features_file`` is created too).
    2. Run ``<scripts_path>LTRFeatures`` on ``queries_file`` with
       ``index_path`` used as both index and repository.
    3. Move the ``doc*_*`` files into ``features_dir``.
    4. Run ``<scripts_path>generate.pl`` on ``features_dir`` and the working set.
    5. Move ``features`` to ``new_features_file`` and ``featureID`` next to it.

    The intermediate files are addressed by relative name, so they are
    presumably produced in the current working directory.

    Returns:
        ``new_features_file``, unchanged.
    """

    def _echo_and_run(shell_line):
        # Show the command, execute it, then show whatever it returned.
        print(shell_line)
        print(run_bash_command(shell_line))

    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)

    target_dir = os.path.dirname(new_features_file)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    _echo_and_run("".join([
        scripts_path, "LTRFeatures ", queries_file,
        ' -stream=doc -index=', index_path,
        ' -repository=', index_path,
        ' -useWorkingSet=true -workingSetFile=', working_set_file,
        ' -workingSetFormat=trec',
    ]))
    run_bash_command("mv doc*_* " + features_dir)

    _echo_and_run("".join(["perl ", scripts_path, "generate.pl ", features_dir, " ", working_set_file]))
    _echo_and_run("mv features " + new_features_file)

    run_bash_command("mv featureID " + target_dir)
    return new_features_file
def feature_creation_parallel(raw_dataset_file, ranked_lists, doc_texts, top_doc_index, ref_doc_index,
                              doc_tfidf_vectors_dir, tfidf_sentence_dir, queries, output_feature_files_dir,
                              output_final_features_dir, workingset_file):
    """Create per-query feature files in parallel, then merge them.

    The raw dataset is loaded with ``read_raw_ds`` and a working-set file is
    written with ``create_ws``. ``create_features`` is then invoked once per
    element obtained by iterating ``queries`` (all other arguments are bound
    up front), fanned out through ``list_multiprocessing``. Finally
    ``generateSentences.pl`` is run on the per-query output directory and the
    resulting ``features`` file is moved into ``output_final_features_dir``.

    Args:
        raw_dataset_file: Path handed to ``read_raw_ds``.
        ranked_lists, doc_texts, top_doc_index, ref_doc_index,
        doc_tfidf_vectors_dir, tfidf_sentence_dir: Forwarded unchanged to
            ``create_features``.
        queries: Iterable whose elements are the per-task argument.
        output_feature_files_dir: Directory for the per-query feature files.
        output_final_features_dir: Directory that receives the merged file.
        workingset_file: Working-set file written by ``create_ws`` and read
            by the perl script.
    """
    query_ids = list(queries)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_feature_files_dir, exist_ok=True)
    os.makedirs(output_final_features_dir, exist_ok=True)

    raw_ds = read_raw_ds(raw_dataset_file)
    create_ws(raw_ds, workingset_file, ref_doc_index)

    per_query_job = partial(create_features, raw_ds, ranked_lists, doc_texts, top_doc_index, ref_doc_index,
                            doc_tfidf_vectors_dir, tfidf_sentence_dir, queries, output_feature_files_dir)

    # Leave one core free, but never ask for zero workers on a single-core host.
    workers = max(1, cpu_count() - 1)
    list_multiprocessing(query_ids, per_query_job, workers=workers)

    run_bash_command("perl generateSentences.pl " + output_feature_files_dir + " " + workingset_file)
    run_bash_command("mv features " + output_final_features_dir)
def create_features_file_diif(features_dir, base_index_path, new_index_path, queries_file, new_features_file, working_set_file, scripts_path):
    """Build a feature file from a base index plus a new index.

    ``features_dir`` is wiped with ``rm -r`` and recreated, then the Java
    class ``LTRFeatures`` from ``seo_summarization.jar`` is run with a
    hard-coded JDK path, Indri library path, stop-word list and queries file.
    Afterwards ``<scripts_path>generate.pl`` is run, ``features`` is moved to
    ``new_features_file`` and ``featureID`` is moved into the same directory.

    Note:
        ``queries_file`` is accepted but not used; the command always reads
        ``data/working_comp_queries.txt``.

    Returns:
        ``new_features_file``, unchanged.
    """

    def _echo_and_run(shell_line):
        # Show the command, execute it, then show whatever it returned.
        print(shell_line)
        print(run_bash_command(shell_line))

    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)

    target_dir = os.path.dirname(new_features_file)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    java_prefix = "~/jdk1.8.0_181/bin/java -Djava.library.path=/lv_local/home/sgregory/indri-5.6/swig/obj/java/ -cp seo_summarization.jar LTRFeatures "
    _echo_and_run("".join([
        java_prefix,
        base_index_path, " ", new_index_path,
        " data/stopWordsList data/working_comp_queries.txt ",
        working_set_file, " ", features_dir,
    ]))

    _echo_and_run("".join(["perl ", scripts_path, "generate.pl ", features_dir, " ", working_set_file]))
    _echo_and_run("mv features " + new_features_file)

    run_bash_command("mv featureID " + target_dir)
    return new_features_file
def run_svm_rank_model(test_file, model_file, predictions_folder):
    """Run ``./svm_rank_classify`` and return the path of the predictions file.

    The predictions file is named after the model file and placed in
    ``predictions_folder``.

    Args:
        test_file: Feature file to classify.
        model_file: Model file; its base name becomes the predictions file name.
        predictions_folder: Output directory, with or without a trailing
            separator. Created when missing.

    Returns:
        Path of the predictions file.
    """
    # An empty folder means "current directory"; os.makedirs("") would raise.
    if predictions_folder:
        os.makedirs(predictions_folder, exist_ok=True)

    # os.path.join adds the separator only when it is missing, so "preds/"
    # still yields "preds/<model>" while "preds" no longer yields "preds<model>".
    predictions_file = os.path.join(predictions_folder, os.path.basename(model_file))

    command = "./svm_rank_classify " + test_file + " " + model_file + " " + predictions_file
    print("##Running command: " + command + "##")
    out = run_bash_command(command)
    print("Output of ranking command: " + str(out), flush=True)
    return predictions_file
def merge_indices(merged_index, new_index_name, base_index, home_path='/home/greg/', indri_path="indri_test"):
    """Merge two Indri indices into ``merged_index`` with ``dumpindex``.

    Runs ``<home_path>/<indri_path>/bin/dumpindex <merged_index> merge
    <new_index_name> <base_index>`` after making sure the parent directory of
    ``merged_index`` exists.

    Returns:
        ``new_index_name`` (the input index, not the merged one).
    """
    parent_dir = os.path.dirname(merged_index)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    dumpindex_binary = home_path + "/" + indri_path + '/bin/dumpindex'
    command = " ".join([dumpindex_binary, merged_index, 'merge', new_index_name, base_index])

    print("##merging command:", command + "##", flush=True)
    result = run_bash_command(command)
    print("merging command output:" + str(result), flush=True)
    return new_index_name
def run_summarization_model(script_file, model_file, input_file, output_file, **kwargs):
    """Run a summarization script as a shell command.

    cmd example:
    nohup python ~/OpenNMT-py/translate.py --replace_unk -beam_size 10 --model ~/OpenNMT-py/sum_transformer_model_acc_57.25_ppl_9.22_e16.pt --src input_transformer.txt --output transformer_real_par2.txt --batch_size 1 -min_length 1 -gpu 0 &

    Args:
        script_file: Python script to execute.
        model_file: Value for ``--model``.
        input_file: Value for ``--src``.
        output_file: Value for ``--output``.
        **kwargs: Extra options, each appended as ``--<key> <value>``. Values
            may be any type; they are converted with ``str``.
    """
    command = ("python " + script_file + " --replace_unk -beam_size 10 --model " + model_file
               + " --src " + input_file + " --output " + output_file + " --batch_size 1 -gpu 0 ")
    for key, value in kwargs.items():
        # str() lets callers pass numbers (e.g. min_length=1) without a TypeError.
        command += "--" + key + " " + str(value) + " "

    print("##Running summarization command: " + command + "##", flush=True)
    out = run_bash_command(command)
    print("Summarization output= " + str(out), flush=True)
def create_index(trec_text_file, index_path, new_index_name, home_path='/home/greg/', indri_path="indri_test"):
    """Build an Indri index from a trectext file with ``IndriBuildIndex``.

    The index is written to ``<index_path>/<new_index_name>`` using corpus
    class ``trectext``, ``1G`` of memory and the ``krovetz`` stemmer. Both
    ``index_path`` and ``<home_path>/<index_path>`` are created when missing.

    Returns:
        The index path, ``<index_path>/<new_index_name>``.
    """
    builder = home_path + '/' + indri_path + '/bin/IndriBuildIndex'
    index = index_path + "/" + new_index_name

    # Same two directories, checked in the same order as before.
    for directory in (index_path, home_path + "/" + index_path):
        if not os.path.exists(directory):
            os.makedirs(directory)

    command = "".join([
        builder,
        ' -corpus.path=', trec_text_file,
        ' -corpus.class=', 'trectext',
        ' -index=', index,
        ' -memory=', '1G',
        ' -stemmer.name=', 'krovetz',
    ])
    print("##Running IndriBuildIndex command =" + command + "##", flush=True)
    result = run_bash_command(command)
    print("IndriBuildIndex output:" + str(result), flush=True)
    return index
def create_features_file_diff(features_dir, base_index_path, new_index_path, new_features_file, working_set_file, scripts_path, java_path, swig_path, stopwords_file, queries_text_file, home_path):
    """Build a feature file from a base index plus a new index.

    ``features_dir`` is wiped with ``rm -r`` and recreated, then the Java
    class ``LTRFeatures`` from ``seo_indri_utils.jar`` is run using
    ``<home_path><java_path>/bin/java`` with ``swig_path`` as
    ``java.library.path``. Afterwards ``<scripts_path>generate.pl`` is run,
    ``features`` is moved to ``new_features_file`` and ``featureID`` is moved
    into the same directory.

    Returns:
        ``new_features_file``, unchanged.
    """

    def _echo_and_run(shell_line):
        # Show the command, execute it, then show whatever it returned.
        print(shell_line)
        print(run_bash_command(shell_line))

    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)

    target_dir = os.path.dirname(new_features_file)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    java_invocation = home_path + java_path + "/bin/java -Djava.library.path=" + swig_path
    _echo_and_run(" ".join([
        java_invocation,
        "-cp seo_indri_utils.jar LTRFeatures",
        base_index_path,
        new_index_path,
        stopwords_file,
        queries_text_file,
        working_set_file,
        features_dir,
    ]))

    _echo_and_run("".join(["perl ", scripts_path, "generate.pl ", features_dir, " ", working_set_file]))
    _echo_and_run("mv features " + new_features_file)

    run_bash_command("mv featureID " + target_dir)
    return new_features_file
def create_sentence_vector_files(output_dir, raw_ds_file, index_path, java_path, swig_path, home_path):
    """Run the Java class ``PrepareTFIDFVectorsSentences`` from ``seo_indri_utils.jar``.

    The JVM is ``<home_path><java_path>/bin/java`` with ``swig_path`` as
    ``java.library.path``; the class receives ``index_path``, ``raw_ds_file``
    and ``output_dir`` in that order. The command and its output are logged.
    """
    java_invocation = home_path + java_path + "/bin/java -Djava.library.path=" + swig_path
    command = " ".join([
        java_invocation,
        "-cp seo_indri_utils.jar PrepareTFIDFVectorsSentences",
        index_path,
        raw_ds_file,
        output_dir,
    ])
    logger.info("##Running command: " + command + "##")
    result = run_bash_command(command)
    logger.info("Command output: " + str(result))
def create_sentence_vector_files(output_dir, raw_ds_file, index_path):
    """Run the Java class ``PrepareTFIDFVectorsSentences`` from ``seo_summarization.jar``.

    The JDK location and the Indri library path are hard-coded. The class
    receives ``index_path``, ``raw_ds_file`` and ``output_dir`` in that order.
    The command and its output are logged.
    """
    # The leading space is part of the original command string.
    java_prefix = " ~/jdk1.8.0_181/bin/java -Djava.library.path=/lv_local/home/sgregory/indri-5.6/swig/obj/java/ -cp seo_summarization.jar PrepareTFIDFVectorsSentences "
    command = java_prefix + " ".join([index_path, raw_ds_file, output_dir])

    logger.info("##Running command: " + command + "##")
    result = run_bash_command(command)
    logger.info("Command output: " + str(result))
ref_index ) + " --top_docs_index=" + str( number_of_top_docs ) + " --doc_tfidf_dir=asr_tfidf_vectors/ --sentences_tfidf_dir=sentences_tfidf_vectors/ --queries_file=data/queries_seo_exp.xml --scores_dir=scores_bot_weighted_0_" + str( ref_index ) + "/ --trec_file=trecs/trec_file_original_sorted.txt --sentence_trec_file=trecs/bot_weighted_0_sentence_trec_file_" + str( ref_index ) + ".txt --output_feature_files_dir=Features_bot_weighted_0_" + str( ref_index ) + "/ --output_final_feature_file_dir=features_bot_weighted_0_" + str( ref_index ) + "/ --trectext_file=data/documents.trectext --new_trectext_file=data/bot_weighted_0_documents_" + str( ref_index ) + ".trectext --model_file=bot_exp_utils/word2vec_model --svm_model_file=bot_exp_utils/weighted_0 --workingset_file=data/workingset_bot_pairs_" + str( ref_index) out = run_bash_command(bot_command) print(out, flush=True) rerank_command = "python reranking_process.py --mode=all --features_dir=Features_bot_weighted_0_post_" + str( ref_index ) + "/ --merged_index=merged_indices/merged_index/ --queries_file=data/queries_seo_exp.xml --new_features_file=final_features_dir/features_bot_weighted_0_post_" + str( ref_index ) + " --workingset_file=data/workingset_original --scripts_path=scripts/ --java_path=jdk1.8.0_181 --jar_path=scripts/RankLib.jar --score_file=scores/scores_bot_weighted_0_post_" + str( ref_index ) + ".txt --model_file=rank_models/model_bot --trec_file=trecs/trec_file_bot_weighted_0_post_" + str( ref_index ) + " --trectext_file=data/bot_weighted_0_documents_" + str( ref_index ) + ".trectext --home_path=~/ --base_index=merged_indices/merged_index --new_index=new_indices/all_doc_bot_weighted_0_" + str( ref_index) + " --indri_path=work_indri" out = run_bash_command(rerank_command) print(out, flush=True)
) + ".txt --trec_file=trecs/trec_file_original_sorted.txt --number_of_top_docs=3 --trectext_file=data/documents.trectext --new_trectext_file=data/updated_documents_" + str( ref_index ) + ".trectext --new_ws_file=data/updated_workingset --model_file=/lv_local/home/sgregory/textGen/summarization/wiki.en.bin" # rerank_command = "python ranking_process.py --mode=all --features_dir=Features_post_"+str(ref_index)+" --merged_index=merged_indices/merged_index_post --queries_file=data/queries_seo_exp.xml --new_features_file=final_features_dir/features_post_"+str(ref_index)+" --workingset_file=data/workingset_original --scripts_path=scripts/ --java_path=jdk1.8.0_181 --jar_path=scripts/RankLib.jar --score_file=scores/scores_post_"+str(ref_index)+".txt --model_file=rank_models/model_bot --trec_file=trecs/trec_file_post_"+str(ref_index)+" --trectext_file=data/updated_documents_"+str(ref_index)+".trectext --home_path=~/ --base_index=~/cluewebindex --new_index=new_indices/all_doc_index_post_"+str(ref_index)+" --indri_path=work_indri" rerank_command = "python reranking_process.py --mode=all --features_dir=Features_post_" + str( ref_index ) + "/ --merged_index=merged_indices/merged_index --queries_file=data/queries_seo_exp.xml --new_features_file=final_features_dir/features_post_" + str( ref_index ) + " --workingset_file=data/workingset_original --scripts_path=scripts/ --java_path=jdk1.8.0_181 --jar_path=scripts/RankLib.jar --score_file=scores/scores_post_" + str( ref_index ) + ".txt --model_file=rank_models/model_bot --trec_file=trecs/trec_file_post_" + str( ref_index ) + " --trectext_file=data/updated_documents_" + str( ref_index ) + ".trectext --home_path=~/ --base_index=~/cluewebindex --new_index=new_indices/all_doc_index_post_" + str( ref_index) + " --indri_path=work_indri" out = run_bash_command(prep_vectors_command) print(out, flush=True) out = run_bash_command(summarization_command) print(out, flush=True) run_bash_command("rm -r summary_vectors/") out = 
run_bash_command(vectors_command) print(out, flush=True) out = run_bash_command(updata_text_command) print(out, flush=True) # run_bash_command("rm -r merged_indices/merged_index_post") out = run_bash_command(rerank_command) print(out, flush=True)
def order_trec_file(trec_file):
    """Sort a TREC file with the shell ``sort`` utility into a sibling file.

    The output name is ``trec_file`` with every ``.txt`` replaced by
    ``_sorted.txt``. When ``trec_file`` contains no ``.txt``, the suffix
    ``_sorted.txt`` is appended instead.

    Args:
        trec_file: Path of the file to sort.

    Returns:
        Path of the sorted file.
    """
    final = trec_file.replace(".txt", "_sorted.txt")
    if final == trec_file:
        # Without a ".txt" to replace, input and output would be the same path
        # and the shell redirection would truncate the input before sort reads it.
        final = trec_file + "_sorted.txt"

    # Keys: field 1 numeric ascending, then field 5 numeric descending.
    # NOTE(review): "-k2,1" ends before it starts; possibly "-k2,2" was meant - confirm.
    command = "sort -k1,1n -k5nr -k2,1 " + trec_file + " > " + final
    print(command)
    run_bash_command(command)
    return final
# Script body: split each query's reference document into sentences, write them
# to a tab-separated file, then run a Java class that builds vectors from it.
# NOTE(review): `logger`, `get_reference_docs`, `load_file` and
# `run_bash_command` are not defined in this span - presumably set up earlier
# in the file; confirm.
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))
# Command-line options; every one is read as a string.
parser = OptionParser()
parser.add_option("--trec_file", dest="trec_file")
parser.add_option("--trectext_file", dest="trectext_file")
parser.add_option("--ref_index", dest="ref_index")
parser.add_option("--index", dest="index")
parser.add_option("--sentences_out_file", dest="sentences_out_file")
parser.add_option("--vectors_output_dir", dest="vectors_output_dir")
(options, args) = parser.parse_args()
# Make sure both output locations exist before anything is written.
if not os.path.exists(options.vectors_output_dir):
    os.makedirs(options.vectors_output_dir)
if not os.path.exists(os.path.dirname(options.sentences_out_file)):
    os.makedirs(os.path.dirname(options.sentences_out_file))
# reference_docs is indexed by query and yields a document id;
# document_text is indexed by that document id and yields the text.
reference_docs = get_reference_docs(options.trec_file, int(options.ref_index))
document_text = load_file(options.trectext_file)
# One output line per sentence: <query> TAB <doc>_<sentence index> TAB <sentence>.
with open(options.sentences_out_file, 'w') as out_file:
    for query in reference_docs:
        doc = reference_docs[query]
        doc_text = document_text[doc]
        sentences = nltk.sent_tokenize(doc_text)
        for i, sentence in enumerate(sentences):
            # Trailing whitespace and embedded newlines are removed so each
            # record stays on a single line.
            out_file.write(query + "\t" + doc + "_" + str(i) + "\t" + sentence.rstrip().replace("\n", "") + "\n")
# Run the Java class on the sentences file; the JDK and Indri library paths
# are hard-coded in the command string.
command = " ~/jdk1.8.0_181/bin/java -Djava.library.path=/lv_local/home/sgregory/indri-5.6/swig/obj/java/ -cp seo_summarization.jar PrepareTFIDFVectorsReferenceDocs " + options.index + " " + options.sentences_out_file + " " + options.vectors_output_dir
logger.info("## Running vector creation command: " + command + " ##")
logger.info(run_bash_command(command))
logger.info("Vector creation is DONE..")
from gen_utils import run_bash_command
# Driver script: for each reference index 1..4, run three shell commands in
# order and print the output of the first two.
for ref_index in [1, 2, 3, 4]:
    # Passed as --top_docs_index below; capped at 3.
    number_of_top_docs = str(min(3, ref_index))
    # Step 1: run create_raw_ds_summaries.py for this reference index.
    raw_summary_command = "python create_raw_ds_summaries.py " + str(ref_index)
    out = run_bash_command(raw_summary_command)
    print(out, flush=True)
    # Step 2: run the Java class PrepareTFIDFVectorsSummaries; its last
    # argument is a per-index directory, summary_vectors_docs_<ref_index>/.
    vectors_command = "~/jdk1.8.0_181/bin/java -Djava.library.path=/lv_local/home/sgregory/indri-5.6/swig/obj/java/ -cp seo_summarization.jar PrepareTFIDFVectorsSummaries merged_indices/merged_index/ docs.txt competition_doc_summaries summary_vectors_docs_" + str(
        ref_index) + "/"
    out = run_bash_command(vectors_command)
    print(out, flush=True)
    # Step 3: run bot_execution_summaries.py; every per-index input and output
    # path embeds ref_index.
    bot_summaries_command = "python bot_execution_summaries.py --mode=all --index_path=merged_indices/merged_index --raw_ds_out=data/raw_bot_summary_" + str(
        ref_index
    ) + ".txt --ref_index=" + str(
        ref_index
    ) + " --top_docs_index=" + number_of_top_docs + " --doc_tfidf_dir=asr_tfidf_vectors/ --sentences_tfidf_dir=sentence_ref_vectors_" + str(
        ref_index
    ) + "/ --summary_tfidf_dir=summary_vectors_docs_" + str(
        ref_index
    ) + "/ --queries_file=data/queries_seo_exp.xml --scores_dir=scores_summary_bot_1_" + str(
        ref_index
    ) + "/ --trec_file=trecs/trec_file_original_sorted.txt --sentence_trec_file=trecs/bot_summary_1_trec_file_" + str(
        ref_index
    ) + ".txt --output_feature_files_dir=Features_bot_1_summary_" + str(
        ref_index
    ) + "/ --output_final_feature_file_dir=features_bot_summary_1_" + str(
        ref_index
    ) + "/ --trectext_file=data/documents.trectext --new_trectext_file=data/bot_summary_1_documents_" + str(
        ref_index
    ) + ".trectext --model_file=/lv_local/home/sgregory/textGen/summarization/seo_experiment/bot_exp_utils/word2vec_model --svm_model_file=bot_exp_utils/harmonic_competition_model_all_data --workingset_file=data/workingset_bot_summary"
    # NOTE(review): this output is captured but not printed within the visible
    # span - the loop body may continue below.
    out = run_bash_command(bot_summaries_command)