Example 1

new_texts = {}
print("starting summarization")
for query in reference_docs:
    print("in", query)
    sys.stdout.flush()
    reference_doc = reference_docs[query]
    # Weave query terms into the reference document's text.
    new_texts[reference_doc] = create_new_document_by_weaving(
        doc_texts[reference_doc], queries[query], threshold)
print("finished summarization")
summary_file = open("new_texts" + run_name, "wb")
pickle.dump(new_texts, summary_file)
summary_file.close()

reference_docs_list = list(reference_docs.values())
create_trectext(doc_texts, reference_docs_list, new_texts, run_name)
index_path = create_index(run_name)
print("merging indices")
sys.stdout.flush()
new_index_name = merge_indices(index_path, run_name)
features_dir = "Features" + run_name
feature_file = "features"
wait_for_feature_file_to_be_deleted(feature_file)
create_features_file(features_dir, new_index_name, params.queries_xml,
                     run_name)
move_feature_file(feature_file, run_name)
index_doc_name = create_index_to_doc_name_dict(feature_file + run_name)
scores_file = run_model(feature_file + run_name, run_name)
results = retrieve_scores(index_doc_name, scores_file)
results_file = open("scores_of_model" + run_name, "wb")
pickle.dump(results, results_file)
results_file.close()
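
Note: retrieve_scores and the other helpers above are defined elsewhere in the repository. As a rough illustration of the contract retrieve_scores satisfies, a minimal sketch might look like this, assuming the model writes one score per line (as svm_rank_classify-style rankers do) and that index_doc_name maps the 0-based row index of the feature file to a document name:

def retrieve_scores(index_doc_name, scores_file):
    # Sketch only: pair each prediction line with the document name that
    # occupies the same row in the feature file.
    results = {}
    with open(scores_file) as f:
        for index, line in enumerate(f):
            results[index_doc_name[index]] = float(line.strip())
    return results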
Example 2

reference_sentence = reference_sentence.replace("\n", "")
if not reference_sentence:
    continue
modified_doc = reference_doc + "\n" + new_sentence
summaries[reference_doc] = modified_doc
add = open("/home/greg/auto_seo/scripts/add_remove_4_test",
           'w',
           encoding="utf8")
add.write(reference_doc + "@@@" + new_sentence.rstrip() +
          "@@@" + reference_sentence.rstrip() + "\n")
sentence_data_file.write(run_name + "@@@" +
                         new_sentence.rstrip() + "@@@" +
                         reference_sentence.rstrip() + "\n")
add.close()
time.sleep(1)
trec_text_file = create_trectext(doc_texts, summaries, "", [])
features_dir = "Features_4"
feature_file = "features_4_" + run_name
create_features_file(
    features_dir, params.path_to_index, params.queries_xml,
    feature_file,
    "/home/greg/auto_seo/scripts/add_remove_4_test", "")
index_doc_name = create_index_to_doc_name_dict(feature_file)
scores_file = run_model(feature_file)
results = retrieve_scores(index_doc_name, scores_file)
lists = create_lists(results)
query = sentence.split("-")[2]
# Reward improvement into the top ranks: 3 points for rank 1, down to 0.
addition = max(3 - lists[query].index(reference_doc), 0)
labels_file.write(query + " 1 " + run_name + " " +
                  str(addition) + " seo" + "\n")
r_index += 1
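
create_lists is not shown here; a minimal sketch of what it plausibly does, assuming document ids embed the query id as their third dash-separated field (as the split("-")[2] above suggests) and that higher scores rank first:

def create_lists(results):
    # Sketch only: group scored documents by query id and sort each group
    # by descending score, so index 0 is the top-ranked document.
    lists = {}
    for doc, score in results.items():
        lists.setdefault(doc.split("-")[2], []).append((doc, score))
    return {q: [doc for doc, _ in sorted(docs, key=lambda d: d[1], reverse=True)]
            for q, docs in lists.items()}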
Example 3
from Preprocess.preprocess import load_file
from Experiments.experiment_data_processor import create_trectext, merge_indexes_for_experiments, create_index, create_working_sets_by_round
import params

a_doc_texts = load_file(params.trec_text_file)
doc_texts = {}
# Keep only documents from rounds 4 and 6.
for doc in a_doc_texts:
    if "ROUND-04" in doc or "ROUND-06" in doc:
        doc_texts[doc] = a_doc_texts[doc]
trec_text_file = "trec_text_sentence_experiments"
create_trectext(doc_texts, [], trec_text_file, "dummy")
working_set_file_basename = "working_set_sentence_experiments"
create_working_sets_by_round(doc_texts, working_set_file_basename)
current_index = create_index(trec_text_file, "sentence_experiment")
base_index = "/home/greg/cluewebindex"
merge_indexes_for_experiments(merged_index="/home/greg/mergedindex",
                              index1=base_index,
                              index2=current_index)
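
create_working_sets_by_round is imported above but not shown. A minimal sketch under the assumption that docnos look like ROUND-04-<qid>-<author> and that working sets use TREC-style run lines (both assumptions, not the repository's confirmed format):

def create_working_sets_by_round(doc_texts, basename):
    # Sketch only: write one working-set file per round, listing each
    # round's documents under their query id in TREC run-line format.
    by_round = {}
    for doc in doc_texts:
        parts = doc.split("-")
        by_round.setdefault(parts[1], []).append((parts[2], doc))
    for round_id, docs in by_round.items():
        with open(basename + "_" + round_id, "w") as f:
            for rank, (qid, doc) in enumerate(sorted(docs), start=1):
                f.write(qid + " Q0 " + doc + " " + str(rank) + " 0 indri\n")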
Example 4
import params
from Preprocess.preprocess import load_file
from Experiments.experiment_data_processor import create_trectext, create_index
# retrieve_ranked_lists and merge_indices are defined elsewhere in the
# repository; their exact module is not shown in this snippet.


def retrieve_query_names():
    # Expects one "qid:description" pair per line in the description file.
    query_mapper = {}
    with open(params.query_description_file, 'r') as file:
        for line in file:
            # Split only on the first colon so descriptions may contain colons.
            data = line.split(":", 1)
            query_mapper[data[0]] = data[1].rstrip()
    return query_mapper
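
For illustration, if the description file contains lines such as 201:raspberry pi (a made-up query id and text), the mapper can be used like this:

query_names = retrieve_query_names()
print(query_names["201"])  # -> "raspberry pi"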


if __name__ == "__main__":
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    reference_docs = {
        q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    print(reference_docs)
    winner_docs = {q: ranked_lists[q][0] for q in ranked_lists}
    a_doc_texts = load_file(params.trec_text_file)
    doc_texts = {}
    # Keep only documents from rounds 4 and 6.
    for doc in a_doc_texts:
        if "ROUND-04" in doc or "ROUND-06" in doc:
            doc_texts[doc] = a_doc_texts[doc]
    trec_text_file = create_trectext(doc_texts, [], "trec_text_round_4_6",
                                     "ws4_6")

    added_index = create_index(trec_text_file, "4_6")
    merged_index = merge_indices(added_index, "", "/home/greg/mergedindex")
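
retrieve_ranked_lists is also not shown in this snippet. The code above treats ranked_lists[q] as an ordered list with the winner at index 0 and the reference document at index -1; a minimal sketch consistent with that, assuming a standard TREC run file ("qid Q0 docno rank score tag"):

def retrieve_ranked_lists(ranked_lists_file):
    # Sketch only: parse a TREC-format run file into {qid: [docno, ...]},
    # ordered best-first by the rank column.
    ranked = {}
    with open(ranked_lists_file) as f:
        for line in f:
            qid, _, docno, rank = line.split()[:4]
            ranked.setdefault(qid, []).append((int(rank), docno))
    return {q: [d for _, d in sorted(docs)] for q, docs in ranked.items()}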
Example 5
from Preprocess.preprocess import load_file
from Experiments.experiment_data_processor import create_trectext
import params

a_doc_texts = load_file(params.trec_text_file)
doc_texts = {}
# Keep only round-4 documents.
for doc in a_doc_texts:
    if "ROUND-04" in doc:
        doc_texts[doc] = a_doc_texts[doc]

summaries = {}

create_trectext(document_text=doc_texts, avoid=[], summaries=summaries, run_name="")
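
The TREC text format that create_trectext emits is standard (<DOC>, <DOCNO>, <TEXT> blocks). A minimal sketch matching the keyword signature used here, with the output path and the exact handling of avoid and summaries being assumptions:

def create_trectext(document_text, avoid, summaries, run_name):
    # Sketch only: write each document in TREC text markup, preferring a
    # rewritten summary over the original text when one exists and
    # skipping docnos listed in `avoid`.
    out_path = "trec_text" + run_name
    with open(out_path, "w") as f:
        for docno, text in document_text.items():
            if docno in avoid:
                continue
            f.write("<DOC>\n<DOCNO>" + docno + "</DOCNO>\n<TEXT>\n" +
                    summaries.get(docno, text) + "\n</TEXT>\n</DOC>\n")
    return out_path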
Example 6

summaries = {}
print("starting summarization")
for query in reference_docs:
    print("in", query)
    sys.stdout.flush()
    reference_doc = reference_docs[query]
    summaries[reference_doc] = create_multi_document_summarization(
        ranked_lists, query, queries[query], reference_doc,
        number_of_documents_above, gamma, doc_texts, index,
        token2id, dic, id2df, run_name)
print("finished summarization")
summary_file = open("summaries" + run_name, "wb")
pickle.dump(summaries, summary_file)
summary_file.close()
# Release the large in-memory index structures before re-indexing.
del index
del token2id
del id2df
del dic
create_trectext(doc_texts, summaries, run_name)
index_path = create_index(run_name)
print("merging indices")
sys.stdout.flush()
new_index_name = merge_indices(index_path, run_name)
features_dir = "Features" + run_name
feature_file = "features"
wait_for_feature_file_to_be_deleted(feature_file)
create_features_file(features_dir, new_index_name, params.queries_xml,
                     run_name)
move_feature_file(feature_file, run_name)
index_doc_name = create_index_to_doc_name_dict(feature_file + run_name)
scores_file = run_model(feature_file + run_name, run_name)
results = retrieve_scores(index_doc_name, scores_file)
results_file = open("scores_of_model" + run_name, "wb")
pickle.dump(results, results_file)
results_file.close()
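
run_model is likewise defined elsewhere; a sketch of the contract it satisfies, assuming an SVMrank-style classifier whose binary and model paths here are placeholders:

import subprocess

def run_model(feature_file, run_name):
    # Sketch only: score the feature file with a pre-trained ranker and
    # return the path of the predictions file.
    scores_file = "predictions" + run_name
    subprocess.check_call(["./svm_rank_classify", feature_file,
                           "svm_model", scores_file])
    return scores_file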