def run_bots_and_rerank(method,
                        doc_texts,
                        new_features_file,
                        new_qrels_file,
                        sentences,
                        reference_docs,
                        seo_features_file,
                        dummy_scores,
                        labels_file,
                        beta="-"):
    chosen_models_file_name = "chosen_models_" + method
    chosen_models = read_chosen_model_file(chosen_models_file_name)

    # final_trec_file = run_chosen_model_for_stats(chosen_models, method, new_features_file, doc_name_index,
    #                                              new_features_file)
    # Rank the candidate sentence combinations with svm_rank under cross
    # validation, then keep the best-scoring combination per query.
    final_trec_file = cross_validation(new_features_file, new_qrels_file,
                                       "summary_labels_" + method + ".tex",
                                       "svm_rank",
                                       ["map", "ndcg", "P.2", "P.5"], "")
    best_sentences = pick_best_sentences(final_trec_file)

    for query in reference_docs:
        if query in banned_queries or query not in best_sentences:
            continue

        doc = reference_docs[query]
        # Debug output: print the reference document for query "180" before
        # and after its sentences are replaced.
        if query == "180":
            print(doc_texts[doc], flush=True)
        chosen_comb = best_sentences[query]
        doc_texts = save_modified_file(doc_texts, sentences, chosen_comb, doc)
        if query == "180":
            print(doc_texts[doc], flush=True)

    # Recompute coherence features over the modified document texts and use
    # them to build an expanded feature file.
    new_coherence_features_set, max_min_stats = create_coherency_features(
        ref_index=-1,
        ranked_list_new_file="ranked_lists/trec_file04",
        doc_text_modified=doc_texts)
    rewrite_fetures(dummy_scores, new_coherence_features_set,
                    seo_features_file, new_features_file + "_exp",
                    coherency_features, "dummy_q", max_min_stats)
    doc_name_index = create_index_to_doc_name_dict(new_features_file + "_exp")
    final_trec_file = run_chosen_model_for_stats(chosen_models, method,
                                                 new_features_file + "_exp",
                                                 doc_name_index,
                                                 new_features_file, str(beta))

    new_best_sentences = pick_best_sentences(final_trec_file, best_sentences)

    print(new_best_sentences, flush=True)
    # Persist the chosen sentence additions/removals per query and rerank
    # the full document list for evaluation.
    for query in reference_docs:
        if query in banned_queries or query not in best_sentences:
            continue
        reference_doc = reference_docs[query]
        write_add_remove_file(add_remove_file, new_best_sentences, query,
                              sentences, reference_doc)
        run_reranking(reference_doc, query, labels_file, add_remove_file, beta)
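
# Hypothetical invocation sketch. The method name and file names below are
# illustrative assumptions, not values taken from this repository:
#
#   doc_texts = load_file(params.trec_text_file)
#   run_bots_and_rerank("harmonic", doc_texts,
#                       "new_features_harmonic", "new_qrels_harmonic",
#                       sentences, reference_docs, "seo_features_harmonic",
#                       dummy_scores, labels_file, beta="0.5")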
Example 2
def create_features_for_doc_and_run_model(reference_docs, current_time,
                                          past_winners_file, doc_ids_file,
                                          model_index, index_path,
                                          method_index):
    print("loading w2v model")
    model = load_model()
    print("loading done")
    for query in reference_docs:
        print("working on", query)
        for doc in reference_docs[query]:
            print("working on", doc)
            top_docs_file, first = create_top_docs_per_ref_doc(
                current_time, doc, query)
            if first:
                continue
            print("top_doc_file is created")
            sentence_file_name, sentences_index = create_sentence_file(
                top_docs_file, doc, query, current_time)
            print("sentence_file is created")
            working_set_file = create_sentence_working_set(
                doc, current_time, sentence_file_name, query)
            print("sentence working-set is created")
            create_w2v_features(sentence_file_name, top_docs_file,
                                doc_ids_file, past_winners_file, model, query)
            print("created seo w2v features")
            create_coherency_features(sentences_index, doc, query, model)
            print("created coherency features")
            final_features_dir = "sentence_feature_files/" + current_time + "/"

            features_file = final_features_dir + query + "_" + doc + "_" + current_time
            features_dir = "sentence_feature_values/" + current_time + "/" + query + "_" + doc + "/"
            # Create the feature directories if they do not already exist.
            os.makedirs(features_dir, exist_ok=True)
            os.makedirs(final_features_dir, exist_ok=True)
            create_tfidf_features_and_features_file(
                working_set_file, features_file, features_dir, index_path,
                sentence_file_name, top_docs_file, query, past_winners_file)
            print("created tf-idf features")
            model_file = model_index[method_index[query + "-" + doc]]
            doc_name_index = create_index_to_doc_name_dict(features_file)
            print("created doc name index")
            trec_file = run_svm_model(features_file, model_file,
                                      doc_name_index, query, doc, current_time)
            print("ran seo model")
            best_comb = pick_best_sentence_pair(trec_file)
            print(best_comb)
            sentence_in, sentence_out = get_sentences_for_replacement(
                best_comb, sentences_index, doc, query)
            replace_sentences_and_save_doc(doc, query, sentence_in,
                                           sentence_out)
            print("replaced sentences")
def run_reranking(reference_doc,
                  query,
                  labels_file,
                  add_remove_file,
                  beta="-"):
    features_dir = "Features"
    feature_file = "features_" + query
    create_features_file(features_dir, params.path_to_index,
                         params.queries_xml, feature_file, add_remove_file, "")
    index_doc_name = create_index_to_doc_name_dict(feature_file)
    scores_file = run_model(feature_file)
    results = retrieve_scores(index_doc_name, scores_file)
    lists = create_lists(results)
    # Record the reranked position of the reference document (list.index is
    # always non-negative, so no abs() is needed).
    addition = lists[query].index(reference_doc)
    labels_file.write(query + " 1 " + beta + " " + str(addition) + "\n")
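
# A minimal call sketch; the document id, query id, and file names are
# illustrative assumptions:
#
#   with open("summary_labels.tex", "w") as labels_file:
#       run_reranking("clueweb09-en0000-00-00000", "180", labels_file,
#                     "add_remove_harmonic", beta="0.5")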
Example 4

doc_texts = load_file(params.trec_text_file)
merged_index = ""
# Five epochs: index each round's documents, merge them into a running
# index, and rerank with the learned model.
for index in range(1, 6):
    print("in epoch", index)
    doc_text_for_round = get_docs(doc_texts, round=index)
    trec_text_file = create_trectext_original(document_text=doc_text_for_round,
                                              summaries=[],
                                              run_name=str(index),
                                              avoid=[])
    new_index = create_index(trec_text_file, str(index))
    if merged_index:
        run_bash_command("rm -r " + merged_index)
    merged_index = merge_indices(new_index=new_index,
                                 run_name=str(index),
                                 new_index_name="merged_index")
    feature_file = "features" + "_" + str(index)
    features_dir = "Features"
    queries_file = "/home/greg/auto_seo/data/queries.xml"
    create_features_file_original(features_dir=features_dir,
                                  index_path=merged_index,
                                  new_features_file=feature_file,
                                  run_name=str(index),
                                  queries_file=queries_file)
    index_doc_name = create_index_to_doc_name_dict(feature_file)
    scores_file = run_model(feature_file, str(index))
    results = retrieve_scores(index_doc_name, scores_file)
    trec_file = create_trec_eval_file(results, str(index))
    order_trec_file(trec_file)
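
# Hedged follow-up sketch: assuming order_trec_file returns the path of the
# sorted run file and that a qrels file exists at data/qrels (both are
# assumptions), each epoch's run could be scored with trec_eval:
#
#   sorted_run = order_trec_file(trec_file)
#   run_bash_command("trec_eval data/qrels " + sorted_run)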
Example 5
for doc in a_doc_texts:
    doc_texts[doc] = a_doc_texts[doc]

# summaries={}
# print("starting summarization")
# for query in reference_docs:
#     print("in",query )
#     sys.stdout.flush()
#     reference_doc=reference_docs[query]
#     summaries[query] = create_multi_document_summarization(ranked_lists,query,queries[query],reference_doc,params.number_of_documents_above,doc_texts,index,token2id,dic,id2df)
# print("finished summarization")
# summary_file = open("summaries","wb")
# pickle.dump(summaries,summary_file)
# summary_file.close()

# reference_docs_list = list(reference_docs.values())
# create_trectext(doc_texts,reference_docs_list,summaries)
# index_path = create_index()
# print("merging indices")
# sys.stdout.flush()
# merge_indices(index_path)

# Positional arguments: features_dir, index_path, queries_file.
create_features_file("Features",
                     "/lv_local/home/sgregory/auto_seo/new_merged_index",
                     "/lv_local/home/sgregory/auto_seo/data/queries.xml")
index_doc_name = create_index_to_doc_name_dict("features")
scores_file = run_model("features")

results = retrieve_scores(index_doc_name, scores_file)

results_file = open("scores_of_model", "wb")
pickle.dump(results, results_file)
results_file.close()
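
# A minimal follow-up sketch: the pickled scores can be reloaded later with
# pickle.load.
#
#   with open("scores_of_model", "rb") as f:
#       results = pickle.load(f)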