Example #1
def summarize_docs_for_query(queries, k, m, reference_docs, doc_texts):
    print("summarization started")
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    summaries = {}
    query_processed = 0
    for query in reference_docs:
        print("working on query:", query)
        summaries[query] = {}
        Dinit = get_Dinit_for_query(query)
        Dinit_counts = transform_terms_to_counts(Dinit)
        top_k = get_top_k_most_similar_docs_ranked_above(
            k, ranked_lists, query, reference_docs[query])
        print("finished getting top ", k, " results for summary")
        for doc in top_k:
            query_to_doc_probability = query_probability_given_docs(
                queries[query], Dinit_counts)
            summaries[query][doc] = get_top_m_sentences(
                m, doc_texts[doc], Dinit_counts, query_to_doc_probability)
        query_processed += 1
        print("out of ", len(reference_docs), " finished ", query_processed)
    return summaries
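All of these examples assume retrieve_ranked_lists returns a dict mapping each query id to its ranked list of document ids (rank 1 first). A minimal sketch of such a loader, assuming the standard TREC run format ("<query> Q0 <doc> <rank> <score> <tag>") with lines already sorted by rank; this is a hypothetical re-implementation for illustration only:

from collections import defaultdict


def retrieve_ranked_lists(trec_file):
    # map query id -> document ids in rank order
    ranked_lists = defaultdict(list)
    with open(trec_file) as f:
        for line in f:
            parts = line.split()
            ranked_lists[parts[0]].append(parts[2])
    return dict(ranked_lists)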
Example #2
def write_table(method, results):
    # use a context manager so the file is flushed and closed on exit
    with open("summary_two_sentences_" + method + ".tex", "w") as f:
        f.write("\\begin{tabular}{|c|c|}\n")
        f.write("\\hline\n")
        f.write("$\\beta$ & Average Addition \\\\ \n")
        f.write("\\hline\n")
        for beta in results:
            average = str(
                round(np.mean([results[beta][q] for q in results[beta]]), 3))
            f.write(str(beta) + " & " + average + " \\\\ \n")
            f.write("\\hline\n")
        f.write("\\end{tabular}\n")


new_ranked_list = "ranked_lists/trec_file04"
ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
ranked_lists_new = retrieve_ranked_lists(new_ranked_list)
reference_docs = {
    q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
    for q in ranked_lists
}

indexes = read_labels_demotion("labels_demotion")
labels = get_true_labels(indexes, ranked_lists_new, reference_docs)
write_table("demotion", labels)

indexes = read_labels("labels_harmonic")
labels = get_true_labels(indexes, ranked_lists_new, reference_docs)
write_table("harmonic", labels)

indexes = read_labels("labels_weighted")
Example #3
        new_row["document"] = text
        new_rows[i] = new_row
        new_row["check_one_gold"] = ""
    return new_rows


def convert_text_to_sentence_task(text):
    sentences = retrieve_sentences(text)
    new_text = ""
    for j, sentence in enumerate(sentences, start=1):
        new_text += str(j) + ") " + sentence.replace(
            u"\u009D", "").replace("\n", "") + " <br><br>\n"
    return new_text
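A quick illustration of the format this helper emits, assuming retrieve_sentences splits the text into its sentences:

# convert_text_to_sentence_task("First sentence. Second sentence.")
# returns "1) First sentence. <br><br>\n2) Second sentence. <br><br>\n"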


ranked_lists = retrieve_ranked_lists("ranked_lists/trec_file04")
query_data = get_queries_data("topics.full.xml")
reference_docs = {
    q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
    for q in ranked_lists
}
winner_docs = {q: ranked_lists[q][:3] for q in ranked_lists}
a_doc_texts = load_file("documents.trectext")
doc_texts = {}
for doc in a_doc_texts:
    if "ROUND-04" in doc:
        doc_texts[doc] = a_doc_texts[doc]
sentence_map = map_set_of_sentences(doc_texts, winner_docs)
rows = {}
i = 1
sentence_data = {}
            print("created tf-idf features")
    print("creating all features")
    create_features_from_dir(features_dir, features_file,
                             total_working_set_file)
    return features_file


def write_tags(tags, filename, key):
    with open(filename, "a") as file:
        for tag_id in tags:
            # cap the tag value at 5
            tag = str(min(5, sum(tags[tag_id])))
            file.write(tag_id + key + " " + tag + "\n")


if __name__ == "__main__":
    ranked_lists_new = retrieve_ranked_lists("trec_file04")
    reference_docs = {}
    top_docs = {}
    reference_docs["45"] = {
        q: ranked_lists_new[q][-1].replace("EPOCH", "ROUND")
        for q in ranked_lists_new
    }
    top_docs["45"] = {q: ranked_lists_new[q][:3] for q in ranked_lists_new}
    reference_docs["42"] = {
        q: ranked_lists_new[q][1].replace("EPOCH", "ROUND")
        for q in ranked_lists_new
    }
    top_docs["42"] = {q: ranked_lists_new[q][:1] for q in ranked_lists_new}
    ranked_lists_new = retrieve_ranked_lists("trec_file06")
    reference_docs["65"] = {
        q: ranked_lists_new[q][-1].replace("EPOCH", "ROUND")
                            "\\\\ \n")
                    f.write("\\hline\n")
                if j == 2:
                    f.write(str(beta) + " & " + line)
                    f.write("\\hline\n")
            else:
                if j == 2:
                    f.write(method + " & " + str(beta) + " & " + line)
                    f.write("\\hline\n")
        if last:
            f.write("\\end{tabular}\n")
        f.close()


if __name__ == "__main__":
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    reference_docs = {
        q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    dir = "nimo_annotations"
    sorted_files = sort_files_by_date(dir)

    original_docs = retrieve_initial_documents()
    scores = {}
    for k in range(4):
        needed_file = sorted_files[k]
        scores = get_scores(scores, dir + "/" + needed_file, original_docs)
    banned_queries = get_banned_queries(scores, reference_docs)
    ident_filename_fe = "figure-eight/ident_current.csv"
    ident_filename_mturk = "Mturk/Manipulated_Document_Identification.csv"
Example #6
    new_best_sentences = pick_best_sentences(final_trec_file, best_sentences)

    print(new_best_sentences, flush=True)
    for query in reference_docs:

        if query in banned_queries or query not in best_sentences:
            continue
        reference_doc = reference_docs[query]
        write_add_remove_file(add_remove_file, new_best_sentences, query,
                              sentences, reference_doc)
        run_reranking(reference_doc, query, labels_file, add_remove_file, beta)


if __name__ == "__main__":
    ranked_lists_old = retrieve_ranked_lists(params.ranked_lists_file)
    ranked_lists_new = retrieve_ranked_lists("ranked_lists/trec_file04")
    sentences = read_sentences(
        "/home/greg/auto_seo/SentenceRanking/sentences_add_remove")
    reference_docs = {
        q: ranked_lists_old[q][-1].replace("EPOCH", "ROUND")
        for q in ranked_lists_old
    }
    initial_ranks = {
        q: ranked_lists_new[q].index(reference_docs[q]) + 1
        for q in reference_docs
    }
    a_doc_texts = load_file(params.trec_text_file)
    doc_texts = {}
    for doc in a_doc_texts:
        if "ROUND-04" in doc:
Example #7
import itertools

from scipy.stats import kendalltau, spearmanr


def kendall_distance(ranked1, ranked2):
    discordant = 0
    all_pairs = list(itertools.combinations(ranked1, 2))
    for pair in all_pairs:
        winner1, loser1 = determine_order(pair, ranked1)
        winner2, loser2 = determine_order(pair, ranked2)
        if winner1 != winner2:
            discordant += 1
    return float(discordant) / len(all_pairs)
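A small worked example, assuming determine_order(pair, ranked) returns the pair as (higher-ranked, lower-ranked) according to ranked:

# kendall_distance(["a", "b", "c"], ["b", "a", "c"])
# pairs: (a, b), (a, c), (b, c); only (a, b) is ordered differently,
# so the normalized distance is 1 / 3 ≈ 0.333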


ranks_tf_idf_file = 'scores_tfidf_past'
ranks_vec_file = 'scores_vec_past'

ranked_lists_tfidf = retrieve_ranked_lists(ranks_tf_idf_file)
ranked_lists_vec = retrieve_ranked_lists(ranks_vec_file)

sum_kt = 0
sum_kt_dist = 0
sum_spearman = 0
for query in ranked_lists_tfidf:
    list_tf_idf = ranked_lists_tfidf[query]
    list_vec = ranked_lists_vec[query]
    # kendalltau/spearmanr expect numeric sequences, so compare the rank
    # positions of the documents rather than the raw document-id strings
    ranks_tf_idf = list(range(len(list_tf_idf)))
    ranks_vec = [list_vec.index(doc) for doc in list_tf_idf]
    kt = kendalltau(ranks_tf_idf, ranks_vec)
    sp = spearmanr(ranks_tf_idf, ranks_vec)

    kt_dist = kendall_distance(list_tf_idf, list_vec)
    sum_kt += kt[0]
    sum_spearman += sp[0]
    sum_kt_dist += kt_dist
    print(kt)
    print(kt_dist)
    print(list_vec)
Example #8
        label = 0
    else:
        label = max_rank - new_index
    return label


def determine_indexes(doc, ranked_list):
    return min(ranked_list.index(doc), 3)


if __name__ == "__main__":
    current_round = sys.argv[1]
    ref_index = sys.argv[2]
    addition = current_round.zfill(2) + "_" + ref_index
    new_ranked_list = "trec_file" + current_round
    ranked_lists_new = retrieve_ranked_lists(new_ranked_list)
    reference_docs = {
        q: ranked_lists_new[q][int(ref_index)].replace("EPOCH", "ROUND")
        for q in ranked_lists_new
    }
    new_indexes = read_labels("labels_new_" + addition)
    if ref_index == "-1":
        query_name_add = current_round + "5"
    else:
        query_name_add = current_round + "2"
    with open("labels_new_final_all_data", "a") as labels:
        for query in new_indexes:
            for doc in new_indexes[query]:
                new_index = new_indexes[query][doc]
                ref_doc = reference_docs[query]
                old_index = ranked_lists_new[query].index(ref_doc)
Example #9
# print("")
# f= open("summaries_1_03",'rb')
# s = pickle.load(f)
# f.close()
# print(s["ROUND-01-195-51"])
# print("")
# f= open("summaries_1_05",'rb')
# s = pickle.load(f)
# f.close()
# print(s["ROUND-01-195-51"])
# print("")
# f= open("summaries_1_08",'rb')
# s = pickle.load(f)
# f.close()
# print(s["ROUND-01-195-51"])
ranked_lists = retrieve_ranked_lists("trec_file")
runs_pagerank = [
    "1_00", "1_01", "1_02", "1_03", "1_04", "1_05", "1_06", "1_07", "1_08",
    "1_09", "1_10"
]
runs_weaving = [
    "00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"
]
reference_docs = {q: ranked_lists[q][-1] for q in ranked_lists}
results_mean = {}
results_median = {}
for run in runs_pagerank:
    f = open("new_scores/scores_of_model_" + run, "rb")
    scores = pickle.load(f)
    new_lists = create_lists(scores)
    average_rank_addition_value, median_rank_addition_value = average_rank_addition(
Example #10
from Preprocess.preprocess import retrieve_ranked_lists, load_file, retrieve_sentences
import params

doc_texts = load_file(params.trec_text_file)

f=open("past_winners_file_new_data04","w")
for run_name in range(1,4):
    trec_file = "/home/greg/auto_seo/data/trec_file" + str(run_name)
    ranked_lists = retrieve_ranked_lists(trec_file)
    winners = {q:ranked_lists[q][0] for q in ranked_lists}
    for query in ranked_lists:
        text = doc_texts[winners[query]].rstrip()
        sentences = retrieve_sentences(text)
        f.write(query+"45"+"@@@"+" ".join([a.replace("\n","")  for a in sentences])+"\n")
        f.write(query+"42"+"@@@"+" ".join([a.replace("\n","")  for a in sentences])+"\n")
f.close()

f=open("past_winners_file_new_data06","w")
for run_name in range(1,6):
    trec_file = "/home/greg/auto_seo/data/trec_file" + str(run_name)
    ranked_lists = retrieve_ranked_lists(trec_file)
    winners = {q:ranked_lists[q][0] for q in ranked_lists}
    for query in ranked_lists:
        text = doc_texts[winners[query]].rstrip()
        sentences = retrieve_sentences(text)
        f.write(query+"65"+"@@@"+" ".join([a.replace("\n","")  for a in sentences])+"\n")
        f.write(query+"62"+"@@@"+" ".join([a.replace("\n","")  for a in sentences])+"\n")
f.close()
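A minimal sketch of reading the past-winners file back, assuming each line is "<query id plus two-character suffix>@@@<space-joined sentences>" as written above:

with open("past_winners_file_new_data04") as f:
    for line in f:
        key, joined_sentences = line.rstrip("\n").split("@@@", 1)
        query_id, suffix = key[:-2], key[-2:]  # e.g. suffix "45" or "42"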

f=open("new_data_queries","w")
with open("/home/greg/auto_seo/data/queris.txt") as file:
Example #11
def create_sentence_similarities(stats):
    rows = {}
    model = WordToVec().load_model()
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    reference_docs = {
        q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    winner_docs = {q: ranked_lists[q][:3] for q in ranked_lists}
    a_doc_texts = load_file(params.trec_text_file)
    doc_texts = {}
    index = 1
    for doc in a_doc_texts:
        if "ROUND-04" in doc:
            doc_texts[doc] = a_doc_texts[doc]
    sentence_map = map_set_of_sentences(doc_texts, winner_docs)
    for query in sentence_map:
        ref_doc = reference_docs[query]

        text = doc_texts[ref_doc]
        ref_sentences = retrieve_sentences(text)
        for sentence in sentence_map[query]:

            sentence_vec = get_sentence_vector(sentence_map[query][sentence],
                                               model=model)
            for i, ref_sentence in enumerate(ref_sentences):
                row = {}
                run_name = sentence + "_" + str(i + 1)
                if run_name not in stats:
                    continue
                print("run name in stats")
                window = []
                if i == 0:
                    window.append(numpy.ones(300))
                    window.append(get_sentence_vector(ref_sentences[1], model))

                elif i + 1 == len(ref_sentences):
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(numpy.ones(300))
                else:
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(
                        get_sentence_vector(ref_sentences[i + 1], model))

                ref_vector = get_sentence_vector(ref_sentence, model)
                # window_dict = {}
                # window_dict[0]=window
                # window_centroid_dict,_ = get_vectors(window_dict)
                # window_centroid=window_centroid_dict[0]
                # similarity_to_window = cosine_similarity(window_centroid,sentence_vec)
                similarity_to_ref_sentence = cosine_similarity(
                    ref_vector, sentence_vec)
                row["id"] = run_name
                row["similarity_to_prev"] = cosine_similarity(
                    sentence_vec, window[0])
                row["similarity_to_ref_sentence"] = similarity_to_ref_sentence
                row["similarity_to_pred"] = cosine_similarity(
                    sentence_vec, window[1])
                row["similarity_to_prev_ref"] = cosine_similarity(
                    ref_vector, window[0])
                row["similarity_to_pred_ref"] = cosine_similarity(
                    ref_vector, window[1])
                score = 0
                if numpy.mean(stats[run_name]) > 0.5:
                    score = 1
                row["score"] = score
                # row["score"]=numpy.mean(stats[run_name])
                rows[index] = row
                index += 1
    return rows
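Both this example and the next rely on a cosine_similarity helper whose definition is not shown; a minimal sketch of the assumed implementation:

import numpy


def cosine_similarity(u, v):
    # cosine of the angle between the two vectors; assumes neither is all-zero
    return float(numpy.dot(u, v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v)))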
Example #12
def create_coherency_features(ref_index=-1,
                              ranked_list_new_file="",
                              doc_text_modified=""):
    rows = {}
    max_min_stats = {}
    model = WordToVec().load_model()
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    ranked_lists_new = retrieve_ranked_lists(ranked_list_new_file)
    reference_docs = {
        q: ranked_lists[q][ref_index].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    winner_docs = {
        q: ranked_lists_new[q]
        [:determine_indexes(reference_docs[q], ranked_lists_new[q])]
        for q in ranked_lists_new
    }
    file_to_load = params.trec_text_file
    skipped = []
    if doc_text_modified:
        a_doc_texts = doc_text_modified
    else:
        a_doc_texts = load_file(file_to_load)
    doc_texts = {}
    for doc in a_doc_texts:
        if "ROUND-04" in doc:
            doc_texts[doc] = a_doc_texts[doc]
    sentence_map = map_set_of_sentences(doc_texts, winner_docs)
    for query in sentence_map:
        ref_doc = reference_docs[query]

        text = doc_texts[ref_doc]
        ref_sentences = retrieve_sentences(text)
        # if len(ref_sentences) <= 2:
        #     skipped.append(len(ref_sentences) * len(sentence_map[query]))
        #     continue
        for sentence in sentence_map[query]:

            sentence_vec = get_sentence_vector(sentence_map[query][sentence],
                                               model=model)
            for i, ref_sentence in enumerate(ref_sentences):
                row = {}
                run_name = sentence + "_" + str(i + 1)
                window = []
                if i == 0:
                    window.append(get_sentence_vector(ref_sentences[1], model))
                    window.append(get_sentence_vector(ref_sentences[1], model))

                elif i + 1 == len(ref_sentences):
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                else:
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(
                        get_sentence_vector(ref_sentences[i + 1], model))
                ref_vector = get_sentence_vector(ref_sentence, model)
                query_id = run_name.split("-")[2]  # query id in the run name
                row["similarity_to_prev"] = cosine_similarity(
                    sentence_vec, window[0])
                row["similarity_to_ref_sentence"] = cosine_similarity(
                    ref_vector, sentence_vec)
                row["similarity_to_pred"] = cosine_similarity(
                    sentence_vec, window[1])
                row["similarity_to_prev_ref"] = cosine_similarity(
                    ref_vector, window[0])
                row["similarity_to_pred_ref"] = cosine_similarity(
                    ref_vector, window[1])
                max_min_stats = save_max_mix_stats(max_min_stats, row, query_id)
                rows[run_name] = row
    print("missed ", sum(f****d), "examples")
    return rows, max_min_stats
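A hedged usage sketch (the run-file path is illustrative, reusing the one from the earlier examples; params.ranked_lists_file must point at the old ranked lists):

rows, max_min_stats = create_coherency_features(
    ref_index=-1, ranked_list_new_file="ranked_lists/trec_file04")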