def summarize_docs_for_query(queries, k, m, reference_docs, doc_texts):
    """Build per-query summaries of the top-k documents ranked above each
    query's reference document.

    Args:
        queries: mapping query-id -> query term structure consumed by
            query_probability_given_docs.
        k: number of most-similar higher-ranked documents to summarize.
        m: number of sentences to keep per summarized document.
        reference_docs: mapping query-id -> reference document id.
        doc_texts: mapping document id -> raw document text.

    Returns:
        dict: summaries[query][doc] -> top-m sentences chosen for that doc.
    """
    print("summarization started")
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    summaries = {}
    query_processed = 0
    for query in reference_docs:
        print("working on query:", query)
        summaries[query] = {}
        Dinit = get_Dinit_for_query(query)
        Dinit_counts = transform_terms_to_counts(Dinit)
        top_k = get_top_k_most_similar_docs_ranked_above(
            k, ranked_lists, query, reference_docs[query])
        print("finished getting top ", k, " results for summary")
        # Hoisted out of the doc loop: this value depends only on the query
        # and Dinit_counts, not on the individual doc, so the original
        # recomputed the identical result for every doc in top_k.
        query_to_doc_probability = query_probability_given_docs(
            queries[query], Dinit_counts)
        for doc in top_k:
            summaries[query][doc] = get_top_m_sentences(
                m, doc_texts[doc], Dinit_counts, query_to_doc_probability)
        query_processed += 1
        print("out of ", len(reference_docs), " finished ", query_processed)
    return summaries
def write_table(method, results):
    """Write a two-column LaTeX table of average rank addition per beta.

    Args:
        method: label embedded in the output filename.
        results: mapping beta -> {query -> value}; each row reports the mean
            over that beta's queries, rounded to 3 decimals.
    """
    # Context manager guarantees the handle is flushed and closed even on
    # error; the original opened the file and never closed it.
    with open("summary_two_sentences_" + method + ".tex", "w") as f:
        f.write("\\begin{tabular}{|c|c|}\n")
        f.write("\\hline\n")
        f.write("$\\beta$ & Average Addition \\\\ \n")
        f.write("\\hline\n")
        for beta in results:
            average = str(
                round(np.mean([results[beta][q] for q in results[beta]]), 3))
            f.write(beta + " & " + average + " \\\\ \n")
            f.write("\\hline\n")
        f.write("\\end{tabular}\n")


new_ranked_list = "ranked_lists/trec_file04"
ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
ranked_lists_new = retrieve_ranked_lists(new_ranked_list)
# Reference doc per query: last-ranked document of the old list, with the id
# prefix rewritten to the ROUND naming scheme.
reference_docs = {
    q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
    for q in ranked_lists
}
indexes = read_labels_demotion("labels_demotion")
labels = get_true_labels(indexes, ranked_lists_new, reference_docs)
write_table("demotion", labels)
indexes = read_labels("labels_harmonic")
labels = get_true_labels(indexes, ranked_lists_new, reference_docs)
write_table("harmonic", labels)
# NOTE(review): the chunk ends here in the original; the write_table call for
# the weighted labels presumably follows outside the visible source.
indexes = read_labels("labels_weighted")
new_row["document"] = text new_rows[i] = new_row new_row["check_one_gold"] = "" return new_rows def convert_text_to_sentence_task(text): sentences = retrieve_sentences(text) new_text = "" for j in range(len(sentences)): new_text += str(j + 1) + ") " + sentences[j].replace( u"\u009D", "").replace("\n", "") + " <br><br>\n" return new_text ranked_lists = retrieve_ranked_lists("ranked_lists/trec_file04") query_data = get_queries_data("topics.full.xml") reference_docs = { q: ranked_lists[q][-1].replace("EPOCH", "ROUND") for q in ranked_lists } winner_docs = {q: ranked_lists[q][:3] for q in ranked_lists} a_doc_texts = load_file("documents.trectext") doc_texts = {} for doc in a_doc_texts: if doc.__contains__("ROUND-04"): doc_texts[doc] = a_doc_texts[doc] sentence_map = map_set_of_sentences(doc_texts, winner_docs) rows = {} i = 1 sentence_data = {}
print("created tf-idf features") print("creating all features") create_features_from_dir(features_dir, features_file, total_working_set_file) return features_file def write_tags(tags, filename, key): with open(filename, "a") as file: for id in tags: tag = str(min(5, sum(tags[id]))) file.write(id + key + " " + tag + "\n") if __name__ == "__main__": ranked_lists_new = retrieve_ranked_lists("trec_file04") reference_docs = {} top_docs = {} reference_docs["45"] = { q: ranked_lists_new[q][-1].replace("EPOCH", "ROUND") for q in ranked_lists_new } top_docs["45"] = {q: ranked_lists_new[q][:3] for q in ranked_lists_new} reference_docs["42"] = { q: ranked_lists_new[q][1].replace("EPOCH", "ROUND") for q in ranked_lists_new } top_docs["42"] = {q: ranked_lists_new[q][:1] for q in ranked_lists_new} ranked_lists_new = retrieve_ranked_lists("trec_file06") reference_docs["65"] = { q: ranked_lists_new[q][-1].replace("EPOCH", "ROUND")
"\\\\ \n") f.write("\\hline\n") if j == 2: f.write(str(beta) + " & " + line) f.write("\\hline\n") else: if j == 2: f.write(method + " & " + str(beta) + " & " + line) f.write("\\hline\n") if last: f.write("\\end{tabular}\n") f.close() if __name__ == "__main__": ranked_lists = retrieve_ranked_lists(params.ranked_lists_file) reference_docs = { q: ranked_lists[q][-1].replace("EPOCH", "ROUND") for q in ranked_lists } dir = "nimo_annotations" sorted_files = sort_files_by_date(dir) original_docs = retrieve_initial_documents() scores = {} for k in range(4): needed_file = sorted_files[k] scores = get_scores(scores, dir + "/" + needed_file, original_docs) banned_queries = get_banned_queries(scores, reference_docs) ident_filename_fe = "figure-eight/ident_current.csv" ident_filename_mturk = "Mturk/Manipulated_Document_Identification.csv"
    # NOTE(review): chunk opens mid-function — reconstructed indentation;
    # confirm against the full file.
    new_best_sentences = pick_best_sentences(final_trec_file, best_sentences)
    print(new_best_sentences, flush=True)
    for query in reference_docs:
        # Skip queries that were banned or have no candidate sentence.
        if query in banned_queries or query not in best_sentences:
            continue
        reference_doc = reference_docs[query]
        write_add_remove_file(add_remove_file, new_best_sentences, query,
                              sentences, reference_doc)
        run_reranking(reference_doc, query, labels_file, add_remove_file,
                      beta)


if __name__ == "__main__":
    ranked_lists_old = retrieve_ranked_lists(params.ranked_lists_file)
    ranked_lists_new = retrieve_ranked_lists("ranked_lists/trec_file04")
    sentences = read_sentences(
        "/home/greg/auto_seo/SentenceRanking/sentences_add_remove")
    reference_docs = {
        q: ranked_lists_old[q][-1].replace("EPOCH", "ROUND")
        for q in ranked_lists_old
    }
    # 1-based rank of each reference doc inside the new lists.
    initial_ranks = {
        q: ranked_lists_new[q].index(reference_docs[q]) + 1
        for q in reference_docs
    }
    a_doc_texts = load_file(params.trec_text_file)
    doc_texts = {}
    # NOTE(review): the chunk ends mid-loop here in the original; the loop
    # body continues outside the visible source.
    for doc in a_doc_texts:
        if doc.__contains__("ROUND-04"):
def kendall_distance(ranked1, ranked2):
    """Return the normalized Kendall tau distance between two rankings.

    Counts the pairs that the two rankings order differently (pair order is
    resolved by the external determine_order helper) and divides by the
    total number of pairs.

    Args:
        ranked1: first ranking; pairs are drawn from its items.
        ranked2: second ranking over the same items.

    Returns:
        float in [0, 1]; 0.0 when the ranking has fewer than two items
        (the original raised ZeroDivisionError on an empty pair set).
    """
    all_pairs = list(itertools.combinations(ranked1, 2))
    if not all_pairs:
        # No pairs exist, so no pair can disagree.
        return 0.0
    discordant = 0
    for pair in all_pairs:
        winner1, loser1 = determine_order(pair, ranked1)
        winner2, loser2 = determine_order(pair, ranked2)
        if winner1 != winner2:
            discordant += 1
    return float(discordant) / len(all_pairs)


ranks_tf_idf_file = 'scores_tfidf_past'
ranks_vec_file = 'scores_vec_past'
ranked_lists_tfidf = retrieve_ranked_lists(ranks_tf_idf_file)
ranked_lists_vec = retrieve_ranked_lists(ranks_vec_file)
sum_kt = 0
sum_kt_dist = 0
sum_spearman = 0
# Compare the tf-idf and vector-space rankings per query with three rank
# correlation measures.
for query in ranked_lists_tfidf:
    list_tf_idf = ranked_lists_tfidf[query]
    list_vec = ranked_lists_vec[query]
    kt = kendalltau(list_tf_idf, list_vec)
    sp = spearmanr(list_tf_idf, list_vec)
    kt_dist = kendall_distance(list_tf_idf, list_vec)
    print(kt)
    print(kt_dist)
    print(list_vec)
        # NOTE(review): chunk opens mid if/else — reconstructed indentation;
        # confirm against the full file.
        label = 0
    else:
        label = max_rank - new_index
    return label


def determine_indexes(doc, ranked_list):
    """Return the position of *doc* in *ranked_list*, capped at 3."""
    return min(ranked_list.index(doc), 3)


if __name__ == "__main__":
    current_round = sys.argv[1]
    ref_index = sys.argv[2]
    # e.g. round "4", ref "-1" -> "04_-1"
    addition = current_round.zfill(2) + "_" + ref_index
    new_ranked_list = "trec_file" + current_round
    ranked_lists_new = retrieve_ranked_lists(new_ranked_list)
    reference_docs = {
        q: ranked_lists_new[q][int(ref_index)].replace("EPOCH", "ROUND")
        for q in ranked_lists_new
    }
    new_indexes = read_labels("labels_new_" + addition)
    # Query-name suffix: "5" when the reference is the last-ranked doc,
    # otherwise "2".
    if ref_index == "-1":
        query_name_add = current_round + "5"
    else:
        query_name_add = current_round + "2"
    with open("labels_new_final_all_data", "a") as labels:
        for query in new_indexes:
            for doc in new_indexes[query]:
                new_index = new_indexes[query][doc]
                ref_doc = reference_docs[query]
                # NOTE(review): the chunk ends here mid-loop in the original.
                old_index = ranked_lists_new[query].index(ref_doc)
# print("") # f= open("summaries_1_03",'rb') # s = pickle.load(f) # f.close() # print(s["ROUND-01-195-51"]) # print("") # f= open("summaries_1_05",'rb') # s = pickle.load(f) # f.close() # print(s["ROUND-01-195-51"]) # print("") # f= open("summaries_1_08",'rb') # s = pickle.load(f) # f.close() # print(s["ROUND-01-195-51"]) ranked_lists = retrieve_ranked_lists("trec_file") runs_pagerank = [ "1_00", "1_01", "1_02", "1_03", "1_04", "1_05", "1_06", "1_07", "1_08", "1_09", "1_10" ] runs_weaving = [ "00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10" ] reference_docs = {q: ranked_lists[q][-1] for q in ranked_lists} results_mean = {} results_median = {} for run in runs_pagerank: f = open("new_scores/scores_of_model_" + run, "rb") scores = pickle.load(f) new_lists = create_lists(scores) average_rank_addition_value, meadian_rank_addition_value = average_rank_addition(
from Preprocess.preprocess import retrieve_ranked_lists,load_file,retrieve_sentences import params doc_texts = load_file(params.trec_text_file) f=open("past_winners_file_new_data04","w") for run_name in range(1,4): trec_file = "/home/greg/auto_seo/data/trec_file" + str(run_name) ranked_lists = retrieve_ranked_lists(trec_file) winners = {q:ranked_lists[q][0] for q in ranked_lists} for query in ranked_lists: text = doc_texts[winners[query]].rstrip() sentences = retrieve_sentences(text) f.write(query+"45"+"@@@"+" ".join([a.replace("\n","") for a in sentences])+"\n") f.write(query+"42"+"@@@"+" ".join([a.replace("\n","") for a in sentences])+"\n") f.close() f=open("past_winners_file_new_data06","w") for run_name in range(1,6): trec_file = "/home/greg/auto_seo/data/trec_file" + str(run_name) ranked_lists = retrieve_ranked_lists(trec_file) winners = {q:ranked_lists[q][0] for q in ranked_lists} for query in ranked_lists: text = doc_texts[winners[query]].rstrip() sentences = retrieve_sentences(text) f.write(query+"65"+"@@@"+" ".join([a.replace("\n","") for a in sentences])+"\n") f.write(query+"62"+"@@@"+" ".join([a.replace("\n","") for a in sentences])+"\n") f.close() f=open("new_data_queries","w") with open("/home/greg/auto_seo/data/queris.txt") as file:
def create_sentence_similarities(stats):
    """Create per-(candidate sentence, reference position) similarity rows.

    For each query, takes candidate sentences from the top-3 winner docs and
    every sentence position of the query's reference document, and computes
    word2vec cosine similarities between the candidate, the reference
    sentence, and the reference sentence's neighbors. A row is kept only
    when its run name ("<sentence>_<position>") appears in *stats*; the row's
    binary score is 1 when the mean of stats[run_name] exceeds 0.5.

    Args:
        stats: mapping run_name -> iterable of numeric annotations.

    Returns:
        dict: 1-based row index -> feature dict.
    """
    rows = {}
    model = WordToVec().load_model()
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    # Reference doc per query: last-ranked document, renamed to the ROUND
    # id scheme.
    reference_docs = {
        q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    winner_docs = {q: ranked_lists[q][:3] for q in ranked_lists}
    a_doc_texts = load_file(params.trec_text_file)
    doc_texts = {}
    index = 1
    for doc in a_doc_texts:
        # Keep only round-04 documents.
        if doc.__contains__("ROUND-04"):
            doc_texts[doc] = a_doc_texts[doc]
    sentence_map = map_set_of_sentences(doc_texts, winner_docs)
    for query in sentence_map:
        ref_doc = reference_docs[query]
        text = doc_texts[ref_doc]
        ref_sentences = retrieve_sentences(text)
        for sentence in sentence_map[query]:
            sentence_vec = get_sentence_vector(sentence_map[query][sentence],
                                               model=model)
            for i, ref_sentence in enumerate(ref_sentences):
                row = {}
                run_name = sentence + "_" + str(i + 1)
                if run_name not in stats:
                    continue
                print("run name in stats")
                # Neighbor window [previous, next]; an all-ones vector stands
                # in for the missing neighbor at the document boundaries.
                # NOTE(review): the i == 0 branch indexes ref_sentences[1],
                # so it assumes at least two reference sentences — confirm.
                window = []
                if i == 0:
                    window.append(numpy.ones(300))
                    window.append(get_sentence_vector(ref_sentences[1],
                                                      model))
                elif i + 1 == len(ref_sentences):
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(numpy.ones(300))
                else:
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(
                        get_sentence_vector(ref_sentences[i + 1], model))
                ref_vector = get_sentence_vector(ref_sentence, model)
                # window_dict = {}
                # window_dict[0]=window
                # window_centroid_dict,_ = get_vectors(window_dict)
                # window_centroid=window_centroid_dict[0]
                # similarity_to_window = cosine_similarity(window_centroid,sentence_vec)
                similarity_to_ref_sentence = cosine_similarity(
                    ref_vector, sentence_vec)
                row["id"] = run_name
                row["similarity_to_prev"] = cosine_similarity(
                    sentence_vec, window[0])
                row["similarity_to_ref_sentence"] = similarity_to_ref_sentence
                row["similarity_to_pred"] = cosine_similarity(
                    sentence_vec, window[1])
                row["similarity_to_prev_ref"] = cosine_similarity(
                    ref_vector, window[0])
                row["similarity_to_pred_ref"] = cosine_similarity(
                    ref_vector, window[1])
                score = 0
                # Binarize: positive when the annotation mean exceeds 0.5.
                if numpy.mean(stats[run_name]) > 0.5:
                    score = 1
                row["score"] = score
                # row["score"]=numpy.mean(stats[run_name])
                rows[index] = row
                index += 1
    return rows
def create_coherency_features(ref_index=-1,
                              ranked_list_new_file="",
                              doc_text_modified=""):
    """Create coherency similarity features for candidate sentences.

    For each query, compares every candidate sentence (taken from the docs
    ranked above the reference document in the new lists) against each
    sentence position of the reference document, computing word2vec cosine
    similarities to the reference sentence and to its neighbor window.

    Args:
        ref_index: rank index of the reference document in the old lists.
        ranked_list_new_file: path of the new ranked-lists trec file.
        doc_text_modified: optional preloaded doc-text mapping; when falsy,
            texts are loaded from params.trec_text_file.

    Returns:
        (rows, max_min_stats): rows maps run_name -> feature dict;
        max_min_stats is the per-query accumulation of save_max_mix_stats.
    """
    rows = {}
    max_min_stats = {}
    model = WordToVec().load_model()
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    ranked_lists_new = retrieve_ranked_lists(ranked_list_new_file)
    reference_docs = {
        q: ranked_lists[q][ref_index].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    winner_docs = {
        q: ranked_lists_new[q]
        [:determine_indexes(reference_docs[q], ranked_lists_new[q])]
        for q in ranked_lists_new
    }
    file_to_load = params.trec_text_file
    # Renamed from the original censored token "f****d", which is not a valid
    # Python identifier and did not parse; it tracks the sizes of skipped
    # example groups reported at the end.
    missed_counts = []
    if doc_text_modified:
        a_doc_texts = doc_text_modified
    else:
        a_doc_texts = load_file(file_to_load)
    doc_texts = {}
    for doc in a_doc_texts:
        # Keep only round-04 documents.
        if "ROUND-04" in doc:
            doc_texts[doc] = a_doc_texts[doc]
    sentence_map = map_set_of_sentences(doc_texts, winner_docs)
    for query in sentence_map:
        ref_doc = reference_docs[query]
        text = doc_texts[ref_doc]
        ref_sentences = retrieve_sentences(text)
        # if len(ref_sentences)<=2:
        #     missed_counts.append(len(ref_sentences)*len(sentence_map[query]))
        #     continue
        for sentence in sentence_map[query]:
            sentence_vec = get_sentence_vector(sentence_map[query][sentence],
                                               model=model)
            for i, ref_sentence in enumerate(ref_sentences):
                row = {}
                run_name = sentence + "_" + str(i + 1)
                # Neighbor window [prev, next]; at the boundaries the single
                # existing neighbor is used twice. NOTE(review): this needs
                # at least two reference sentences (ref_sentences[1] at
                # i == 0) and differs from create_sentence_similarities,
                # which pads boundaries with an all-ones vector — confirm
                # which behavior is intended.
                window = []
                if i == 0:
                    window.append(get_sentence_vector(ref_sentences[1],
                                                      model))
                    window.append(get_sentence_vector(ref_sentences[1],
                                                      model))
                elif i + 1 == len(ref_sentences):
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                else:
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(
                        get_sentence_vector(ref_sentences[i + 1], model))
                ref_vector = get_sentence_vector(ref_sentence, model)
                # NOTE(review): rebinds the outer loop variable `query` to
                # the query id parsed from the run name — preserved as-is.
                query = run_name.split("-")[2]
                row["similarity_to_prev"] = cosine_similarity(
                    sentence_vec, window[0])
                row["similarity_to_ref_sentence"] = cosine_similarity(
                    ref_vector, sentence_vec)
                row["similarity_to_pred"] = cosine_similarity(
                    sentence_vec, window[1])
                row["similarity_to_prev_ref"] = cosine_similarity(
                    ref_vector, window[0])
                row["similarity_to_pred_ref"] = cosine_similarity(
                    ref_vector, window[1])
                max_min_stats = save_max_mix_stats(max_min_stats, row, query)
                rows[run_name] = row
    print("missed ", sum(missed_counts), "examples")
    return rows, max_min_stats