sentences = retrieve_sentences(text) new_text = "" for j in range(len(sentences)): new_text += str(j + 1) + ") " + sentences[j].replace( u"\u009D", "").replace("\n", "") + " <br><br>\n" return new_text ranked_lists = retrieve_ranked_lists("ranked_lists/trec_file04") query_data = get_queries_data("topics.full.xml") reference_docs = { q: ranked_lists[q][-1].replace("EPOCH", "ROUND") for q in ranked_lists } winner_docs = {q: ranked_lists[q][:3] for q in ranked_lists} a_doc_texts = load_file("documents.trectext") doc_texts = {} for doc in a_doc_texts: if doc.__contains__("ROUND-04"): doc_texts[doc] = a_doc_texts[doc] sentence_map = map_set_of_sentences(doc_texts, winner_docs) rows = {} i = 1 sentence_data = {} for query in sentence_map: reference = reference_docs[query] text = doc_texts[reference][1:].replace(u"\u009D", "") sentences = [s.replace("\"", "") for s in retrieve_sentences(text)] for sentence in sentence_map[query]:
def create_sentence_similarities(stats):
    """Build per-sentence similarity feature rows for judged runs.

    For each query, pairs every candidate sentence (taken from the query's
    top-3 "winner" documents) with every sentence position of the query's
    reference document (the last-ranked document), and computes cosine
    similarities between the candidate vector, the reference-sentence
    vector, and the reference sentence's neighbours.  Document edges are
    padded with an all-ones 300-dim vector.

    Args:
        stats: mapping of run name ("<sentence>_<ref_position>") to a
            sequence of numeric judgements.  Rows are emitted only for run
            names present here.

    Returns:
        dict mapping a running integer index (starting at 1) to a feature
        row with keys "id", "similarity_to_prev",
        "similarity_to_ref_sentence", "similarity_to_pred",
        "similarity_to_prev_ref", "similarity_to_pred_ref" and a binary
        "score" (1 when the mean judgement exceeds 0.5).
    """
    rows = {}
    model = WordToVec().load_model()
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    # The last-ranked document of each query serves as the reference.
    reference_docs = {
        q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    # Top-3 ranked documents supply the candidate sentences.
    winner_docs = {q: ranked_lists[q][:3] for q in ranked_lists}
    a_doc_texts = load_file(params.trec_text_file)
    # Restrict to round-4 documents only.
    doc_texts = {
        doc: text for doc, text in a_doc_texts.items() if "ROUND-04" in doc
    }
    sentence_map = map_set_of_sentences(doc_texts, winner_docs)
    index = 1
    for query in sentence_map:
        ref_doc = reference_docs[query]
        ref_sentences = retrieve_sentences(doc_texts[ref_doc])
        for sentence in sentence_map[query]:
            sentence_vec = get_sentence_vector(sentence_map[query][sentence],
                                               model=model)
            for i, ref_sentence in enumerate(ref_sentences):
                run_name = sentence + "_" + str(i + 1)
                # Only judged runs produce feature rows.
                if run_name not in stats:
                    continue
                print("run name in stats")
                # window = [previous-sentence vector, next-sentence vector];
                # an all-ones vector pads the first/last position.
                # NOTE(review): a single-sentence reference document would
                # raise IndexError on ref_sentences[1] (as in the original).
                if i == 0:
                    window = [numpy.ones(300),
                              get_sentence_vector(ref_sentences[1], model)]
                elif i + 1 == len(ref_sentences):
                    window = [get_sentence_vector(ref_sentences[i - 1], model),
                              numpy.ones(300)]
                else:
                    window = [get_sentence_vector(ref_sentences[i - 1], model),
                              get_sentence_vector(ref_sentences[i + 1], model)]
                ref_vector = get_sentence_vector(ref_sentence, model)
                row = {
                    "id": run_name,
                    "similarity_to_prev": cosine_similarity(sentence_vec,
                                                            window[0]),
                    "similarity_to_ref_sentence": cosine_similarity(
                        ref_vector, sentence_vec),
                    "similarity_to_pred": cosine_similarity(sentence_vec,
                                                            window[1]),
                    "similarity_to_prev_ref": cosine_similarity(ref_vector,
                                                                window[0]),
                    "similarity_to_pred_ref": cosine_similarity(ref_vector,
                                                                window[1]),
                    # Binary label: positive when mean judgement > 0.5.
                    "score": 1 if numpy.mean(stats[run_name]) > 0.5 else 0,
                }
                rows[index] = row
                index += 1
    return rows
} top_docs["42"] = {q: ranked_lists_new[q][:1] for q in ranked_lists_new} ranked_lists_new = retrieve_ranked_lists("trec_file06") reference_docs["65"] = { q: ranked_lists_new[q][-1].replace("EPOCH", "ROUND") for q in ranked_lists_new } top_docs["65"] = {q: ranked_lists_new[q][:3] for q in ranked_lists_new} reference_docs["62"] = { q: ranked_lists_new[q][1].replace("EPOCH", "ROUND") for q in ranked_lists_new } top_docs["62"] = {q: ranked_lists_new[q][:1] for q in ranked_lists_new} dir = "../../Crowdflower/nimo_annotations" sorted_files = sort_files_by_date(dir) tmp_doc_texts = load_file(params.trec_text_file) doc_texts = {} for doc in tmp_doc_texts: if doc.__contains__("ROUND-04") or doc.__contains__("ROUND-06"): doc_texts[doc] = tmp_doc_texts[doc] original_docs = retrieve_initial_documents() scores = {} for k in range(6): needed_file = sorted_files[k] scores = get_scores(scores, dir + "/" + needed_file, original_docs, k + 1) banned_queries = get_banned_queries(scores, reference_docs) # banned_queries = [] # rounds = ["4","6"] rounds = ["4", "6"] ranks = ["2", "5"]
def create_coherency_features(ref_index=-1, ranked_list_new_file="",
                              doc_text_modified=""):
    """Compute window-similarity (coherency) features for candidate sentences.

    For each query, pairs every candidate sentence (from the new ranked
    list, cut at the reference document's position via
    ``determine_indexes``) with every sentence position of the reference
    document, and records cosine similarities between the candidate vector,
    the reference-sentence vector, and the reference sentence's
    previous/next neighbours.  At the document edges the missing neighbour
    is substituted with the available one (first/last interior sentence),
    mirroring the original windowing scheme.

    Args:
        ref_index: rank position of the reference document in the original
            ranked list (default -1, i.e. last-ranked).
        ranked_list_new_file: path of the new TREC ranked-list file.
        doc_text_modified: optional pre-loaded document-text mapping; when
            falsy, texts are loaded from ``params.trec_text_file``.

    Returns:
        Tuple ``(rows, max_min_stats)`` where ``rows`` maps each run name
        ("<sentence>_<ref_position>") to its feature row and
        ``max_min_stats`` is the per-query min/max accumulator produced by
        ``save_max_mix_stats``.
    """
    rows = {}
    max_min_stats = {}
    model = WordToVec().load_model()
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    ranked_lists_new = retrieve_ranked_lists(ranked_list_new_file)
    reference_docs = {
        q: ranked_lists[q][ref_index].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    # Candidates: documents ranked above the reference in the new list.
    winner_docs = {
        q: ranked_lists_new[q]
        [:determine_indexes(reference_docs[q], ranked_lists_new[q])]
        for q in ranked_lists_new
    }
    # Counter of skipped examples (currently never filled; the short-document
    # guard below is disabled, as in the original).
    skipped_counts = []
    if doc_text_modified:
        a_doc_texts = doc_text_modified
    else:
        a_doc_texts = load_file(params.trec_text_file)
    # Restrict to round-4 documents only.
    doc_texts = {
        doc: text for doc, text in a_doc_texts.items() if "ROUND-04" in doc
    }
    sentence_map = map_set_of_sentences(doc_texts, winner_docs)
    for query in sentence_map:
        ref_doc = reference_docs[query]
        ref_sentences = retrieve_sentences(doc_texts[ref_doc])
        for sentence in sentence_map[query]:
            sentence_vec = get_sentence_vector(sentence_map[query][sentence],
                                               model=model)
            for i, ref_sentence in enumerate(ref_sentences):
                run_name = sentence + "_" + str(i + 1)
                # window = [previous-sentence vector, next-sentence vector];
                # edges reuse the single available neighbour for both slots.
                if i == 0:
                    neighbour = get_sentence_vector(ref_sentences[1], model)
                    window = [neighbour, neighbour]
                elif i + 1 == len(ref_sentences):
                    neighbour = get_sentence_vector(ref_sentences[i - 1],
                                                    model)
                    window = [neighbour, neighbour]
                else:
                    window = [get_sentence_vector(ref_sentences[i - 1], model),
                              get_sentence_vector(ref_sentences[i + 1], model)]
                ref_vector = get_sentence_vector(ref_sentence, model)
                # Fresh local instead of rebinding the loop variable `query`
                # (the original's rebinding silently redirected later
                # sentence_map lookups in this inner loop).
                stat_query = run_name.split("-")[2]
                row = {
                    "similarity_to_prev": cosine_similarity(sentence_vec,
                                                            window[0]),
                    "similarity_to_ref_sentence": cosine_similarity(
                        ref_vector, sentence_vec),
                    "similarity_to_pred": cosine_similarity(sentence_vec,
                                                            window[1]),
                    "similarity_to_prev_ref": cosine_similarity(ref_vector,
                                                                window[0]),
                    "similarity_to_pred_ref": cosine_similarity(ref_vector,
                                                                window[1]),
                }
                max_min_stats = save_max_mix_stats(max_min_stats, row,
                                                   stat_query)
                rows[run_name] = row
    print("missed ", sum(skipped_counts), "examples")
    return rows, max_min_stats