import math

import numpy as np
from nltk.tokenize import sent_tokenize

# Project-level helpers (reverese_query, clean_texts, get_text_centroid, query_term_freq,
# get_past_winners, get_past_winners_tfidf_centroid, document_centroid, get_java_object,
# calculate_similarity_to_docs_centroid_tf_idf, cosine_similarity, write_files) are
# assumed to be defined elsewhere in the module.


def create_features(raw_ds, ranked_lists, doc_texts, top_doc_index, ref_doc_index, doc_tfidf_vectors_dir,
                    tfidf_sentence_dir, queries, output_dir, qid):
    global word_embd_model
    feature_vals = {}
    relevant_pairs = raw_ds[qid]
    feature_list = ["FractionOfQueryWordsIn", "FractionOfQueryWordsOut", "CosineToCentroidIn",
                    "CosineToCentroidInVec", "CosineToCentroidOut", "CosineToCentroidOutVec",
                    "CosineToWinnerCentroidInVec", "CosineToWinnerCentroidOutVec", "CosineToWinnerCentroidIn",
                    "CosineToWinnerCentroidOut", "SimilarityToPrev", "SimilarityToRefSentence",
                    "SimilarityToPred", "SimilarityToPrevRef", "SimilarityToPredRef"]
    for feature in feature_list:
        feature_vals[feature] = {}

    epoch, qid_original = reverese_query(qid)
    if epoch not in ["04", "06"]:  # features are only built for these epochs
        return

    # Centroids of past winning documents, in embedding space and in tf-idf space.
    past_winners = get_past_winners(ranked_lists, epoch, qid_original)
    past_winners_semantic_centroid_vector = past_winners_centroid(past_winners, doc_texts, word_embd_model)
    past_winners_tfidf_centroid_vector = get_past_winners_tfidf_centroid(past_winners, doc_tfidf_vectors_dir)

    # Top-ranked documents and the reference document whose sentences are candidates for replacement.
    top_docs = ranked_lists[epoch][qid_original][:top_doc_index]
    ref_doc = ranked_lists[epoch][qid_original][ref_doc_index]
    ref_sentences = sent_tokenize(doc_texts[ref_doc])
    top_docs_tfidf_centroid = document_centroid([get_java_object(doc_tfidf_vectors_dir + doc) for doc in top_docs])

    for pair in relevant_pairs:
        sentence_in = relevant_pairs[pair]["in"]    # candidate sentence to insert
        sentence_out = relevant_pairs[pair]["out"]  # sentence it would replace
        in_vec = get_text_centroid(clean_texts(sentence_in), word_embd_model, True)
        out_vec = get_text_centroid(clean_texts(sentence_out), word_embd_model, True)
        replace_index = int(pair.split("_")[1])
        query = queries[qid]

        # Query-coverage features.
        feature_vals['FractionOfQueryWordsIn'][pair] = query_term_freq("avg", clean_texts(sentence_in), clean_texts(query))
        feature_vals['FractionOfQueryWordsOut'][pair] = query_term_freq("avg", clean_texts(sentence_out), clean_texts(query))

        # Similarity to the tf-idf and embedding centroids of the top documents.
        feature_vals['CosineToCentroidIn'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
            tfidf_sentence_dir + pair.split("$")[1].split("_")[0] + "_" + pair.split("_")[2], top_docs_tfidf_centroid)
        feature_vals['CosineToCentroidOut'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
            tfidf_sentence_dir + pair.split("$")[0] + "_" + pair.split("_")[1], top_docs_tfidf_centroid)
        feature_vals["CosineToCentroidInVec"][pair] = calculate_semantic_similarity_to_top_docs(
            sentence_in, top_docs, doc_texts, word_embd_model)
        feature_vals["CosineToCentroidOutVec"][pair] = calculate_semantic_similarity_to_top_docs(
            sentence_out, top_docs, doc_texts, word_embd_model)

        # Similarity to the centroids of past winning documents.
        feature_vals['CosineToWinnerCentroidInVec'][pair] = cosine_similarity(in_vec, past_winners_semantic_centroid_vector)
        feature_vals['CosineToWinnerCentroidOutVec'][pair] = cosine_similarity(out_vec, past_winners_semantic_centroid_vector)
        feature_vals['CosineToWinnerCentroidIn'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
            tfidf_sentence_dir + pair.split("$")[1].split("_")[0] + "_" + pair.split("_")[2], past_winners_tfidf_centroid_vector)
        feature_vals['CosineToWinnerCentroidOut'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
            tfidf_sentence_dir + pair.split("$")[0] + "_" + pair.split("_")[1], past_winners_tfidf_centroid_vector)

        # Contextual coherence with the reference document's surrounding sentences.
        feature_vals['SimilarityToPrev'][pair] = context_similarity(replace_index, ref_sentences, sentence_in, "prev", word_embd_model)
        feature_vals['SimilarityToRefSentence'][pair] = context_similarity(replace_index, ref_sentences, sentence_in, "own", word_embd_model)
        feature_vals['SimilarityToPred'][pair] = context_similarity(replace_index, ref_sentences, sentence_in, "pred", word_embd_model)
        feature_vals['SimilarityToPrevRef'][pair] = context_similarity(replace_index, ref_sentences, sentence_out, "prev", word_embd_model)
        feature_vals['SimilarityToPredRef'][pair] = context_similarity(replace_index, ref_sentences, sentence_out, "pred", word_embd_model)

    write_files(feature_list, feature_vals, output_dir, qid, ref_doc_index)
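# `cosine_similarity` is a project helper used throughout this module. A minimal
# sketch of what it is assumed to compute (the standard cosine between dense
# vectors; the `_sketch` suffix marks this as illustrative, not the project's code):
def cosine_similarity_sketch(u, v):
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    norm_u, norm_v = np.linalg.norm(u), np.linalg.norm(v)
    if norm_u == 0 or norm_v == 0:
        return 0.0  # convention for degenerate inputs; the real helper may differ
    return float(np.dot(u, v) / (norm_u * norm_v))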
def get_semantic_docs_centroid(doc_texts, doc_names, model, stemmer=None):
    # Unweighted mean of the word-embedding centroids of the named documents.
    sum_vector = None
    for doc in doc_names:
        text = doc_texts[doc]
        vector = get_text_centroid(clean_texts(text), model, stemmer)
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector = sum_vector + vector
    if sum_vector is None:
        return None
    return sum_vector / len(doc_names)
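# `get_text_centroid` (used above and in create_features) is assumed to average
# the embedding vectors of a text's tokens, skipping out-of-vocabulary words and
# optionally stemming. A hypothetical sketch, assuming a gensim KeyedVectors-style
# `model` that supports `w in model` and `model[w]`:
def get_text_centroid_sketch(text, model, stemmer=None):
    words = text.split()
    if stemmer:
        from nltk.stem import PorterStemmer  # stand-in; the project may use a different stemmer
        porter = PorterStemmer()
        words = [porter.stem(w) for w in words]
    vectors = [model[w] for w in words if w in model]
    if not vectors:
        return None
    return np.mean(np.asarray(vectors, dtype=float), axis=0)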
def past_winners_centroid(past_winners, texts, model, stemmer=None):
    # Weighted centroid of past winners: weights decay exponentially with
    # distance from the most recent winner, so recent winners count more.
    sum_vector = None
    decay_factors = [0.01 * math.exp(-0.01 * (len(past_winners) - i)) for i in range(len(past_winners))]
    denominator = sum(decay_factors)
    for i, doc in enumerate(past_winners):
        text = texts[doc]
        vector = get_text_centroid(clean_texts(text), model, stemmer)
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector += vector * decay_factors[i] / denominator
    return sum_vector
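# A quick illustration of the decay weighting above (illustrative helper, not
# part of the feature pipeline): for n past winners, the normalized weights
# grow with recency, e.g. n=3 gives ~[0.330, 0.333, 0.337].
def _demo_decay_weights(n=3):
    factors = [0.01 * math.exp(-0.01 * (n - i)) for i in range(n)]
    denominator = sum(factors)
    return [f / denominator for f in factors]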
def context_similarity(replacement_index, ref_sentences, sentence_compared, mode, model, stemmer=None):
    # Similarity between a candidate sentence and its context in the reference document:
    # "own"  -> the sentence being replaced,
    # "pred" -> the following sentence (falls back to the sentence itself at the last index),
    # "prev" -> the preceding sentence (falls back to the sentence itself at index 0).
    if mode == "own":
        ref_sentence = ref_sentences[replacement_index]
        return centroid_similarity(clean_texts(ref_sentence), clean_texts(sentence_compared), model, stemmer)
    if mode == "pred":
        if replacement_index + 1 == len(ref_sentences):
            sentence = ref_sentences[replacement_index]
        else:
            sentence = ref_sentences[replacement_index + 1]
        return centroid_similarity(clean_texts(sentence), clean_texts(sentence_compared), model, stemmer)
    if mode == "prev":
        if replacement_index == 0:
            sentence = ref_sentences[replacement_index]
        else:
            sentence = ref_sentences[replacement_index - 1]
        return centroid_similarity(clean_texts(sentence), clean_texts(sentence_compared), model, stemmer)
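# `centroid_similarity` is assumed to compare two texts by the cosine of their
# word-embedding centroids. A hypothetical sketch built from the sketches above:
def centroid_similarity_sketch(text_a, text_b, model, stemmer=None):
    vec_a = get_text_centroid_sketch(text_a, model, stemmer)
    vec_b = get_text_centroid_sketch(text_b, model, stemmer)
    if vec_a is None or vec_b is None:
        return 0.0  # convention for texts with no in-vocabulary words
    return cosine_similarity_sketch(vec_a, vec_b)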
def calculate_semantic_similarity_to_top_docs(text, top_docs, doc_texts, model, stemmer=None):
    # Cosine between the text's embedding centroid and the centroid of the top documents.
    summary_vector = get_text_centroid(clean_texts(text), model, stemmer)
    top_docs_centroid_vector = get_semantic_docs_centroid(doc_texts, top_docs, model, stemmer)
    return cosine_similarity(summary_vector, top_docs_centroid_vector)
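# Toy sanity check of the sketches with a fake 2-d embedding "model" (a plain
# dict supports the `w in model` / `model[w]` protocol assumed above):
def _demo_sketches():
    fake_model = {"cats": [1.0, 0.0], "dogs": [0.8, 0.2], "tax": [0.0, 1.0]}
    a = get_text_centroid_sketch("cats dogs", fake_model)
    b = get_text_centroid_sketch("dogs", fake_model)
    return cosine_similarity_sketch(a, b)  # ~0.99: topically similar texts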