Example #1
import numpy as np
from nltk.tokenize import sent_tokenize


def create_features(raw_ds, ranked_lists, doc_texts, top_doc_index, ref_doc_index, doc_tfidf_vectors_dir, tfidf_sentence_dir, queries, output_dir, qid):
    global word_embd_model
    feature_vals = {}
    relevant_pairs = raw_ds[qid]
    feature_list = ["FractionOfQueryWordsIn", "FractionOfQueryWordsOut",
                    "CosineToCentroidIn", "CosineToCentroidInVec",
                    "CosineToCentroidOut", "CosineToCentroidOutVec",
                    "CosineToWinnerCentroidInVec", "CosineToWinnerCentroidOutVec",
                    "CosineToWinnerCentroidIn", "CosineToWinnerCentroidOut",
                    "SimilarityToPrev", "SimilarityToRefSentence",
                    "SimilarityToPred", "SimilarityToPrevRef", "SimilarityToPredRef"]

    for feature in feature_list:
        feature_vals[feature] = {}

    # Only competition rounds 04 and 06 are processed; other epochs are skipped.
    epoch, qid_original = reverese_query(qid)
    if epoch not in ["04", "06"]:
        return
    # Centroids of past winning documents, in embedding space and in tf-idf space.
    past_winners = get_past_winners(ranked_lists, epoch, qid_original)
    past_winners_semantic_centroid_vector = past_winners_centroid(past_winners, doc_texts, word_embd_model)
    past_winners_tfidf_centroid_vector = get_past_winners_tfidf_centroid(past_winners, doc_tfidf_vectors_dir)
    # Current top-ranked documents and the reference document whose sentence is replaced.
    top_docs = ranked_lists[epoch][qid_original][:top_doc_index]
    ref_doc = ranked_lists[epoch][qid_original][ref_doc_index]
    ref_sentences = sent_tokenize(doc_texts[ref_doc])
    top_docs_tfidf_centroid = document_centroid([get_java_object(doc_tfidf_vectors_dir + doc) for doc in top_docs])
    for pair in relevant_pairs:
        # Each pair carries the sentence inserted into the reference document
        # ("in") and the sentence it replaces ("out").
        sentence_in = relevant_pairs[pair]["in"]
        sentence_out = relevant_pairs[pair]["out"]
        in_vec = get_text_centroid(clean_texts(sentence_in), word_embd_model, True)
        out_vec = get_text_centroid(clean_texts(sentence_out), word_embd_model, True)
        replace_index = int(pair.split("_")[1])
        query = queries[qid]

        # Query-term coverage of the incoming and outgoing sentences.
        feature_vals['FractionOfQueryWordsIn'][pair] = query_term_freq("avg", clean_texts(sentence_in), clean_texts(query))
        feature_vals['FractionOfQueryWordsOut'][pair] = query_term_freq("avg", clean_texts(sentence_out), clean_texts(query))
        # tf-idf similarity to the centroid of the current top documents.
        feature_vals['CosineToCentroidIn'][pair] = calculate_similarity_to_docs_centroid_tf_idf(tfidf_sentence_dir + pair.split("$")[1].split("_")[0] + "_" + pair.split("_")[2], top_docs_tfidf_centroid)
        feature_vals['CosineToCentroidOut'][pair] = calculate_similarity_to_docs_centroid_tf_idf(tfidf_sentence_dir + pair.split("$")[0] + "_" + pair.split("_")[1], top_docs_tfidf_centroid)
        # Embedding similarity to the top documents and to the past-winners centroids.
        feature_vals['CosineToCentroidInVec'][pair] = calculate_semantic_similarity_to_top_docs(sentence_in, top_docs, doc_texts, word_embd_model)
        feature_vals['CosineToCentroidOutVec'][pair] = calculate_semantic_similarity_to_top_docs(sentence_out, top_docs, doc_texts, word_embd_model)
        feature_vals['CosineToWinnerCentroidInVec'][pair] = cosine_similarity(in_vec, past_winners_semantic_centroid_vector)
        feature_vals['CosineToWinnerCentroidOutVec'][pair] = cosine_similarity(out_vec, past_winners_semantic_centroid_vector)
        feature_vals['CosineToWinnerCentroidIn'][pair] = calculate_similarity_to_docs_centroid_tf_idf(tfidf_sentence_dir + pair.split("$")[1].split("_")[0] + "_" + pair.split("_")[2], past_winners_tfidf_centroid_vector)
        feature_vals['CosineToWinnerCentroidOut'][pair] = calculate_similarity_to_docs_centroid_tf_idf(tfidf_sentence_dir + pair.split("$")[0] + "_" + pair.split("_")[1], past_winners_tfidf_centroid_vector)
        # Similarity of each sentence to its context in the reference document:
        # the preceding sentence, the replaced sentence itself, and the following one.
        feature_vals['SimilarityToPrev'][pair] = context_similarity(replace_index, ref_sentences, sentence_in, "prev", word_embd_model)
        feature_vals['SimilarityToRefSentence'][pair] = context_similarity(replace_index, ref_sentences, sentence_in, "own", word_embd_model)
        feature_vals['SimilarityToPred'][pair] = context_similarity(replace_index, ref_sentences, sentence_in, "pred", word_embd_model)
        feature_vals['SimilarityToPrevRef'][pair] = context_similarity(replace_index, ref_sentences, sentence_out, "prev", word_embd_model)
        feature_vals['SimilarityToPredRef'][pair] = context_similarity(replace_index, ref_sentences, sentence_out, "pred", word_embd_model)
    write_files(feature_list, feature_vals, output_dir, qid, ref_doc_index)
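
Several helpers used above (reverese_query, get_past_winners, query_term_freq, and others) are defined elsewhere in the project and are not shown on this page. As one illustration, cosine_similarity is only ever called with two 1-D vectors; a minimal NumPy sketch consistent with those call sites could be:

# Hypothetical sketch of the cosine_similarity helper; the project's real
# implementation may handle zero vectors differently.
def cosine_similarity(v1, v2):
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return float(np.dot(v1, v2) / denom)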
def get_semantic_docs_centroid(doc_texts, doc_names, model, stemmer=None):
    # Mean of the per-document embedding centroids; returns None for an empty list.
    sum_vector = None
    for doc in doc_names:
        text = doc_texts[doc]
        vector = get_text_centroid(clean_texts(text), model, stemmer)
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector = sum_vector + vector
    if sum_vector is None:
        return None
    return sum_vector / len(doc_names)
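
get_text_centroid itself is also not reproduced here. Judging from its call sites (a cleaned text, the embedding model, an optional stemmer flag), it plausibly averages the vectors of in-vocabulary tokens; a rough sketch under those assumptions, with the stemming branch elided:

# Hypothetical sketch of get_text_centroid for a gensim-style model;
# the real helper presumably also applies stemming when the flag is set.
def get_text_centroid(text, model, stemmer=None):
    vectors = [model.wv[t] for t in text.split() if t in model.wv]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)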
Example #3
import math

def past_winners_centroid(past_winners, texts, model, stemmer=None):
    # Exponentially decayed centroid of past winners: later (more recent)
    # winners receive larger weights. The 0.01 scale cancels after normalization.
    sum_vector = None
    decay_factors = [0.01 * math.exp(-0.01 * (len(past_winners) - i)) for i in range(len(past_winners))]
    denominator = sum(decay_factors)
    for i, doc in enumerate(past_winners):
        text = texts[doc]
        vector = get_text_centroid(clean_texts(text), model, stemmer)
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector += vector * decay_factors[i] / denominator
    return sum_vector
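
For intuition, the normalized weights depend only on each winner's recency, and with the small 0.01 exponent they are nearly uniform; a quick check with a hypothetical history of four winners:

import math

n = 4
raw = [0.01 * math.exp(-0.01 * (n - i)) for i in range(n)]
weights = [r / sum(raw) for r in raw]
print([round(w, 4) for w in weights])
# [0.2463, 0.2487, 0.2512, 0.2538] -- the most recent winner weighs slightly more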
Example #4
def context_similarity(replacement_index,
                       ref_sentences,
                       sentence_compared,
                       mode,
                       model,
                       stemmer=None):
    # Embedding similarity between sentence_compared and the context of
    # position replacement_index in the reference document:
    #   "own"  - the sentence being replaced itself,
    #   "pred" - the following sentence (falls back to the sentence itself
    #            at the end of the document),
    #   "prev" - the preceding sentence (falls back at the start).
    if mode == "own":
        ref_sentence = ref_sentences[replacement_index]
        return centroid_similarity(clean_texts(ref_sentence),
                                   clean_texts(sentence_compared), model,
                                   stemmer)
    if mode == "pred":
        if replacement_index + 1 == len(ref_sentences):
            sentence = ref_sentences[replacement_index]
        else:
            sentence = ref_sentences[replacement_index + 1]
        return centroid_similarity(clean_texts(sentence),
                                   clean_texts(sentence_compared), model,
                                   stemmer)
    if mode == "prev":
        if replacement_index == 0:
            sentence = ref_sentences[replacement_index]
        else:
            sentence = ref_sentences[replacement_index - 1]
        return centroid_similarity(clean_texts(sentence),
                                   clean_texts(sentence_compared), model,
                                   stemmer)
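
centroid_similarity is another helper not shown on this page; given its arguments, it presumably compares the embedding centroids of the two texts, which can be expressed by composing the sketches above:

# Hypothetical composition of the earlier sketches; not the project's code.
def centroid_similarity(text1, text2, model, stemmer=None):
    v1 = get_text_centroid(text1, model, stemmer)
    v2 = get_text_centroid(text2, model, stemmer)
    if v1 is None or v2 is None:
        return 0.0
    return cosine_similarity(v1, v2)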
def calculate_semantic_similarity_to_top_docs(text, top_docs, doc_texts, model, stemmer=None):
    # Cosine similarity between the text's embedding centroid and the
    # centroid of the top-ranked documents' centroids.
    summary_vector = get_text_centroid(clean_texts(text), model, stemmer)
    top_docs_centroid_vector = get_semantic_docs_centroid(doc_texts, top_docs, model, stemmer)
    return cosine_similarity(summary_vector, top_docs_centroid_vector)
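
Putting the pieces together, a call to create_features might look like the following; every path and the qid encoding below are illustrative assumptions, not values taken from the project:

# Hypothetical invocation; the directory layout and the "<epoch>_<query>" qid
# encoding are assumptions inferred from how qid and the dirs are used above.
create_features(
    raw_ds=raw_ds,                  # qid -> pair -> {"in": ..., "out": ...}
    ranked_lists=ranked_lists,      # epoch -> original qid -> ranked doc ids
    doc_texts=doc_texts,            # doc id -> raw text
    top_doc_index=3,
    ref_doc_index=4,
    doc_tfidf_vectors_dir="tfidf_docs/",
    tfidf_sentence_dir="tfidf_sents/",
    queries=queries,                # qid -> query text
    output_dir="features/",
    qid="04_101",
)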