Example no. 1
0
def compare_frequencies(summary_stats_file):
    stats = {"sentence": {}, "summary": {}}
    occ_stats = {"sentence": {}, "summary": {}}

    with open(summary_stats_file, encoding="utf-8") as file:
        for line in file:
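            # Each TSV line holds: query \t docname \t sentence \t summary;
            # the docname encodes the epoch as its second dash-separated field.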
            fields = line.split("\t")
            query = fields[0]
            doc = fields[1]
            epoch = int(doc.split("-")[1])
            for k in stats:
                if epoch not in stats[k]:
                    stats[k][epoch] = []
                    if k in occ_stats:
                        occ_stats[k][epoch] = []
            sentence = fields[2]
            summary = fields[3]
            sentence_qtf = query_term_freq("avg", clean_texts(sentence), query)
            stats["sentence"][epoch].append(sentence_qtf)
            occ_stats["sentence"][epoch].append(
                query_term_occ("sum", clean_texts(sentence), query))
            summary_qtf = query_term_freq("avg", clean_texts(summary), query)
            stats["summary"][epoch].append(summary_qtf)
            occ_stats["summary"][epoch].append(
                query_term_occ("sum", clean_texts(summary), query))
    for k in stats:
        for epoch in stats[k]:
            stats[k][epoch] = np.mean(stats[k][epoch])
            if k in occ_stats:
                occ_stats[k][epoch] = np.mean(occ_stats[k][epoch])
    return stats, occ_stats
Example no. 2
0
def doc_frequency_eval(lists, queries, texts):
    stats = {"top": {}, "reference": {}, "next": {}}
    for epoch in lists:
        for k in stats:
            stats[k][epoch] = []
        for query in lists[epoch]:
            query_text = queries[query]
            top_docs = lists[epoch][query][:3]
            ref_doc = lists[epoch][query][-1]
            next_doc = lists[epoch][query][-2]
            stats["top"][epoch].append(
                np.mean([
                    query_term_freq("avg", clean_texts(texts[doc]), query_text)
                    for doc in top_docs
                ]))
            stats["reference"][epoch].append(
                query_term_freq("avg", clean_texts(texts[ref_doc]),
                                query_text))
            stats["next"][epoch].append(
                query_term_freq("avg", clean_texts(texts[next_doc]),
                                query_text))
    for k in stats:
        for epoch in stats[k]:
            stats[k][epoch] = np.mean(stats[k][epoch])
    return stats
Example no. 3
def choose_highest_rank_summary(chosen_idxs, summaries, document_text,
                                replacement_index):
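    # Return the first candidate summary (in chosen_idxs order) whose substitution
    # keeps the cleaned document at 150 tokens or fewer; None if no candidate fits.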
    for idx in chosen_idxs:
        summary = summaries[idx]
        sentences = nltk.sent_tokenize(document_text)
        sentences[replacement_index] = summary
        new_text = " ".join(sentences)
        if len(clean_texts(new_text).split()) <= 150:
            return summary
    return None
Example no. 4
0
def context_similarity(replacement_index,
                       ref_sentences,
                       sentence_compared,
                       mode,
                       model,
                       stemmer=None):
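    # mode "own": compare with the sentence being replaced; "pred": with the
    # following sentence; "prev": with the preceding one (at document boundaries
    # the sentence itself is used as the fallback).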
    if mode == "own":
        ref_sentence = ref_sentences[replacement_index]
        return centroid_similarity(clean_texts(ref_sentence),
                                   clean_texts(sentence_compared), model,
                                   stemmer)
    if mode == "pred":
        if replacement_index + 1 == len(ref_sentences):
            sentence = ref_sentences[replacement_index]
        else:
            sentence = ref_sentences[replacement_index + 1]
        return centroid_similarity(clean_texts(sentence),
                                   clean_texts(sentence_compared), model,
                                   stemmer)
    if mode == "prev":
        if replacement_index == 0:
            sentence = ref_sentences[replacement_index]
        else:
            sentence = ref_sentences[replacement_index - 1]
        return centroid_similarity(clean_texts(sentence),
                                   clean_texts(sentence_compared), model,
                                   stemmer)
Example no. 5
0
def write_input_dataset_file(replacements, reference_docs, texts, suffix):
    input_dir = 'input_data/'
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
    output_path = input_dir + "sentences_for_replace_" + suffix + ".txt"
    with open(output_path, 'w', encoding="utf-8") as file:
        # One row per query: the reference document's sentence chosen for replacement.
        file.write("query\tdocname\tsentence_index\tsentence\n")
        for query in replacements:
            index = replacements[query]
            docname = reference_docs[query]
            text = texts[docname]
            sentence = clean_texts(sent_tokenize(text)[index])
            file.write("\t".join([query, docname, str(index), sentence]) + '\n')
    return output_path
Example no. 6
0
def past_winners_centroid(past_winners, texts, model, stemmer=None):
    sum_vector = None
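    # Recency-weighted centroid: winners later in the list receive exponentially
    # larger weights; the constant 0.01 factor cancels after dividing by the
    # denominator below.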
    decay_factors = [
        0.01 * math.exp(-0.01 * (len(past_winners) - i))
        for i in range(len(past_winners))
    ]
    denominator = sum(decay_factors)
    for i, doc in enumerate(past_winners):
        text = texts[doc]
        vector = get_text_centroid(clean_texts(text), model, stemmer)
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector += vector * decay_factors[i] / denominator
    return sum_vector
Example no. 7
def update_texts(doc_texts, pairs_ranked_lists, sentence_data):
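    # For each query, apply the first (highest-ranked) candidate replacement whose
    # resulting cleaned text stays within 150 tokens; every other document keeps
    # its original text.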
    new_texts = {}
    for qid in pairs_ranked_lists:
        for chosen_pair in pairs_ranked_lists[qid]:
            ref_doc = chosen_pair.split("_")[0]
            replacement_index = int(chosen_pair.split("_")[1])
            sentence_in = sentence_data[qid][chosen_pair]["in"]
            sentences = sent_tokenize(doc_texts[ref_doc])
            sentences[replacement_index] = sentence_in
            new_text = "\n".join(sentences)
            if len(clean_texts(new_text).split()) > 150:
                continue

            new_texts[ref_doc] = new_text
            break
    for doc in doc_texts:
        if doc not in new_texts:
            new_texts[doc] = doc_texts[doc]
    return new_texts
Example no. 8
def calculate_former_documents(current_epoch, qid, document_texts):
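    # Collect this query's documents from earlier rounds (0 < epoch < current_epoch),
    # skipping any document whose cleaned text duplicates one already collected.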
    former_docs = []
    seen_texts = set()
    for doc in document_texts:
        query = doc.split("-")[2]
        epoch = int(doc.split("-")[1])
        if epoch == 0:
            continue
        if qid != query:
            continue
        if epoch >= current_epoch:
            continue
        doc_text = document_texts[doc]
        cleaned_text = clean_texts(doc_text)
        cleaned_text = cleaned_text.replace(" ", "")
        if cleaned_text in seen_texts:
            continue
        former_docs.append(doc)
        seen_texts.add(cleaned_text)
    return former_docs
Example no. 9
0
def create_features(raw_ds, ranked_lists, doc_texts, top_doc_index,
                    ref_doc_index, doc_tfidf_vectors_dir, tfidf_sentence_dir,
                    queries, output_dir, qid):
    global word_embd_model
    feature_vals = {}
    relevant_pairs = raw_ds[qid]
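    # Feature names ending in "In" score the candidate sentence to be inserted;
    # the corresponding "Out" features score the reference sentence it replaces.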
    feature_list = [
        "FractionOfQueryWordsIn", "FractionOfQueryWordsOut",
        "CosineToCentroidIn", "CosineToCentroidInVec", "CosineToCentroidOut",
        "CosineToCentroidOutVec", "CosineToWinnerCentroidInVec",
        "CosineToWinnerCentroidOutVec", "CosineToWinnerCentroidIn",
        "CosineToWinnerCentroidOut", "SimilarityToPrev",
        "SimilarityToRefSentence", "SimilarityToPred", "SimilarityToPrevRef",
        "SimilarityToPredRef"
    ]

    for feature in feature_list:
        feature_vals[feature] = {}

    epoch, qid_original = reverese_query(qid)
    if epoch not in ["04", "06"]:
        return
    past_winners = get_past_winners(ranked_lists, epoch, qid_original)
    past_winners_semantic_centroid_vector = past_winners_centroid(
        past_winners, doc_texts, word_embd_model)
    past_winners_tfidf_centroid_vector = get_past_winners_tfidf_centroid(
        past_winners, doc_tfidf_vectors_dir)
    top_docs = ranked_lists[epoch][qid_original][:top_doc_index]
    ref_doc = ranked_lists[epoch][qid_original][ref_doc_index]
    ref_sentences = sent_tokenize(doc_texts[ref_doc])
    top_docs_tfidf_centroid = document_centroid(
        [get_java_object(doc_tfidf_vectors_dir + doc) for doc in top_docs])
    for pair in relevant_pairs:
        sentence_in = relevant_pairs[pair]["in"]
        sentence_out = relevant_pairs[pair]["out"]
        in_vec = get_text_centroid(clean_texts(sentence_in), word_embd_model,
                                   True)
        out_vec = get_text_centroid(clean_texts(sentence_out), word_embd_model,
                                    True)
        replace_index = int(pair.split("_")[1])
        query = queries[qid]

        feature_vals['FractionOfQueryWordsIn'][pair] = query_term_freq(
            "avg", clean_texts(sentence_in), clean_texts(query))
        feature_vals['FractionOfQueryWordsOut'][pair] = query_term_freq(
            "avg", clean_texts(sentence_out), clean_texts(query))
        feature_vals['CosineToCentroidIn'][
            pair] = calculate_similarity_to_docs_centroid_tf_idf(
                tfidf_sentence_dir + pair.split("$")[1].split("_")[0] + "_" +
                pair.split("_")[2], top_docs_tfidf_centroid)
        feature_vals['CosineToCentroidOut'][
            pair] = calculate_similarity_to_docs_centroid_tf_idf(
                tfidf_sentence_dir + pair.split("$")[0] + "_" +
                pair.split("_")[1], top_docs_tfidf_centroid)
        feature_vals["CosineToCentroidInVec"][
            pair] = calculate_semantic_similarity_to_top_docs(
                sentence_in, top_docs, doc_texts, word_embd_model)
        feature_vals["CosineToCentroidOutVec"][
            pair] = calculate_semantic_similarity_to_top_docs(
                sentence_out, top_docs, doc_texts, word_embd_model)
        feature_vals['CosineToWinnerCentroidInVec'][pair] = cosine_similarity(
            in_vec, past_winners_semantic_centroid_vector)
        feature_vals['CosineToWinnerCentroidOutVec'][pair] = cosine_similarity(
            out_vec, past_winners_semantic_centroid_vector)
        feature_vals['CosineToWinnerCentroidIn'][
            pair] = calculate_similarity_to_docs_centroid_tf_idf(
                tfidf_sentence_dir + pair.split("$")[1].split("_")[0] + "_" +
                pair.split("_")[2], past_winners_tfidf_centroid_vector)
        feature_vals['CosineToWinnerCentroidOut'][
            pair] = calculate_similarity_to_docs_centroid_tf_idf(
                tfidf_sentence_dir + pair.split("$")[0] + "_" +
                pair.split("_")[1], past_winners_tfidf_centroid_vector)
        feature_vals['SimilarityToPrev'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_in, "prev", word_embd_model)
        feature_vals['SimilarityToRefSentence'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_in, "own", word_embd_model)
        feature_vals['SimilarityToPred'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_in, "pred", word_embd_model)
        feature_vals['SimilarityToPrevRef'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_out, "prev",
            word_embd_model)
        feature_vals['SimilarityToPredRef'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_out, "pred",
            word_embd_model)
    write_files(feature_list, feature_vals, output_dir, qid, ref_doc_index)
Example no. 10
0
def cover(text, query):
    # Fraction of query terms that occur (as substrings) in the cleaned text.
    cleaned_text = clean_texts(text)
    terms = query.split()
    numerator = sum(1 for q in terms if q in cleaned_text)
    return numerator / len(terms)
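# Usage sketch (hypothetical strings; assumes clean_texts lower-cases and keeps
# the original words): cover("The Quick Brown Fox jumps.", "quick cat") == 0.5,
# since only "quick" out of the two query terms appears in the cleaned text.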
Example no. 11
0
def create_features(summaries_dir, ranked_lists, doc_texts, top_doc_index,
                    ref_doc_index, doc_tfidf_vectors_dir, tfidf_sentence_dir,
                    summary_tfidf_dir, queries, output_dir, qid):
    global word_embd_model
    feature_vals = {}
    feature_list = [
        "FractionOfQueryWordsIn", "FractionOfQueryWordsOut",
        "CosineToCentroidIn", "CosineToCentroidInVec", "CosineToCentroidOut",
        "CosineToCentroidOutVec", "CosineToWinnerCentroidInVec",
        "CosineToWinnerCentroidOutVec", "CosineToWinnerCentroidIn",
        "CosineToWinnerCentroidOut", "SimilarityToPrev",
        "SimilarityToRefSentence", "SimilarityToPred", "SimilarityToPrevRef",
        "SimilarityToPredRef"
    ]

    for feature in feature_list:
        feature_vals[feature] = {}

    epoch, qid_original = reverese_query(qid)
    query = queries[qid]
    past_winners = get_past_winners(ranked_lists, epoch, qid_original)
    past_winners_semantic_centroid_vector = past_winners_centroid(
        past_winners, doc_texts, word_embd_model)
    past_winners_tfidf_centroid_vector = get_past_winners_tfidf_centroid(
        past_winners, doc_tfidf_vectors_dir)
    top_docs = ranked_lists[epoch][qid_original][:top_doc_index]
    ref_doc = ranked_lists[epoch][qid_original][ref_doc_index]
    ref_sentences = sent_tokenize(doc_texts[ref_doc])
    top_docs_tfidf_centroid = document_centroid(
        [get_java_object(doc_tfidf_vectors_dir + doc) for doc in top_docs])
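    # Pair every sentence of the reference document (candidate to be replaced)
    # with every line of the query's summaries file.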
    with open(summaries_dir + "_".join(query.split())) as summaries_file:
        # Materialize the summaries once: iterating the file object directly would
        # leave it exhausted after the first pass over ref_sentences.
        summaries = summaries_file.readlines()
        for i, sentence_out in enumerate(ref_sentences):
            for j, sentence_in in enumerate(summaries):
                pair = ref_doc + "_" + str(i) + "_" + str(j)
                replace_index = i
                in_vec = get_text_centroid(clean_texts(sentence_in),
                                           word_embd_model, True)
                out_vec = get_text_centroid(clean_texts(sentence_out),
                                            word_embd_model, True)
                feature_vals['FractionOfQueryWordsIn'][pair] = query_term_freq(
                    "avg", clean_texts(sentence_in), clean_texts(query))
                feature_vals['FractionOfQueryWordsOut'][
                    pair] = query_term_freq("avg", clean_texts(sentence_out),
                                            clean_texts(query))

                feature_vals['CosineToCentroidIn'][
                    pair] = calculate_similarity_to_docs_centroid_tf_idf(
                        summary_tfidf_dir + pair, top_docs_tfidf_centroid)
                feature_vals['CosineToCentroidOut'][
                    pair] = calculate_similarity_to_docs_centroid_tf_idf(
                        tfidf_sentence_dir + pair.split("_")[0] + "_" +
                        pair.split("_")[1], top_docs_tfidf_centroid)

                feature_vals["CosineToCentroidInVec"][
                    pair] = calculate_semantic_similarity_to_top_docs(
                        sentence_in, top_docs, doc_texts, word_embd_model,
                        True)
                feature_vals["CosineToCentroidOutVec"][
                    pair] = calculate_semantic_similarity_to_top_docs(
                        sentence_out, top_docs, doc_texts, word_embd_model,
                        True)

                feature_vals['CosineToWinnerCentroidInVec'][
                    pair] = cosine_similarity(
                        in_vec, past_winners_semantic_centroid_vector)
                feature_vals['CosineToWinnerCentroidOutVec'][
                    pair] = cosine_similarity(
                        out_vec, past_winners_semantic_centroid_vector)
                feature_vals['CosineToWinnerCentroidIn'][
                    pair] = calculate_similarity_to_docs_centroid_tf_idf(
                        summary_tfidf_dir + pair,
                        past_winners_tfidf_centroid_vector)
                feature_vals['CosineToWinnerCentroidOut'][
                    pair] = calculate_similarity_to_docs_centroid_tf_idf(
                        tfidf_sentence_dir + pair.split("_")[0] + "_" +
                        pair.split("_")[1], past_winners_tfidf_centroid_vector)

                feature_vals['SimilarityToPrev'][pair] = context_similarity(
                    replace_index, ref_sentences, sentence_in, "prev",
                    word_embd_model, True)
                feature_vals['SimilarityToRefSentence'][
                    pair] = context_similarity(replace_index, ref_sentences,
                                               sentence_in, "own",
                                               word_embd_model, True)
                feature_vals['SimilarityToPred'][pair] = context_similarity(
                    replace_index, ref_sentences, sentence_in, "pred",
                    word_embd_model, True)
                feature_vals['SimilarityToPrevRef'][pair] = context_similarity(
                    replace_index, ref_sentences, sentence_out, "prev",
                    word_embd_model, True)
                feature_vals['SimilarityToPredRef'][pair] = context_similarity(
                    replace_index, ref_sentences, sentence_out, "pred",
                    word_embd_model, True)

    write_files(feature_list, feature_vals, output_dir, qid)
Example no. 12
0
from summarization.seo_experiment.borda_mechanism import query_term_freq, query_term_occ
from summarization.seo_experiment.utils import clean_texts
from summarization.seo_experiment.workingset_creator import read_queries_file
from summarization.seo_experiment.summarization_process import transform_query_text
import nltk
import numpy as np

queries = read_queries_file("../data/queries.xml")
queries = transform_query_text(queries)

with open("top_docs_summaries.txt") as summary_access, \
        open("summarization_data.txt") as summary_data_access:
    summaries = summary_access.readlines()
    data_points = summary_data_access.readlines()

freqs = {"all": [], "first": []}
for i, summary in enumerate(summaries):
    data = data_points[i]
    qid = data.split("\t")[1]
    q_text = queries[qid]
    fixed_sum = summary.replace("<t>", "").replace("</t>", "").replace(
        ", .", ".").replace(". .", ".")
    freqs["all"].append(query_term_occ("sum", clean_texts(fixed_sum), q_text))
    first = nltk.sent_tokenize(fixed_sum)[0]
    freqs["first"].append(query_term_occ("sum", clean_texts(first), q_text))

for k in freqs:
    freqs[k] = np.mean(freqs[k])

print(freqs)