Example #1
def load_docsets(duc_dir):
    docset_paths = [os.path.join(duc_dir, fname) for fname in os.listdir(duc_dir)]
    docset_paths = [path for path in docset_paths if os.path.isdir(path)]
    docsets = {}
    for docset_path in docset_paths:
        print("\n" + docset_path)
        text = load_docset(docset_path)
        # Gather every sentence except the first (typically the headline)
        # from each document in the docset.
        doc_sentences = []
        for dom in text:
            for sentence in dom.sentences[1:]:
                doc_sentences.append(str(sentence))
        joined = ' '.join(doc_sentences)
        # Strip quotation marks and SLUG markup left over from the DUC source files.
        for artifact in ("''", "``", "<SLUG>", "</SLUG>"):
            joined = joined.replace(artifact, '')
        parsed = PlaintextParser.from_string(joined, Tokenizer(LANGUAGE))
        summary = summarizer(parsed.document, SENTENCES_COUNT)
        # Ground-truth summaries are named like D0601.M.250.A.<judge letter>.
        folder_name = os.path.basename(docset_path)
        base_name = folder_name[:-1] + '.M.250.' + folder_name[-1]
        paths = [base_name + suffix for suffix in
                 ('.A', '.B', '.C', '.D', '.E', '.F', '.G', '.H', '.I', '.J')]
        for path in paths:
            try:
                ground_truth = PlaintextParser.from_file(GtPath + path, Tokenizer(LANGUAGE))
                res.append(rouge_1(summary, ground_truth.document.sentences))
                print(res[-1])
            except Exception:
                # Not every judge letter exists for every docset; skip missing files.
                pass
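The function relies on several module-level names the snippet never defines (load_docset, summarizer, res, GtPath, LANGUAGE, SENTENCES_COUNT). A minimal sketch of the setup it appears to assume; the summarizer choice and the ground-truth path are placeholders:

import os

from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer  # assumed: the snippet never names its summarizer
from sumy.utils import get_stop_words
from sumy.evaluation.rouge import rouge_1

LANGUAGE = "english"
SENTENCES_COUNT = 10
GtPath = "/path/to/DUC/model/summaries/"  # placeholder: directory holding the ground-truth files
res = []                                  # accumulates one ROUGE-1 score per (docset, judge) pair

summarizer = LsaSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)

# load_docset(path) is also assumed: it should parse each document in a DUC
# docset directory and return objects exposing a `sentences` attribute.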
Example #2
from sumy.evaluation.rouge import rouge_1, rouge_2, rouge_n


def calc_value(eval_sentences, ref_sentences):
    # ROUGE-1, ROUGE-2 and ROUGE-3 between the evaluated and reference sentences.
    n_1 = rouge_1(eval_sentences, ref_sentences)
    n_2 = rouge_2(eval_sentences, ref_sentences)
    n_3 = rouge_n(eval_sentences, ref_sentences, 3)
    avg = (n_1 + n_2 + n_3) / 3

    print('n1 ' + str(n_1) + '\n')
    print('n2 ' + str(n_2) + '\n')
    print('n3 ' + str(n_3) + '\n')
    print('avg ' + str(avg))

    return avg
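A quick way to exercise calc_value, assuming sumy's plaintext parser is available; the two file names below are placeholders:

from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser

# Placeholder file names: any two plain-text files to compare.
candidate = PlaintextParser.from_file('candidate.txt', Tokenizer('english'))
reference = PlaintextParser.from_file('reference.txt', Tokenizer('english'))

score = calc_value(candidate.document.sentences,
                   reference.document.sentences)
print('averaged ROUGE-1/2/3:', score)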
Example #3
from sumy.models.dom import Sentence
from sumy.nlp.tokenizers import Tokenizer
from sumy.evaluation.rouge import rouge_1


def calculate_rouge_1_score(summary):
    ideal_summaries = [
        'Jason saw a nice weather forecast and went to the beach with his kids for 2 hours.',
        'Jason took the kids swimming at the beach on a sunny day.',
        'Jason decided to take the kids to the beach since it was a sunny day.',
    ]
    tokenizer = Tokenizer('english')
    summary_sentence = Sentence(summary, tokenizer)
    # Score the candidate summary against each ideal summary in turn.
    scores = [rouge_1([summary_sentence], [Sentence(ideal, tokenizer)])
              for ideal in ideal_summaries]
    # Append the mean of the individual scores as the final element.
    scores.append(sum(scores) / len(ideal_summaries))
    return scores
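Called on a candidate sentence, the function returns the three per-reference ROUGE-1 scores followed by their mean; a hypothetical call:

scores = calculate_rouge_1_score('Jason took his kids to the beach because the weather was nice.')
for score in scores[:-1]:
    print('ROUGE-1 vs. one ideal summary:', score)
print('mean ROUGE-1:', scores[-1])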
Example #4
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer  # assumed: the original import is not shown
from sumy.utils import get_stop_words
from sumy.evaluation.rouge import rouge_1

LANGUAGE = "english"
SENTENCES_COUNT = 10

if __name__ == "__main__":
    #url = "https://en.wikipedia.org/wiki/Pinterest"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # or for plain text files
    # doc = 'document.txt'
    doc = '/Users/Apple/Documents/git-tamu/irProject/DUC_data/DUC2006/duc2006_docs/D0601A/APW19990707.0181'
    parser = PlaintextParser.from_file(doc, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    res = summarizer(parser.document, SENTENCES_COUNT)
    # print(res)

    doc1 = "/Users/Apple/Documents/git-tamu/irProject/DUC_data/DUC2006/NISTeval/ROUGE/peers/D0601.M.250.A.E"
    parser_ground = PlaintextParser.from_file(doc1, Tokenizer(LANGUAGE))
    # print(parser_ground.document.sentences)

    #for sentence in summarizer(parser.document, SENTENCES_COUNT):
    #    print(sentence)

    print(rouge_1(res, parser_ground.document.sentences))
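For orientation, sumy's rouge_n family measures n-gram recall: roughly, the fraction of the reference's distinct n-grams that also appear in the evaluated summary. A toy illustration of the idea for unigrams, on plain strings rather than sumy Sentence objects:

def rouge_1_sketch(evaluated_text, reference_text):
    # Illustration only, not sumy's implementation: distinct-unigram
    # recall of the evaluated text against the reference.
    evaluated = set(evaluated_text.lower().split())
    reference = set(reference_text.lower().split())
    return len(evaluated & reference) / len(reference)

print(rouge_1_sketch('the cat sat on the mat', 'the cat lay on the mat'))  # 0.8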
Example #5
import json

import numpy as np
import pandas as pd
import progressbar
from yaml import safe_load
from nltk.tokenize.punkt import PunktSentenceTokenizer

from sumy.models.dom import Sentence
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.random import RandomSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.utils import get_stop_words
from sumy.evaluation.rouge import (rouge_1, rouge_2,
                                   rouge_l_sentence_level,
                                   rouge_l_summary_level)

# load_clean_data is project-specific and assumed to be defined elsewhere.


def main():
    print("\n\t\t SUMMARIZATION REVIEW\t\t\n")
    print('[INFO] Loading configuration')
    with open("./config.yml", 'r') as file:
        config_var = safe_load(file)["main"]

    data = load_clean_data(path_to_file=str(config_var['dataset_folder']) + "/" +
                           str(config_var['data_to_use']))

    print("[INFO] Training sentence tokenizer for summary on all articles.")
    punkt_tokenizer = PunktSentenceTokenizer(
        train_text="\n".join(data["Paragraphs_as_string"]))
    len_sum = np.mean(data["Summary"].apply(lambda x: len(punkt_tokenizer.tokenize(x))))
    print("[INFO] Average number of sentences in article summaries:", len_sum)
    print("[COMMENT] Using this value as the target length for the automatic summaries.")
    len_sum = int(len_sum)
    print("[INFO] Using " + str(config_var['language']) + " stemmer")
    stemmer = Stemmer(config_var['language'])
    print("[INFO] Preparing summarizers")
    summarizer_dict = {"LSA":LsaSummarizer(stemmer),
                        "Luhn": LuhnSummarizer(stemmer),
                        "LexRank":LexRankSummarizer(stemmer),
                        "SumBasics":SumBasicSummarizer(stemmer),
                        "Random": RandomSummarizer(stemmer),
                        "Reduction": ReductionSummarizer(stemmer)}

    print("[INFO] Preparing stopwords.")
    for summarizer in summarizer_dict.values():
        summarizer.stop_words = get_stop_words('english')

    print("[INFO] Summaries preparation")
    dict_res = {}
    dict_summs = {}
    for name, summarizer in summarizer_dict.items():
        print("[INFO] Method:", name)
        results_rouge_1 = []
        results_rouge_2 = []
        results_rouge_l_1 = []
        results_rouge_l_2 = []
        sums = {}
        for i in progressbar.progressbar(range(len(data))):
            (article, summary) = (data["Paragraphs_as_string"][i],
                                  data["Summary"][i])

            parser = PlaintextParser.from_string(
                article, tokenizer=Tokenizer('english'))

            # Run the summarizer once, keeping both the Sentence objects
            # (for sumy's evaluation functions) and their string forms.
            summaries = list(summarizer(parser.document, len_sum))
            summaries_str = [str(sentence) for sentence in summaries]
            # Save the current summary keyed by document index, since some
            # documents are skipped and would misalign a plain list.
            sums[i] = " ".join(summaries_str)

            # sumy's evaluation functions expect Sentence objects, so wrap
            # each reference sentence accordingly.
            reference_sentences = [
                Sentence(sent, tokenizer=Tokenizer("english"))
                for sent in punkt_tokenizer.tokenize(summary)
            ]
            # Very short summaries can make the ROUGE computations fail;
            # record NaN for any metric that cannot be computed.
            try:
                results_rouge_1.append(
                    rouge_1(evaluated_sentences=summaries,
                            reference_sentences=reference_sentences))
            except Exception:
                results_rouge_1.append(np.nan)
            try:
                results_rouge_2.append(
                    rouge_2(evaluated_sentences=summaries,
                            reference_sentences=reference_sentences))
            except Exception:
                results_rouge_2.append(np.nan)
            try:
                results_rouge_l_1.append(
                    rouge_l_sentence_level(evaluated_sentences=summaries,
                                           reference_sentences=reference_sentences))
            except Exception:
                results_rouge_l_1.append(np.nan)
            try:
                results_rouge_l_2.append(
                    rouge_l_summary_level(evaluated_sentences=summaries,
                                          reference_sentences=reference_sentences))
            except Exception:
                results_rouge_l_2.append(np.nan)
        # Save results and move on to the next summarizer.
        dict_res[name] = {
            "Rouge_1": results_rouge_1,
            "Rouge_2": results_rouge_2,
            "Rouge_L_sentence_level": results_rouge_l_1,
            "Rouge_L_summary_level": results_rouge_l_2
        }
        # Save summaries to dictionary
        dict_summs[name] = sums
    print("[INFO] Summaries and evaluations completed.")
    print("[INFO] Saving data to output.")
    # DataFrame for the mean of each metric per summarizer
    res_mean = pd.DataFrame(columns=dict_res.keys())
    # DataFrame for the standard error of each metric per summarizer
    res_se = pd.DataFrame(columns=dict_res.keys())
    for col in res_mean:
        res_mean[col] = pd.Series(
            {key: np.nanmean(value)
             for key, value in dict_res[col].items()})
        res_se[col] = pd.Series(
            {key: np.nanstd(value)/np.sqrt(len(value))
             for key, value in dict_res[col].items()})

    print("[INFO] Saving evaluation averages.")
    with open(config_var['output_folder']+"/avgs.csv", 'w') as file:
        res_mean.to_csv(file)
    print("[INFO] Saving evaluations standard errors.")
    with open(config_var['output_folder']+"/ses.csv", 'w') as file:
        res_se.to_csv(file)
    print("[INFO] Saving to json all produced summaries.")
    with open(config_var['output_folder']+"/summaries.json", 'w') as file:
        json.dump(dict_summs, file)
    print("[INFO] Program completed successfully.")