Code Example #1
import numpy as np

# Project-local helpers (read_data, drop_noconcept_recs, summarize,
# preprocess_transcripts, Evaluation) are assumed to be imported from
# the surrounding codebase.
def eval_avgs_against_descriptions(embedding=None):
    """
    Compute ROUGE-L and cosine similarity evaluation measures where ideal
    summaries are taken as the TED Talk descriptions.

    :param embedding: fastText word embedding (not used in this function).
    :return: List containing averages of precision, recall, F-score, and cosine
        similarity over 50 documents.
    :rtype: list(float)
    """
    # Get records where one or more tag is present in transcript.
    df = read_data(75)
    df = drop_noconcept_recs(df)[:50]

    results = []
    for j in range(len(df)):
        # Target summary length is taken from the record's 'd_cnt' field;
        # .iloc[j] is positional, so it works even after the rows are filtered.
        s = summarize(df.iloc[[j]], df['d_cnt'].iloc[j])
        ideal = preprocess_transcripts(df.iloc[[j]],
                                       clean_transcript_data=False,
                                       df_field='description')
        rl = Evaluation.rouge_l(s, ideal[0][0])
        cs = Evaluation.cos_similarity(s, ideal[0][0])
        results.append([rl, cs])

    # Average evaluation scores over all dataframe records. Keep the ROUGE
    # tuples and cosine scores in separate arrays; packing them into one
    # ragged np.asarray call fails on modern NumPy.
    rouge_scores = np.asarray([rl for rl, _ in results])  # rows: (prec, recall, f-score)
    cossim_scores = np.asarray([cs for _, cs in results])
    avg_prec, avg_recall, avg_fscore = rouge_scores.mean(axis=0)
    avg_cossim = cossim_scores.mean()

    return [avg_prec, avg_recall, avg_fscore, avg_cossim]
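This example and the next both call Evaluation.rouge_l and
Evaluation.cos_similarity, which are project-local and not shown here. As a
rough guide to what they are expected to compute, here is a minimal,
self-contained sketch: ROUGE-L from a longest common subsequence over
whitespace tokens, returning the (precision, recall, f-score) tuple that the
averaging code above unpacks, and cosine similarity over mean-pooled fastText
word vectors (assuming the fasttext package's get_word_vector API). The names
and the mean-pooling choice are illustrative, not the project's actual
implementation.

import numpy as np

def rouge_l_sketch(candidate, reference):
    """ROUGE-L over whitespace tokens; returns (precision, recall, f-score)."""
    cand, ref = candidate.split(), reference.split()
    # Longest-common-subsequence length via the classic rolling 1-D DP.
    dp = [0] * (len(ref) + 1)
    for tok in cand:
        prev = 0
        for j, rtok in enumerate(ref, 1):
            cur = dp[j]
            dp[j] = prev + 1 if tok == rtok else max(dp[j], dp[j - 1])
            prev = cur
    lcs = dp[-1]
    prec = lcs / len(cand) if cand else 0.0
    rec = lcs / len(ref) if ref else 0.0
    f_score = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    return prec, rec, f_score

def cos_similarity_sketch(text_a, text_b, embedding):
    """Cosine similarity between mean-pooled fastText document vectors."""
    def doc_vec(text):
        # get_word_vector also covers out-of-vocabulary words via subwords.
        return np.mean([embedding.get_word_vector(w) for w in text.split()],
                       axis=0)
    a, b = doc_vec(text_a), doc_vec(text_b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))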
Code Example #2
def eval_against_humangenerated(method, embedding=None):
    """
    Compute ROUGE-L and cosine similarity evaluation measures for first five
    records where ideal summaries are human generated.

    :param method: LSA or TextRank summarization method.
    :param embedding: fastText word embedding (not used in this function).
    :return: List of [rouge_l, cos_similarity] pairs, one per record, where
        rouge_l is a (precision, recall, f-score) tuple.
    :rtype: list
    """
    human_summaries = [
        ("It's never happened before in software! Remember, the "
         "hard part is not deciding what features to add, it's "
         "The lesson was: simplicity sells."),
        ("This is where I realized that there was really a need to communicate, "
         "because the data of what's happening in the world and the child "
         "health of every country is very well aware."
         "Now, statisticians don't like it, because they say that this will not "
         "show the reality; we have to have statistical, analytical methods. "
         "And it's a new technology coming in, but then amazingly, how well it "
         "fits to the economy of the countries."),
        ("And the interesting thing is: if you do it for love, the money comes "
         "anyway. 'To be successful, put your nose down in something and get "
         "damn good at it.' Persistence is the number one reason for our success."
         ),
        ("So honeybees are important for their role in the economy as well as "
         "in agriculture. We need bees for the future of our cities and urban "
         "living. What can you do to save the bees or to help them or to think "
         "of sustainable cities in the future?"),
        ("So now I want to introduce you to my new hero in the global climate "
         "change war, and that is the eastern oyster. So the oyster was the "
         "basis for a manifesto-like urban design project that I did about the "
         "New York Harbor called oyster-tecture. To conclude, this is just one "
         "cross-section of one piece of city, but my dream is, my hope is, that "
         "when you all go back to your own cities that we can start to work "
         "together and collaborate on remaking and reforming a new urban "
         "landscape towards a more sustainable, a more livable and a more "
         "delicious future.")
    ]
    df = read_data(5)
    results = []
    for j in range(len(df)):
        s = method.summarize_text(df.iloc[[j]], 3)
        rl = Evaluation.rouge_l(s, human_summaries[j])
        cs = Evaluation.cos_similarity(s, human_summaries[j])
        results.append([rl, cs])
    return results
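A hypothetical driver for both functions, assuming an LSA class that exposes
summarize_text and a fastText model on disk; the model path and class name
are placeholders, not names taken from the project:

import fasttext

ft = fasttext.load_model("cc.en.300.bin")  # placeholder model path

# Averages of precision, recall, F-score, and cosine similarity over 50 talks.
print(eval_avgs_against_descriptions(embedding=ft))

# Per-record scores against the five human-written summaries.
for rl, cs in eval_against_humangenerated(LSA(), embedding=ft):
    print("ROUGE-L (P, R, F):", rl, "cosine similarity:", cs)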