コード例 #1
0
def compute(topic):
    """Run every summarizer on the article for *topic* and score the results.

    The article text and its reference summary come from ``get_article``.
    Summaries are produced by TextRank, Gensim, K-means (``kmean``) and
    cosine similarity (``cosine``); each one is scored with ROUGE against
    the reference summary. The combined table is written to
    ``out/<topic>.csv``.

    Args:
        topic: Identifier passed to ``get_article``; also used as the base
            name of the CSV file.

    Returns:
        The result table serialized as a JSON string (``orient='records'``),
        one record per summarizer row.
    """
    raw, ref = get_article(topic)
    sent = tokenize(raw)

    # Size of the reference relative to the source (as measured by len());
    # handed to Gensim as its ``ratio`` argument.
    ratio = len(ref) / len(raw)

    # TextRank
    result = text_rank(raw, sent, ref)
    textrank_row = gen_serie('TextRank', Rouge().get_scores(result, ref),
                             result)

    # Gensim
    ret = summarize(raw, ratio)
    gensim_row = gen_serie('Gensim', Rouge().get_scores(ret, ref), ret)

    # KMean -- scored against the reference summary like every other method
    # (the Gensim output ``ret`` used to be passed here by mistake).
    kmean_rows = kmean(sent, ref)

    # Cosine
    cosine_row = cosine(sent, ref)

    # NOTE(review): assumes gen_serie/cosine return one row as a Series or
    # dict -- confirm. DataFrame.append no longer exists in pandas >= 2.0,
    # so the rows are assembled with a single concat instead.
    df = pd.concat(
        [
            pd.DataFrame([textrank_row, gensim_row]),
            kmean_rows,
            pd.DataFrame([cosine_row]),
        ],
        ignore_index=True,
    )

    # Rearrange columns
    df = df[columns]

    df.to_csv('out/' + topic + '.csv')

    return df.to_json(orient='records')
コード例 #2
0
def compute(topic):
    """Summarize the article for *topic* with each method and score them.

    ``get_article`` supplies the article text and its reference summary.
    TextRank, Gensim, K-means (``kmean``) and cosine similarity (``cosine``)
    each produce a summary that is scored with ROUGE against the reference.
    The combined table is saved as ``out/<topic>.csv``.

    Args:
        topic: Identifier passed to ``get_article``; also used as the base
            name of the CSV file.

    Returns:
        The result table as a JSON string (``orient='records'``), one record
        per summarizer row.
    """
    raw, ref = get_article(topic)
    sent = tokenize(raw)

    # Passed to summarize() as the proportion of the original text to keep
    # in the summary; computed from len() of the reference vs. the source.
    ratio = len(ref) / len(raw)

    # TextRank
    result = text_rank(raw, sent, ref)
    textrank_row = gen_serie('TextRank', Rouge().get_scores(result, ref),
                             result)

    # Gensim --- based on ranks of text sentences using a variation of the
    # TextRank algorithm
    ret = summarize(raw, ratio)
    gensim_row = gen_serie('Gensim', Rouge().get_scores(ret, ref), ret)

    # KMean -- must be scored against the reference summary ``ref``; the
    # Gensim output ``ret`` used to be passed here by mistake.
    kmean_rows = kmean(sent, ref)

    # Cosine
    cosine_row = cosine(sent, ref)

    # NOTE(review): assumes gen_serie/cosine return one row as a Series or
    # dict -- confirm. DataFrame.append no longer exists in pandas >= 2.0,
    # so the rows are assembled with a single concat instead.
    df = pd.concat(
        [
            pd.DataFrame([textrank_row, gensim_row]),
            kmean_rows,
            pd.DataFrame([cosine_row]),
        ],
        ignore_index=True,
    )

    # Rearrange columns
    df = df[columns]

    df.to_csv('out/' + topic + '.csv')

    return df.to_json(orient='records')
コード例 #3
0
def kmean(text, ref):
    """Score K-means summaries of *text* for cluster counts 2 through 10.

    For each cluster count ``k`` the summary comes from
    ``cluster(text, ref, k)``; its pieces are joined with spaces and scored
    with ROUGE against *ref*.

    Args:
        text: Source material handed to ``cluster``.
        ref: Reference summary used for ROUGE scoring (also passed to
            ``cluster``).

    Returns:
        A DataFrame with one row per cluster count, labelled
        ``'K-mean-<k>'``, indexed 0..8.
    """
    rows = []
    for i in range(2, 11):
        res = cluster(text, ref, i)
        rouge = Rouge().get_scores(' '.join(res), ref)
        # NOTE(review): ``res`` is handed to gen_serie un-joined, whereas the
        # other summarizers pass a single string -- confirm this is intended.
        rows.append(gen_serie('K-mean-' + str(i), rouge, res))

    # Build the frame once instead of calling DataFrame.append in the loop:
    # append was removed in pandas 2.0 and copied the frame on every pass.
    # NOTE(review): assumes gen_serie returns a Series or dict -- confirm.
    return pd.DataFrame(rows).reset_index(drop=True)
コード例 #4
0
def cosine(texts, ref):
    """Summarize by picking the entries of *texts* most similar to the first.

    Every entry of *texts* is TF-IDF vectorized and compared (cosine
    similarity) with the first entry. The highest-scoring entries are joined
    with spaces, in descending order of similarity, and scored with ROUGE
    against *ref*.

    Args:
        texts: Indexable collection of text pieces (presumably sentences --
            confirm against the caller).
        ref: Reference summary string; the number of '.'-separated pieces in
            it decides how many entries are selected.

    Returns:
        Whatever ``gen_serie`` returns for the label 'Cosine Similarity'.
    """
    vec = TfidfVectorizer(tokenizer=textblob_tokenizer,
                          stop_words='english',
                          use_idf=True)
    matrix = vec.fit_transform(texts)

    # Similarity of each entry to the first one (row 0 of the matrix).
    similarities = cosine_similarity(matrix[0:1], matrix).flatten()

    # A reference ending in '.' yields one extra (empty) piece from split().
    nb_sentences_in_base_summary = len(ref.split('.'))
    # Never ask for more entries than exist; this used to crash with
    # ValueError (max() of an empty list) for short inputs.
    nb_selected = min(nb_sentences_in_base_summary, len(similarities))

    # Rank the ORIGINAL positions by score. The previous implementation
    # deleted each picked score from the list, which shifted the remaining
    # positions so later picks indexed the wrong entry of ``texts``.
    # sorted() is stable, so ties keep the lowest position first, matching
    # the old ``list.index(max(...))`` tie-break.
    ranked = sorted(range(len(similarities)),
                    key=lambda idx: similarities[idx],
                    reverse=True)
    cos_results = [texts[idx] for idx in ranked[:nb_selected]]

    res = ' '.join(cos_results)

    r = Rouge()
    rouge = r.get_scores(res, ref)

    return gen_serie('Cosine Similarity', rouge, res)