Code example #1
def key_rank(text, topk=18):
    sents = list(cut_sentence(text))
    docs = [list(Tokenize(sent)) for sent in sents]
    keyword_rank = textrank.KeywordTextRank(docs)
    keyword_rank.solve()
    keys = [w for w in keyword_rank.top_index(topk)]
    return keys
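cut_sentence and Tokenize are helpers from the project's own cppjiebapy module and are not reproduced in this listing. Purely as an illustration of the expected behaviour, a hypothetical cut_sentence could split Chinese text on sentence-final punctuation; the version below is a sketch, not the project's implementation.

# Hypothetical sketch of a cut_sentence helper (not the project's implementation):
# split Chinese text on sentence-final punctuation and newlines, skipping blanks.
import re

_SENT_END = re.compile(u'[。！？!?\n]+')

def cut_sentence_sketch(text):
    for frag in _SENT_END.split(text):
        frag = frag.strip()
        if frag:
            yield frag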
Code example #2
File: summarize.py Project: jannson/Similar
def key_rank(text, topk=18):
    sents = list(cut_sentence(text))
    docs = [list(tokenize2(sent)) for sent in sents]
    keyword_rank = textrank.KeywordTextRank(docs)
    keyword_rank.solve()
    keys = [w for w in keyword_rank.top_index(topk)]
    return keys
Code example #3
File: summ.py Project: jannson/Similar
def summarize(txt):
    sents = list(cppjiebapy.cut_sentence(txt))
    docs = [list(Tokenize(sent)) for sent in sents]
    keys = []
    for d in docs:
        keys += d
    (top_n_words, cls) = key_words(keys, N_2)
    top_n_sum = summarize4(sents, docs)

    return ", ".join(top_n_words[:18]), top_n_sum, cls
Code example #4
File: summarize.py Project: jannson/Similar
def sum_rank(text):
    sents = list(cut_sentence(text))
    docs = [list(tokenize2(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = textrank.TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    return u'。 '.join(top_n_summary).replace('\r', '').replace('\n', '') + u'。'
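bm25_weights is likewise not shown; judging from sum_rank, it turns the tokenized sentences into a pairwise similarity matrix that TextRank then ranks. The sketch below shows one plausible way to build such a matrix from the standard Okapi BM25 formula (k1 = 1.5, b = 0.75); the function name, parameters and list-of-lists output format are assumptions, not the project's API.

# Hypothetical sketch of a bm25_weights-style helper: score every tokenized
# sentence against every other one with Okapi BM25 and return a dense matrix.
import math
from collections import Counter

def bm25_weights_sketch(docs, k1=1.5, b=0.75):
    n = len(docs)
    avgdl = sum(len(d) for d in docs) / float(n)
    df = Counter(w for d in docs for w in set(d))    # document frequency per term
    idf = {w: math.log((n - f + 0.5) / (f + 0.5) + 1.0) for w, f in df.items()}
    tfs = [Counter(d) for d in docs]                 # term frequencies per sentence
    weights = [[0.0] * n for _ in range(n)]
    for i, query in enumerate(docs):                 # sentence i used as the "query"
        for j, tf in enumerate(tfs):                 # scored against sentence j
            if i == j:
                continue                             # keep the diagonal at zero
            dl = len(docs[j])
            score = 0.0
            for w in set(query):
                f = tf.get(w, 0)
                if f:
                    score += idf[w] * f * (k1 + 1) / (f + k1 * (1 - b + b * dl / avgdl))
            weights[i][j] = score
    return weights

# Usage in the spirit of sum_rank: rank = TextRank(bm25_weights_sketch(docs)); rank.solve()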
Code example #5
def sum_rank(text):
    sents = list(cut_sentence(text))
    docs = [list(Tokenize(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    return u'。 '.join(top_n_summary).replace('\r','').replace('\n','')+u'。'
Code example #6
File: summ.py Project: jannson/Similar
def summarize(txt):
    sents = list(cppjiebapy.cut_sentence(txt))
    docs = [list(Tokenize(sent)) for sent in sents]
    keys = []
    for d in docs:
        keys += d
    (top_n_words, cls) = key_words(keys, N_2)
    top_n_sum = summarize4(sents, docs)

    return ', '.join(top_n_words[:18]), top_n_sum, cls
Code example #7
def test_rank2(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    print 'test_rank2', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '') + u'。'
Code example #8
File: summ_test.py Project: jannson/Similar
def test3():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    bm25 = BM25(docs)
    l = len(sents)
    test_dense = np.zeros((l,l))
    for i in xrange(l):
        scores = bm25.simall(docs[i])
        test_dense[i] = scores
    print 'bm25=', test_dense
Code example #9
def test3():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    bm25 = BM25(docs)
    l = len(sents)
    test_dense = np.zeros((l, l))
    for i in xrange(l):
        scores = bm25.simall(docs[i])
        test_dense[i] = scores
    print 'bm25=', test_dense
Code example #10
File: summ_test.py Project: jannson/Similar
def text_rank4(obj):
    sents = list(cut_sentence(obj.content))
    vect = TfidfVectorizer(min_df=1,tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    tfidf_graph = tfidf*tfidf.T
    #print 's ', tfidf_graph.A
    #print 'o ', np.dot(tfidf.A,tfidf.A.T)
    nx_graph = nx.from_scipy_sparse_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i],i) for i,s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _,i in sorted(res[:3])]
    print 'text_rank4', u'。 '.join(top_n_summary).replace('\r','').replace('\n','')+u'。'
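text_rank4 is Python 2 code and calls nx.from_scipy_sparse_matrix, which newer networkx releases (3.0 and later) no longer provide. A rough Python 3 re-sketch of the same idea, a TF-IDF cosine graph ranked with PageRank, might look like this; Tokenize is still assumed to be the project's tokenizer, and the name text_rank4_py3 is made up for this sketch.

# Python 3 sketch of the same TF-IDF + PageRank summarizer, assuming a recent
# scikit-learn / networkx; pass the project's Tokenize as the tokenizer argument.
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer

def text_rank4_py3(sents, tokenizer, top_n=3):
    vect = TfidfVectorizer(min_df=1, tokenizer=tokenizer)
    tfidf = vect.fit_transform(sents)
    sim = tfidf * tfidf.T                       # cosine similarities (rows are L2-normalized)
    nx_graph = nx.from_scipy_sparse_array(sim)  # was from_scipy_sparse_matrix before networkx 3.0
    scores = nx.pagerank(nx_graph)
    ranked = sorted(((scores[i], i) for i in range(len(sents))), reverse=True)
    top_idx = sorted(i for _, i in ranked[:top_n])   # restore document order
    return '。 '.join(sents[i] for i in top_idx).replace('\r', '').replace('\n', '') + '。'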
Code example #11
def test1():
    obj = HtmlContent.objects.get(pk=46)
    key, sum, cls = summarize(obj.content)
    key2 = key_rank(obj.content)

    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sum2 = summarize4(sents)

    print 'key1', key, cls
    print 'key2', ', '.join(key2)
    print 'sum1', sum
    print 'sum2', sum2
Code example #12
def text_rank4(obj):
    sents = list(cut_sentence(obj.content))
    vect = TfidfVectorizer(min_df=1, tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    tfidf_graph = tfidf * tfidf.T
    #print 's ', tfidf_graph.A
    #print 'o ', np.dot(tfidf.A,tfidf.A.T)
    nx_graph = nx.from_scipy_sparse_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i], i) for i, s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _, i in sorted(res[:3])]
    print 'text_rank4', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '') + u'。'
Code example #13
File: summ_test.py Project: jannson/Similar
def test2():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    #test_sparse = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance, itertools.product(test_a,test_a)), dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res,(l,l))
    print 'lsi=', sim_res
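cos_distance is another project helper that is not included in the listing. Given how it is fed here (one tuple per pair from itertools.product), it presumably takes a pair of dense LSI vectors and returns their cosine similarity; a hypothetical version is sketched below. Note also that np.float, used as the dtype above, was removed in NumPy 1.24, so plain float is the safer spelling on current NumPy.

# Hypothetical sketch of the cos_distance helper: it receives one
# (vec_a, vec_b) tuple per pair produced by itertools.product(test_a, test_a).
import numpy as np

def cos_distance_sketch(pair):
    a, b = pair
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)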
Code example #14
File: summ_test.py Project: jannson/Similar
def test1():
    obj = HtmlContent.objects.get(pk=46)
    key,sum,cls = summarize(obj.content)
    key2 = key_rank(obj.content)

    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sum2 = summarize4(sents)

    print 'key1', key, cls
    print 'key2',', '.join(key2)
    print 'sum1',sum
    print 'sum2',sum2
Code example #15
File: summ_test.py Project: jannson/Similar
def test_rank3(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    vect = TfidfVectorizer(min_df=1,tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    lsa = TruncatedSVD(5)
    lsa_res = lsa.fit_transform(tfidf)
    lsa_res = Normalizer(copy=False).fit_transform(lsa_res)
    tfidf_graph = np.dot(lsa_res,lsa_res.T)
    tfidf_graph = abs(Normalizer(copy=False).fit_transform(tfidf_graph))
    nx_graph = nx.from_numpy_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i],i) for i,s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _,i in sorted(res[:3])]
    print 'test_rank3', u'。 '.join(top_n_summary).replace('\r','').replace('\n','')+u'。'
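Because lsa_res is L2-normalized before the dot product, np.dot(lsa_res, lsa_res.T) in test_rank3 is already the cosine-similarity matrix of the LSA vectors; the second Normalizer pass only rescales each row of that matrix. If in doubt, the equivalence can be checked with a small sketch like this (check_cosine_equivalence is a throwaway name):

# Sanity check: after L2 normalization, the dot-product matrix equals
# scikit-learn's cosine_similarity on the same LSA vectors.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer

def check_cosine_equivalence(lsa_res):
    normed = Normalizer(copy=True).fit_transform(lsa_res)
    return np.allclose(np.dot(normed, normed.T), cosine_similarity(lsa_res))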
Code example #16
File: summ.py Project: jannson/Similar
def summarize3(txt):
    sentences = []
    for s in cppjiebapy.cut_sentence(txt):
        sentences.append(s.lower())
    normalized_sentences = [s.lower() for s in sentences]

    (top_n_words, _) = key_words(txt, N_3)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words)
    avg_list = [s[1] for s in scored_sentences]
    avg = np.mean(avg_list)
    std = np.std(avg_list)
    # avg,std = _mean_std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences if score > avg + 0.5 * std]
    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    return u"。 ".join(mean_scored_summary) + u"。 "
Code example #17
def test2():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    #test_sparse = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance,
                                         itertools.product(test_a, test_a)),
                          dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res, (l, l))
    print 'lsi=', sim_res
Code example #18
def test_rank3(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    vect = TfidfVectorizer(min_df=1, tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    lsa = TruncatedSVD(5)
    lsa_res = lsa.fit_transform(tfidf)
    lsa_res = Normalizer(copy=False).fit_transform(lsa_res)
    tfidf_graph = np.dot(lsa_res, lsa_res.T)
    tfidf_graph = abs(Normalizer(copy=False).fit_transform(tfidf_graph))
    nx_graph = nx.from_numpy_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i], i) for i, s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _, i in sorted(res[:3])]
    print 'test_rank3', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '') + u'。'
Code example #19
File: summ.py Project: jannson/Similar
def summarize3(txt):
    sentences = []
    for s in cppjiebapy.cut_sentence(txt):
        sentences.append(s.lower())
    normalized_sentences = [s.lower() for s in sentences]

    (top_n_words, _) = key_words(txt, N_3)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words)
    avg_list = [s[1] for s in scored_sentences]
    avg = np.mean(avg_list)
    std = np.std(avg_list)
    #avg,std = _mean_std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    return u'。 '.join(mean_scored_summary) + u'。 '
Code example #20
File: summ_test.py Project: jannson/Similar
def test_rank1(obj):
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance, itertools.product(test_a,test_a)), dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res,(l,l))
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in rank.top_index(5):
        top_n_summary.append(sents[index])
    print 'test_rank1 ', u'。 '.join(top_n_summary).replace('\r','').replace('\n','')
Code example #21
def test_rank1(obj):
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance,
                                         itertools.product(test_a, test_a)),
                          dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res, (l, l))
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in rank.top_index(5):
        top_n_summary.append(sents[index])
    print 'test_rank1 ', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '')
Code example #22
def test5():
    for obj in HtmlContent.objects.filter(~Q(content='')):
        sents = list(cut_sentence(obj.content))
        docs = [list(Tokenize(sent)) for sent in sents]
        obj.summerize = summarize4(sents, docs)[0:400]
        obj.save()
Code example #23
File: summ_test.py Project: jannson/Similar
def test5():
    for obj in HtmlContent.objects.filter(~Q(content='')):
        sents = list(cut_sentence(obj.content))
        docs = [list(Tokenize(sent)) for sent in sents]
        obj.summerize = summarize4(sents, docs)[0:400]
        obj.save()