Ejemplo n.º 1
0
def key_rank(text, topk=18):
    """Return the ``topk`` top-ranked keyword entries for ``text``.

    The text is split with ``cut_sentence``, every sentence is tokenised
    with ``Tokenize``, and the resulting token lists are ranked by
    ``textrank.KeywordTextRank``.

    :param text: raw text to analyse.
    :param topk: number of entries requested from ``top_index``.
    :return: list of whatever ``top_index(topk)`` yields.
    """
    sentences = list(cut_sentence(text))
    token_lists = [list(Tokenize(sentence)) for sentence in sentences]
    ranker = textrank.KeywordTextRank(token_lists)
    # The ranking must be solved before the top entries are requested.
    ranker.solve()
    return list(ranker.top_index(topk))
Ejemplo n.º 2
0
def key_rank(text, topk=18):
    """Return the ``topk`` top-ranked keyword entries for ``text``.

    Same pipeline as the ``Tokenize``-based variant, but sentences are
    tokenised with ``tokenize2`` before being handed to
    ``textrank.KeywordTextRank``.

    :param text: raw text to analyse.
    :param topk: number of entries requested from ``top_index``.
    :return: list of whatever ``top_index(topk)`` yields.
    """
    sentences = list(cut_sentence(text))
    token_lists = [list(tokenize2(sentence)) for sentence in sentences]
    ranker = textrank.KeywordTextRank(token_lists)
    # The ranking must be solved before the top entries are requested.
    ranker.solve()
    return list(ranker.top_index(topk))
Ejemplo n.º 3
0
def summarize(txt):
    """Produce keywords, a summary and the ``key_words`` class for ``txt``.

    :param txt: raw text to analyse.
    :return: 3-tuple ``(keywords, summary, cls)`` where ``keywords`` is the
        first 18 words from ``key_words`` joined with ``", "``, ``summary``
        is the result of ``summarize4`` and ``cls`` is the second value
        returned by ``key_words``.
    """
    sentences = list(cppjiebapy.cut_sentence(txt))
    token_lists = [list(Tokenize(sentence)) for sentence in sentences]
    # Flatten the per-sentence tokens into one list for keyword extraction.
    all_tokens = [token for tokens in token_lists for token in tokens]
    top_n_words, cls = key_words(all_tokens, N_2)
    summary = summarize4(sentences, token_lists)
    keywords = ", ".join(top_n_words[:18])
    return keywords, summary, cls
Ejemplo n.º 4
0
def sum_rank(text):
    """Build a three-sentence summary of ``text``.

    Sentences are tokenised with ``tokenize2``, the ``bm25_weights`` result
    for those token lists is fed to ``textrank.TextRank``, and the three
    sentences selected by ``top_index(3)`` are joined in ascending index
    order.

    :param text: raw text to summarise.
    :return: the selected sentences joined with ``u'。 '``, stripped of
        carriage returns and newlines, followed by a closing ``u'。'``.
    """
    sentences = list(cut_sentence(text))
    token_lists = [list(tokenize2(sentence)) for sentence in sentences]
    ranker = textrank.TextRank(bm25_weights(token_lists))
    ranker.solve()
    # Sorting the chosen indices keeps the sentences in their original order.
    chosen = [sentences[position] for position in sorted(ranker.top_index(3))]
    joined = u'。 '.join(chosen)
    return joined.replace('\r', '').replace('\n', '') + u'。'
Ejemplo n.º 5
0
def sum_rank(text):
    """Build a three-sentence summary of ``text``.

    Sentences are tokenised with ``Tokenize``, the ``bm25_weights`` result
    for those token lists is fed to ``TextRank``, and the three sentences
    selected by ``top_index(3)`` are joined in ascending index order.

    :param text: raw text to summarise.
    :return: the selected sentences joined with ``u'。 '``, stripped of
        carriage returns and newlines, followed by a closing ``u'。'``.
    """
    # Fix: the original read ``obj.content``, but no ``obj`` exists in this
    # function; the ``text`` argument is the input to summarise.
    sents = list(cut_sentence(text))
    docs = [list(Tokenize(sent)) for sent in sents]
    rank = TextRank(bm25_weights(docs))
    rank.solve()
    # Sorting the chosen indices keeps the sentences in their original order.
    top_n_summary = [sents[index] for index in sorted(rank.top_index(3))]
    joined = u'。 '.join(top_n_summary)
    return joined.replace('\r', '').replace('\n', '') + u'。'
Ejemplo n.º 6
0
def summarize(txt):
    """Produce keywords, a summary and the ``key_words`` class for ``txt``.

    :param txt: raw text to analyse.
    :return: 3-tuple ``(keywords, summary, cls)`` where ``keywords`` is the
        first 18 words from ``key_words`` joined with ``', '``, ``summary``
        is the result of ``summarize4`` and ``cls`` is the second value
        returned by ``key_words``.
    """
    sentences = list(cppjiebapy.cut_sentence(txt))
    token_lists = [list(Tokenize(sentence)) for sentence in sentences]
    # One flat token list is what key_words receives.
    flat_tokens = [token for tokens in token_lists for token in tokens]
    top_n_words, cls = key_words(flat_tokens, N_2)
    summary = summarize4(sentences, token_lists)
    return ', '.join(top_n_words[:18]), summary, cls
Ejemplo n.º 7
0
def test_rank2(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    print 'test_rank2', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '') + u'。'
Ejemplo n.º 8
0
def test3():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    bm25 = BM25(docs)
    l = len(sents)
    test_dense = np.zeros((l,l))
    for i in xrange(l):
        scores = bm25.simall(docs[i])
        test_dense[i] = scores
    print 'bm25=', test_dense
Ejemplo n.º 9
0
def test3():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    bm25 = BM25(docs)
    l = len(sents)
    test_dense = np.zeros((l, l))
    for i in xrange(l):
        scores = bm25.simall(docs[i])
        test_dense[i] = scores
    print 'bm25=', test_dense
Ejemplo n.º 10
0
def text_rank4(obj):
    sents = list(cut_sentence(obj.content))
    vect = TfidfVectorizer(min_df=1,tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    tfidf_graph = tfidf*tfidf.T
    #print 's ', tfidf_graph.A
    #print 'o ', np.dot(tfidf.A,tfidf.A.T)
    nx_graph = nx.from_scipy_sparse_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i],i) for i,s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _,i in sorted(res[:3])]
    print 'text_rank4', u'。 '.join(top_n_summary).replace('\r','').replace('\n','')+u'。'
Ejemplo n.º 11
0
def test1():
    obj = HtmlContent.objects.get(pk=46)
    key, sum, cls = summarize(obj.content)
    key2 = key_rank(obj.content)

    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sum2 = summarize4(sents)

    print 'key1', key, cls
    print 'key2', ', '.join(key2)
    print 'sum1', sum
    print 'sum2', sum2
Ejemplo n.º 12
0
def text_rank4(obj):
    sents = list(cut_sentence(obj.content))
    vect = TfidfVectorizer(min_df=1, tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    tfidf_graph = tfidf * tfidf.T
    #print 's ', tfidf_graph.A
    #print 'o ', np.dot(tfidf.A,tfidf.A.T)
    nx_graph = nx.from_scipy_sparse_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i], i) for i, s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _, i in sorted(res[:3])]
    print 'text_rank4', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '') + u'。'
Ejemplo n.º 13
0
def test2():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    #test_sparse = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance, itertools.product(test_a,test_a)), dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res,(l,l))
    print 'lsi=', sim_res
Ejemplo n.º 14
0
def test1():
    obj = HtmlContent.objects.get(pk=46)
    key,sum,cls = summarize(obj.content)
    key2 = key_rank(obj.content)

    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    sum2 = summarize4(sents)

    print 'key1', key, cls
    print 'key2',', '.join(key2)
    print 'sum1',sum
    print 'sum2',sum2
Ejemplo n.º 15
0
def test_rank3(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    vect = TfidfVectorizer(min_df=1,tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    lsa = TruncatedSVD(5)
    lsa_res = lsa.fit_transform(tfidf)
    lsa_res = Normalizer(copy=False).fit_transform(lsa_res)
    tfidf_graph = np.dot(lsa_res,lsa_res.T)
    tfidf_graph = abs(Normalizer(copy=False).fit_transform(tfidf_graph))
    nx_graph = nx.from_numpy_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i],i) for i,s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _,i in sorted(res[:3])]
    print 'test_rank3', u'。 '.join(top_n_summary).replace('\r','').replace('\n','')+u'。'
Ejemplo n.º 16
0
def summarize3(txt):
    """Summarise ``txt`` by keeping sentences that score above a threshold.

    Sentences are scored by ``__score_sentences`` against the top words
    from ``key_words``; a sentence is kept when its score exceeds the mean
    score plus half a standard deviation.

    :param txt: raw text to summarise.
    :return: the kept sentences joined with ``u"。 "`` and terminated by
        ``u"。 "``.
    """
    # NOTE(review): the sentences are lower-cased here, so the returned
    # summary is lower-cased too -- confirm that is intended.
    sentences = [piece.lower() for piece in cppjiebapy.cut_sentence(txt)]
    normalized_sentences = [sentence.lower() for sentence in sentences]

    top_n_words, _ = key_words(txt, N_3)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words)
    scores = [entry[1] for entry in scored_sentences]
    mean_score = np.mean(scores)
    deviation = np.std(scores)
    threshold = mean_score + 0.5 * deviation
    kept = [sentences[sent_idx]
            for (sent_idx, score) in scored_sentences
            if score > threshold]
    return u"。 ".join(kept) + u"。 "
Ejemplo n.º 17
0
def test2():
    obj = HtmlContent.objects.get(pk=34)
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    #test_sparse = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance,
                                         itertools.product(test_a, test_a)),
                          dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res, (l, l))
    print 'lsi=', sim_res
Ejemplo n.º 18
0
def test_rank3(obj):
    sents = list(cut_sentence(obj.content))
    docs = [list(Tokenize(sent)) for sent in sents]
    vect = TfidfVectorizer(min_df=1, tokenizer=Tokenize)
    tfidf = vect.fit_transform(sents)
    lsa = TruncatedSVD(5)
    lsa_res = lsa.fit_transform(tfidf)
    lsa_res = Normalizer(copy=False).fit_transform(lsa_res)
    tfidf_graph = np.dot(lsa_res, lsa_res.T)
    tfidf_graph = abs(Normalizer(copy=False).fit_transform(tfidf_graph))
    nx_graph = nx.from_numpy_matrix(tfidf_graph)
    scores = nx.pagerank(nx_graph)
    res = sorted(((scores[i], i) for i, s in enumerate(sents)), reverse=True)
    top_n_summary = [sents[i] for _, i in sorted(res[:3])]
    print 'test_rank3', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '') + u'。'
Ejemplo n.º 19
0
def summarize3(txt):
    """Summarise ``txt`` by keeping sentences that score above a threshold.

    Sentences are scored by ``__score_sentences`` against the top words
    from ``key_words``; a sentence is kept when its score exceeds the mean
    score plus half a standard deviation.

    :param txt: raw text to summarise.
    :return: the kept sentences joined with ``u'。 '`` and terminated by
        ``u'。 '``.
    """
    # NOTE(review): the sentences are lower-cased here, so the returned
    # summary is lower-cased too -- confirm that is intended.
    sentences = [piece.lower() for piece in cppjiebapy.cut_sentence(txt)]
    normalized_sentences = [sentence.lower() for sentence in sentences]

    top_n_words, _ = key_words(txt, N_3)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words)
    scores = [entry[1] for entry in scored_sentences]
    mean_score = np.mean(scores)
    deviation = np.std(scores)
    cutoff = mean_score + 0.5 * deviation
    kept = [sentences[sent_idx]
            for (sent_idx, score) in scored_sentences
            if score > cutoff]
    return u'。 '.join(kept) + u'。 '
Ejemplo n.º 20
0
def test_rank1(obj):
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance, itertools.product(test_a,test_a)), dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res,(l,l))
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in rank.top_index(5):
        top_n_summary.append(sents[index])
    print 'test_rank1 ', u'。 '.join(top_n_summary).replace('\r','').replace('\n','')
Ejemplo n.º 21
0
def test_rank1(obj):
    sents = list(cut_sentence(obj.content))
    docs = [dictionary.doc2bow(list(Tokenize(sent))) for sent in sents]
    num_terms = 400
    test_corpus = lsi_model[tfidf_model[docs]]
    test_dense = matutils.corpus2dense(test_corpus, num_terms).T
    test_a = [a for a in test_dense]
    sim_res = np.fromiter(itertools.imap(cos_distance,
                                         itertools.product(test_a, test_a)),
                          dtype=np.float)
    l = len(sents)
    sim_res = np.reshape(sim_res, (l, l))
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in rank.top_index(5):
        top_n_summary.append(sents[index])
    print 'test_rank1 ', u'。 '.join(top_n_summary).replace('\r', '').replace(
        '\n', '')
Ejemplo n.º 22
0
def test5():
    """Store a summary on every ``HtmlContent`` whose content is non-empty.

    For each matching record the ``summarize4`` result is truncated to its
    first 400 items, assigned to the ``summerize`` attribute and saved.
    """
    non_empty = HtmlContent.objects.filter(~Q(content=''))
    for record in non_empty:
        sentences = list(cut_sentence(record.content))
        token_lists = [list(Tokenize(sentence)) for sentence in sentences]
        summary = summarize4(sentences, token_lists)
        # ``summerize`` (sic) is the attribute name used by the original code.
        record.summerize = summary[:400]
        record.save()
Ejemplo n.º 23
0
def test5():
    """Store a summary on every ``HtmlContent`` whose content is non-empty.

    For each matching record the ``summarize4`` result is truncated to its
    first 400 items, assigned to the ``summerize`` attribute and saved.
    """
    records = HtmlContent.objects.filter(~Q(content=''))
    for record in records:
        sentences = list(cut_sentence(record.content))
        token_lists = [list(Tokenize(sentence)) for sentence in sentences]
        full_summary = summarize4(sentences, token_lists)
        # ``summerize`` (sic) is the attribute name used by the original code.
        record.summerize = full_summary[:400]
        record.save()