Example #1
    def click_summary(self):
        sents = pdf_to_text(self.file_name)

        textrank = TextRank(language=self.language,
                            tokenizer=None,
                            stopwords=STOPWORDS)
        keysents = textrank.summarize(sents, topk=5)

        self.ui.textBrowser.setText("\n".join(keysents))
Example #2
def summarize4(sents, docs=None):
    if not docs:
        docs = [list(Tokenize(sent)) for sent in sents]
    sim_res = bm25_weights(docs)
    rank = TextRank(sim_res)
    rank.solve()
    top_n_summary = []
    for index in sorted(rank.top_index(3)):
        top_n_summary.append(sents[index])
    return u'。 '.join(top_n_summary).replace('\r', '').replace('\n', '') + u'。'
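The `TextRank(sim_res)` / `solve()` / `top_index(3)` calls above come from the helper library used by this snippet; as a rough illustration of what they compute (not that library's actual implementation), the sketch below runs weighted PageRank over a pairwise sentence-similarity matrix and returns the indices of the top-ranked sentences in document order. The damping factor and iteration count are conventional TextRank defaults, assumed here.

# Illustrative sketch only, not the TextRank class used above.
def pagerank_scores(sim, d=0.85, iters=30):
    """Weighted PageRank over a symmetric sentence-similarity matrix."""
    n = len(sim)
    scores = [1.0] * n
    for _ in range(iters):
        updated = []
        for i in range(n):
            incoming = 0.0
            for j in range(n):
                if j == i or not sim[j][i]:
                    continue
                out_weight = sum(sim[j][k] for k in range(n) if k != j)
                if out_weight:
                    incoming += sim[j][i] / out_weight * scores[j]
            updated.append((1 - d) + d * incoming)
        scores = updated
    return scores

def top_indices(scores, k=3):
    best = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return sorted(best)  # restore document order, as summarize4 does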
Example #4
    def get_summary(self, texts, n=3):
        texts = self.get_sentences(texts)
        doc_sents = [jieba.lcut(i) for i in texts]
        rank = TextRank(doc_sents)
        rank.text_rank()
        results = []
        for j in range(len(texts)):
            if j in rank.top_index(n):
                results.append(texts[j])
        summary = "。".join(results) + "。"
        return summary
Example #5
    def summary(self, doc, title=None, use_textrank_keysent=False):
        """输出文本摘要和关键词。"""
        # 处理输入
        sent_para = split_to_sentence(doc)
        self.sent_list = [
            sent.strip() for sent in chain.from_iterable(sent_para)
        ]
        sent_num = len(self.sent_list)

        with_title = False
        if title:
            self.sent_list.append(title)
            with_title = True

        pos_sent_weight = get_position_weight(sent_para)
        del sent_para

        # Compute embeddings
        sent_vecs, title_vec, doc_vec, total_tokens = self.__cal_sentences_vec_mat(
            with_title)

        # The topics LDA abstracts from the document are really just another way of modeling its semantics, so they are not folded into the sentence-embedding computation
        topic_dist, topic_words_dist = self.get_topic_distribution(
            total_tokens)
        topics_vec = self.__cal_topic_embedding(topic_dist, topic_words_dist)

        # keyword
        textrank = TextRank()
        self.keywords = textrank.get_keywords(doc)

        # Compute scores
        scores = self.__cal_score(sent_vecs, doc_vec, topics_vec, title_vec,
                                  pos_sent_weight, sent_num)
        score_smooth = self.__score_smooth(scores, sent_num)

        # Sort by score
        sorted_idx = np.argsort(score_smooth)[-sent_num // 3:]
        sent_ids = sorted(sorted_idx)

        if self.debug:
            print('key words: ', self.keywords)
            print('position weight: ', pos_sent_weight)
            print('score:', scores)
            print('score smooth:', score_smooth)
            for i in sent_ids:
                print(self.sent_list[i])

        if use_textrank_keysent:
            keysentence = textrank.get_keysentences(doc)
            print('textrank keysentence: ', keysentence)

        return ''.join([self.sent_list[i] for i in sent_ids]), \
               ';'.join([w for w, _ in self.keywords])
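The selection step above keeps roughly the top third of sentences by smoothed score and then restores document order. A tiny illustration with made-up scores:

# Made-up scores, only to show how the argsort tail plus sorted() behaves.
import numpy as np

score_smooth = np.array([0.1, 0.9, 0.4, 0.8, 0.2, 0.7])
sent_num = len(score_smooth)
sorted_idx = np.argsort(score_smooth)[-sent_num // 3:]  # indices of the highest scores
print(sorted(sorted_idx.tolist()))  # -> [1, 3]: the top third, back in document order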
Example #6
def sentence(mongo, redis, tagger, data, bulk_op):
    start_time = time.time()
    logging.debug("sentence process start time : %f" % (start_time))

    singlewords = get_singlewords()
    coef = load_config()['coef']
    title_word_addition_multiplier = load_config()['title_word_addition_multiplier']
    minimum_low_freq = load_config()['minimum_low_freq']
    low_freq_word_subtraction_multiplier = load_config()[
        'low_freq_word_subtraction_multiplier']
    nnp_addition_multiplier = load_config()['nnp_addition_multiplier']
    # get keywords, sentences using textrank algorithm
    for idx, (URI, title, content, root_domain, wordcount) in enumerate(data):
        # get stopwords from redis
        stopwords = get_stopwords(redis, root_domain)
        tr = TextRank(
            tagger=tagger,
            window=5,
            content=content,
            stopwords=stopwords,
            singlewords=singlewords,
            title=title,
            coef=coef,
            title_word_addition_multiplier=title_word_addition_multiplier,
            minimum_low_freq=minimum_low_freq,
            low_freq_word_subtraction_multiplier=
            low_freq_word_subtraction_multiplier,
            nnp_addition_multiplier=nnp_addition_multiplier)

        # build sentence graph
        tr.sentence_rank()

        # adjust the summarization rate according to the word count
        summarize_rate = 0.3
        if wordcount < 500:
            summarize_rate = 0.3
        elif wordcount <= 1000:
            summarize_rate = 0.3
        elif wordcount <= 2000:
            summarize_rate = 0.2
        elif wordcount <= 3000:
            summarize_rate = 0.1

        # get sentence
        sentences = tr.sentences(summarize_rate)
        sys.stdout.write("\rsentence extracted: %d / %d" % (idx, len(data)))
        mongo.bulk_insert_sentences(bulk_op, URI, sentences, summarize_rate)
    end_time = time.time()
    logging.debug("sentence process end time : %f" % (end_time))
    logging.debug("total execute time : %f" % (end_time - start_time))
Example #7
    def get_summaries(self, limit=5):
        doc = []
        sentences = self.get_sentences()
        for sentence in sentences:
            words = list(self.seg.seg(sentence))
            words = self.filter_stop(words)
            doc.append(words)

        self.textrank = TextRank(doc)
        self.textrank.solve()
        result = []
        for index in self.textrank.top_index(limit):
            result.append(sentences[index])
        return result
Example #8
    def __init__(self):
        self.textranker = TextRank()
        self.ners = ['PERSON', 'ORG', 'GPE']
        self.ner_dict = {
            'PERSON': 'Person',  # People, including fictional
            'ORG': 'Organization',  # Companies, agencies, institutions, etc.
            'GPE': 'Location',  # Countries, cities, states.
        }
        # dependency markers for subjects
        self.SUBJECTS = {
            "nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"
        }
        # dependency markers for objects
        self.OBJECTS = {"dobj", "dative", "attr", "oprd"}

        self.graph_shower = GraphShow()
Example #9
    def 처지(self):
        tr = TextRank()
        from konlpy.tag import Komoran
        tagger = Komoran()
        stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV')])
        tr.loadSents(
            RawSentenceReader('x.txt'), lambda sent: filter(
                lambda x: x not in stopword and x[1] in
                ('NNG', 'NNP', 'VV', 'VA'), tagger.pos(sent)))
        tr.build()
        ranks = tr.rank()
        if tr.summarize(0.4) is None:
            return "모름"
        else:
            return tr.summarize(0.4)
Example #10
    def summary(self):
        if self.parent.file_path != '':
            out = None

            if str(self.parent.comboBox.currentText()) == 'File':
                self.parent.text = open(self.parent.file_path, 'r').read()
            elif str(self.parent.comboBox.currentText()) == 'URL':
                if str(self.parent.internet_status.text()) == 'OFFLINE':
                    return
                self.parent.text = get_text(self.parent.file_path)

            if self.parent.set_algorithm == 'FS':
                out = fs(self.parent.text, self.parent.set_language,
                         int(self.parent.set_count))
            elif self.parent.set_algorithm == 'TextRank':
                tr = TextRank(self.parent.text, int(self.parent.set_count),
                              self.parent.set_language, self.parent.set_metric,
                              self.parent.set_graph)
                out = tr.summarize()

            self.parent.out = out
Example #11
def keyword(mongo, redis, tagger, data, bulk_op):
    start_time = time.time()
    logging.debug("keyword extraction start time : %f" % (start_time))

    singlewords = get_singlewords()
    coef = load_config()['coef']
    title_word_addition_multiplier = load_config()['title_word_addition_multiplier']
    minimum_low_freq = load_config()['minimum_low_freq']
    nnp_addition_multiplier = load_config()['nnp_addition_multiplier']
    low_freq_word_subtraction_multiplier = load_config()[
        'low_freq_word_subtraction_multiplier']

    for idx, (URI, title, content, root_domain, wordcount) in enumerate(data):
        # get stopwords from redis
        stopwords = get_stopwords(redis, root_domain)
        tr = TextRank(
            tagger=tagger,
            window=5,
            content=content,
            stopwords=stopwords,
            singlewords=singlewords,
            title=title,
            coef=coef,
            title_word_addition_multiplier=title_word_addition_multiplier,
            minimum_low_freq=minimum_low_freq,
            low_freq_word_subtraction_multiplier=
            low_freq_word_subtraction_multiplier)

        # build keyword graph
        tr.keyword_rank()

        # get keywords; the number of keywords is capped at 15
        keywords = tr.keywords(num=15)
        sys.stdout.write("\rkeyword extracted: %d / %d" % (idx, len(data)))
        mongo.bulk_insert_keywords(bulk_op, URI, keywords)

    end_time = time.time()
    logging.debug("keyword extraction end time : %f" % (end_time))
    logging.debug("total execution time : %f" % (end_time - start_time))
Example #12
def analyze():
    if (request.method == 'POST'):
        payload = request.get_json()
        input_text = payload["text"]
        t = TextRank(input_text)
        t.analyze(50)
        t.generate_cloud().to_file("temp.png")
        return send_file("temp.png", mimetype='image/png')
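A hedged client-side call for this handler (it is registered at /analyze with POST in the full app shown in Example #27; host and port here are assumptions). The endpoint expects a JSON body with a "text" field and responds with the rendered word-cloud PNG:

# Hypothetical client; adjust host/port to wherever the Flask app is running.
import requests

resp = requests.post("http://localhost:5000/analyze",
                     json={"text": "Some article text to turn into a word cloud."})
with open("cloud.png", "wb") as f:
    f.write(resp.content)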
Example #13
    def 심정(self):
        tr = TextRank(window=5, coef=1)
        stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV'), ('없', 'VV')])
        tr.load(
            RawTaggerReader('x.txt'),
            lambda w: w not in stopword and (w[1] in ('NNG', 'NNP', 'VV', 'VA')))
        tr.build()
        kw = tr.extract(0.4)
        if kw is None:
            return "모름"
        else:
            return kw
Example #14
    def click_summary(self):
        self.language = str(self.ui.comboBox.currentText())
        self.top_k_word = int(self.ui.comboBox_2.currentText())
        self.top_k_sent = int(self.ui.comboBox_3.currentText())

        sents = pdf_to_text(self.file_name)

        # print(self.language, self.top_k_word, self.top_k_sent)
        if self.language == "ko":
            textrank = TextRank(language=self.language,
                                tokenizer="mecab",
                                stopwords=STOPWORDS)
        else:
            textrank = TextRank(language=self.language,
                                tokenizer=None,
                                stopwords=STOPWORDS)

        keywords = textrank.keywords(sents, topk=self.top_k_word)
        keysents = textrank.summarize(sents, topk=self.top_k_sent)

        self.ui.textBrowser.setText("\n".join(keysents))
        self.ui.textBrowser_2.setText(", ".join(keywords))
Example #15
        args.tokenizer = None
        # stopwords of english
        stopwords = stopwords.words("english")
        stopwords += [",", "-", ":", ";", "!", "?", "'", '"']
    else:
        sents = get_data("data/sents.txt", "news")
        # stopwords of korean
        stopwords = ["뉴스", "기자", "그리고", "연합뉴스"]

    # initialize Textrank
    textrank = TextRank(
        min_count=args.min_count,
        min_sim=args.min_sim,
        tokenizer=args.tokenizer,
        noun=args.noun,
        similarity=args.similarity,
        df=args.df,
        max_iter=args.max_iter,
        method=args.method,
        stopwords=stopwords,
    )

    # extract sentences or keywords
    if args.mode == "sentences":
        results = textrank.summarize(sents, topk=args.topk)
        results = [sent for _, sent in results]
        results = "\n".join(results)
    else:
        args.mode = "words"
        results = textrank.keywords(sents, topk=args.topk)
Example #16
from textrank import TextRank, RawSentenceReader
from konlpy.tag import Kkma
import sys

filename = sys.argv[1]
rate = float(sys.argv[2])

tr = TextRank()
#print('Load...')
from konlpy.tag import Komoran
tagger = Komoran()
stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV') ])
tr.loadSents(RawSentenceReader(filename), lambda sent: filter(lambda x:x not in stopword and x[1] in ('NNG', 'NNP', 'VV', 'VA'), tagger.pos(sent)))
#print('Build...')
tr.build()
ranks = tr.rank()
#for k in sorted(ranks, key=ranks.get, reverse=True)[:100]:
    #print("\t".join([str(k), str(ranks[k]), str(tr.dictCount[k])]))

sentence = '%s.' % (tr.summarize(rate).split('. ')[0])

kkma = Kkma()
print(sentence)
print([x[0] for x in kkma.pos(sentence) if x[1][0] == 'N'])

Example #17
import jieba
# from bm25 import BM25
from textrank import TextRank
import utils
from snownlp import seg
from sys import argv

fact = argv[1]
# fact = '公诉机关指控:2016年3月28日20时许,被告人颜某在本市洪山区马湖新村足球场马路边捡拾到被害人谢某的VIVOX5手机一部,' \
#       '并在同年3月28日2、1时起,分多次通过支付宝小额免密支付功能,秘密盗走被害人谢某支付宝内人民币3723元。案发后,被告人颜某家属已赔偿被害人全部损失,' \
#       '并取得谅解。公诉机关认为被告人颜某具有退赃、取得谅解、自愿认罪等处罚情节,建议判处被告人颜某一年以下××、××或者××,并处罚金。'
if __name__ == '__main__':

    sents = utils.get_sentences(fact)
    doc = []
    for sent in sents:
        words = seg.seg(sent)
        # words = list(jieba.cut(sent))
        words = utils.filter_stop(words)
        doc.append(words)
    # print(doc)
    # s = BM25(doc)
    # print(s.f)
    # print(s.df)
    # print(s.idf)

    rank = TextRank(doc)
    rank.text_rank()
    for index in rank.top_index(3):
        print(sents[index])
Example #18
import os
import json

import responder
from textrank import TextRank

env = os.environ
DEBUG = env['DEBUG'] in ['1', 'True', 'true']
RATIO = float(env['RATIO'])
MODEL = env.get('MODEL')

api = responder.API(debug=DEBUG)
textrank = TextRank(env['LIBRARY'], MODEL)


@api.route("/")
async def get_keywords(req, resp):
    body = await req.text
    text_list = json.loads(body)
    keywords_list = [textrank.keywords(text, RATIO) for text in text_list]
    resp_dict = dict(data=keywords_list)
    resp.media = resp_dict


if __name__ == "__main__":
    api.run()
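The route above reads the raw request body as a JSON list of texts and returns one keyword list per text. A hedged client sketch (responder serves on port 5042 by default; treat the port and payload shape as assumptions if the deployment differs):

# Hypothetical client for the keyword service above.
import json
import requests

payload = json.dumps(["first document text", "second document text"])
resp = requests.get("http://localhost:5042/", data=payload)
print(resp.json()["data"])  # one list of keywords per input text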
Example #19
from textrank import TextRank
from article import get_text

text = get_text('https://www.bbc.com/news/world-us-canada-47848619')
tr = TextRank(text, lang='english', metric='log', graph='HITS')
tr.summarize()


'''
Shunichi Suzuki, who had been Olympics minister before Mr Sakurada was appointed last October, will return to the post.
In February Mr Sakurada had to make another apology, after arriving three minutes late to a parliamentary meeting.
"I deeply apologise for his remark to the people in the disaster-hit areas," said Mr Abe.
It is not the first time Mr Sakurada has been forced to apologise.
After accepting Mr Sakurada's resignation, Prime Minister Shinzo Abe apologised for appointing him.
Image copyright AFP  Japan's Olympics Minister Yoshitaka Sakurada has resigned over comments that offended people affected by a huge tsunami and earthquake in 2011.
Mr Sakurada also admitted last year to never having used a computer, despite being Japan's cyber security minister.

Shunichi Suzuki, who had been Olympics minister before Mr Sakurada was appointed last October, will return to the post.
"I deeply apologise for his remark to the people in the disaster-hit areas," said Mr Abe.
It is not the first time Mr Sakurada has been forced to apologise.
After accepting Mr Sakurada's resignation, Prime Minister Shinzo Abe apologised for appointing him.
Image copyright AFP  Japan's Olympics Minister Yoshitaka Sakurada has resigned over comments that offended people affected by a huge tsunami and earthquake in 2011.
The 2011 tsunami left more than 20,000 dead and caused a meltdown at the Fukushima Daiichi nuclear plant.
Mr Sakurada also admitted last year to never having used a computer, despite being Japan's cyber security minister.

'''
Example #20
class NewsMining():
    """News Mining"""
    def __init__(self):
        self.textranker = TextRank()
        self.ners = ['PERSON', 'ORG', 'GPE']
        self.ner_dict = {
            'PERSON': 'Person',  # People, including fictional
            'ORG': 'Organization',  # Companies, agencies, institutions, etc.
            'GPE': 'Location',  # Countries, cities, states.
        }
        # dependency markers for subjects
        self.SUBJECTS = {
            "nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"
        }
        # dependency markers for objects
        self.OBJECTS = {"dobj", "dative", "attr", "oprd"}

        self.graph_shower = GraphShow()

    def clean_spaces(self, s):
        s = s.replace('\r', '')
        s = s.replace('\t', ' ')
        s = s.replace('\n', ' ')
        return s

    def remove_noisy(self, content):
        """Remove brackets"""
        p1 = re.compile(r'（[^）]*）')  # full-width (CJK) parentheses
        p2 = re.compile(r'\([^\)]*\)')  # ASCII parentheses
        return p2.sub('', p1.sub('', content))

    def collect_ners(self, ents):
        """Collect token only with PERSON, ORG, GPE"""
        collected_ners = []
        for token in ents:
            if token.label_ in self.ners:
                collected_ners.append(token.text + '/' + token.label_)
        return collected_ners

    def conll_syntax(self, sent):
        """Convert one sentence to conll format."""

        tuples = list()
        for word in sent:
            if word.head is word:
                head_idx = 0
            else:
                head_idx = word.head.i + 1
            tuples.append([
                word.i + 1,  # Current word index, begin with 1
                word.text,  # Word
                word.lemma_,  # Lemma
                word.pos_,  # Coarse-grained tag
                word.tag_,  # Fine-grained tag
                '_',
                head_idx,  # Head index of the current word
                word.dep_,  # Relation
                '_',
                '_'
            ])
        return tuples

    def syntax_parse(self, sent):
        """Convert one sentence to conll format."""
        tuples = list()
        for word in sent:
            if word.head is word:
                head_idx = 0
            else:
                head_idx = word.head.i + 1
            tuples.append([
                word.i + 1,  # Current word index, begin with 1
                word.text,  # Word
                word.pos_,  # Coarse-grained tag
                word.head,
                head_idx,  # Head index of the current word
                word.dep_,  # Relation
            ])
        return tuples

    def build_parse_chile_dict(self, sent, tuples):
        child_dict_list = list()
        for word in sent:
            child_dict = dict()
            for arc in tuples:
                if arc[3] == word:
                    if arc[-1] in child_dict:
                        child_dict[arc[-1]].append(arc)
                    else:
                        child_dict[arc[-1]] = []
                        child_dict[arc[-1]].append(arc)
            child_dict_list.append([word, word.pos_, word.i, child_dict])
        return child_dict_list

    def complete_VOB(self, verb, child_dict_list):
        '''Find VOB by SBV'''
        for child in child_dict_list:
            word = child[0]
            # child_dict: {'dobj': [[7, 'startup', 'NOUN', buying, 5, 'dobj']], 'prep': [[8, 'for', 'ADP', buying, 5, 'prep']]}
            child_dict = child[3]
            if word == verb:
                for object_type in self.OBJECTS:  # object_type: 'dobj'
                    if object_type not in child_dict:
                        continue
                    # [7, 'startup', 'NOUN', buying, 5, 'dobj']
                    vob = child_dict[object_type][0]
                    obj = vob[1]  # 'startup'
                    return obj
        return ''

    def extract_triples(self, sent):
        svo = []
        tuples = self.syntax_parse(sent)
        child_dict_list = self.build_parse_chile_dict(sent, tuples)
        for tuple in tuples:
            rel = tuple[-1]
            if rel in self.SUBJECTS:
                sub_wd = tuple[1]
                verb_wd = tuple[3]
                obj = self.complete_VOB(verb_wd, child_dict_list)
                subj = sub_wd
                verb = verb_wd.text
                if not obj:
                    svo.append([subj, verb])
                else:
                    svo.append([subj, verb + ' ' + obj])
        return svo

    def extract_keywords(self, words_postags):
        return self.textranker.extract_keywords(words_postags, 10)

    def collect_coexist(self, ner_sents, ners):
        """Construct NER co-occurrence matrices"""
        co_list = []
        for words in ner_sents:
            co_ners = set(ners).intersection(set(words))
            co_info = self.combination(list(co_ners))
            co_list += co_info
        if not co_list:
            return []
        return {i[0]: i[1] for i in Counter(co_list).most_common()}

    def combination(self, a):
        '''list all combination'''
        combines = []
        if len(a) == 0:
            return []
        for i in a:
            for j in a:
                if i == j:
                    continue
                combines.append('@'.join([i, j]))
        return combines

    def main(self, content):
        '''Main function'''
        if not content:
            return []

        words_postags = []  # token and its POS tag
        ner_sents = []  # store sentences which contain NER entity
        ners = []  # store all NER entity from whole article
        triples = []  # store subject verb object
        events = []  # store events

        # 01 remove linebreaks and brackets
        content = self.remove_noisy(content)
        content = self.clean_spaces(content)

        # 02 split to sentences
        doc = nlp(content)

        for i, sent in enumerate(doc.sents):
            words_postags = [[token.text, token.pos_] for token in sent]
            words = [token.text for token in sent]
            postags = [token.pos_ for token in sent]
            ents = nlp(sent.text).ents  # NER detection
            collected_ners = self.collect_ners(ents)

            if collected_ners:  # only extract triples when the sentence contains 'PERSON', 'ORG', 'GPE'
                triple = self.extract_triples(sent)
                if not triple:
                    continue
                triples += triple
                ners += collected_ners
                ner_sents.append(
                    [token.text + '/' + token.label_ for token in sent.ents])

        # 03 get keywords
        keywords = [i[0] for i in self.extract_keywords(words_postags)]
        for keyword in keywords:
            name = keyword
            cate = 'keyword'
            events.append([name, cate])

        # 04 add triples to event only the word in keyword
        for t in triples:
            if (t[0] in keywords
                    or t[1] in keywords) and len(t[0]) > 1 and len(t[1]) > 1:
                events.append([t[0], t[1]])

        # 05 get word frequency and add to events
        word_dict = [
            i for i in Counter([
                i[0] for i in words_postags
                if i[1] in ['NOUN', 'PROPN', 'VERB'] and len(i[0]) > 1
            ]).most_common()
        ][:10]
        for wd in word_dict:
            name = wd[0]
            cate = 'frequency'
            events.append([name, cate])

        # 06 get NER from whole article
        ner_dict = {i[0]: i[1] for i in Counter(ners).most_common(20)}
        for ner in ner_dict:
            name = ner.split('/')[0]  # Jessica Miller
            cate = self.ner_dict[ner.split('/')[1]]  # PERSON
            events.append([name, cate])

        # 07 get all NER entity co-occurrence information
        # here ner_dict is from above 06
        co_dict = self.collect_coexist(ner_sents, list(ner_dict.keys()))
        co_events = [[
            i.split('@')[0].split('/')[0],
            i.split('@')[1].split('/')[0]
        ] for i in co_dict]
        events += co_events

        # 08 show event graph
        self.graph_shower.create_page(events)
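The class above relies on a module-level spaCy pipeline named nlp, plus re, Counter, TextRank, and GraphShow being imported in the same file. A minimal, hypothetical driver might look like this (the model name and input path are assumptions):

# Hypothetical usage; any English spaCy pipeline with NER and a parser should do.
import spacy

nlp = spacy.load("en_core_web_sm")

if __name__ == "__main__":
    with open("article.txt", encoding="utf-8") as f:  # assumed input file
        miner = NewsMining()
        miner.main(f.read())  # builds events and renders the graph page via GraphShow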
Example #21

parser = argparse.ArgumentParser()
parser.add_argument("--infer_path", default=None, type=str)
parser.add_argument("--save_path", default='./output/', type=str)
parser.add_argument("--use_textrank", default=0, type=int)
parser.add_argument('--max_len', type=int, default=512, help='max seq len')
parser.add_argument('--col_name',
                    type=str,
                    default='text_original',
                    help='column name')
args = parser.parse_args()

if args.use_textrank:
    print('use textrank')
    tr = TextRank()
    data = tr.predict(args.infer_path)
else:
    print('use kobart only')
    data = pd.read_csv(args.infer_path)
    # data['article_concat'] = data.article_original.apply(concat)


# load the pretrained KoBART summarization model
def load_model():
    model = BartForConditionalGeneration.from_pretrained('./kobart_summary')
    return model


model = load_model()
tokenizer = get_kobart_tokenizer()
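The snippet stops after loading the model and tokenizer; a hedged sketch of the generation step that usually follows (standard Hugging Face generate/decode calls; it assumes data is a pandas DataFrame and treats the column access and decoding parameters as assumptions):

# Hypothetical single-document inference with the loaded KoBART model.
import torch

text = str(data[args.col_name].iloc[0])
input_ids = tokenizer.encode(text, return_tensors="pt",
                             truncation=True, max_length=args.max_len)
with torch.no_grad():
    summary_ids = model.generate(input_ids, num_beams=4,
                                 max_length=142, early_stopping=True)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))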
Example #22
# coding: utf-8
from textrank import TextRank  # import the textrank module

f = open("text.txt", 'r', encoding='utf-8')  #stopwords 템플릿
text = f.read()
tr = TextRank(text)  # run textrank
f.close()
i = 1
for row in tr.summarize(3):  # print the summarized sentences and keywords
    print(str(i) + '. ' + row)
    i += 1
print('keywords :', tr.keywords())
Example #23
# output
parser.add_argument("--output_path",
                    type=str,
                    required=True,
                    help="directory for results")

args = parser.parse_args()

if __name__ == "__main__":

    # initialize Textrank
    model = TextRank(
        min_count=args.min_count,
        min_sim=args.min_sim,
        tokenizer=args.tokenizer,
        noun=args.noun,
        similarity=args.similarity,
        df=args.df,
        method=args.method,
        stopwords=None,
    )

    data = get_data(args.test_path)

    output_path = args.output_path
    hyp_path = f"{output_path}/hyp"
    abs_ref_path = f"{output_path}/abs_ref"

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    if not os.path.exists(hyp_path):
        os.makedirs(hyp_path)
Example #24
def textrank(text):
    return TextRank(text=text)
Example #25
from textrank import TextRank, RawTaggerReader
import sys

filename = sys.argv[1]
rate = float(sys.argv[2])

tr = TextRank(window=5, coef=1)
#print('Load...')
stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV'), ('없', 'VV')])
tr.load(RawTaggerReader(filename),
        lambda w: w not in stopword and (w[1] in ('NNG', 'NNP', 'VV', 'VA')))
#print('Build...')
tr.build()
kw = tr.extract(rate)
for k in sorted(kw, key=kw.get, reverse=True):
    text = ''
    for i in range(len(k)):
        text = '%s %s' % (text, k[i][0])
    text = '%s %f' % (text, kw[k])
    print(text)
Example #26
class Order:
    def __init__(self, text, seg=None, tagger=None):
        self.text = text
        self.tagger = tagger if tagger is not None else self.get_tagger()
        self.seg = seg if seg is not None else self.get_seg()
        self.words_merge = None

    def get_keywords(self, limit=5, merge=False):
        doc = []
        sentences = self.get_sentences()
        for sentence in sentences:
            words = list(self.seg.seg(sentence))
            words = self.filter_stop(words)
            doc.append(words)

        self.keywordrank = KeywordRank(doc)
        self.keywordrank.solve()
        result = []
        for w in self.keywordrank.top_index(limit):
            result.append(w)

        if merge:
            wm = self.words_merge.merge(self.text, result)
            return wm.merge()
        return result

    def get_summaries(self, limit=5):
        doc = []
        sentences = self.get_sentences()
        for sentence in sentences:
            words = list(self.seg.seg(sentence))
            words = self.filter_stop(words)
            doc.append(words)

        self.textrank = TextRank(doc)
        self.textrank.solve()
        result = []
        for index in self.textrank.top_index(limit):
            result.append(sentences[index])
        return result

    def get_sentences(self):
        line_break_re = re.compile('[\r\n]')
        delimiter_re = re.compile('[,。?!;]')
        sentences = []
        for line in line_break_re.split(self.text):
            line = line.strip()
            if not line:
                continue

            for sentence in delimiter_re.split(line):
                sentence = sentence.strip()
                if not sentence:
                    continue
                sentences.append(sentence)

        return sentences

    def get_seg(self, fname='seg.pickle'):
        seg = Seg()
        seg.load(fname)
        return seg

    def get_tagger(self, fname='tag.pickle'):
        tagger = Tag()
        tagger.load(fname)
        return tagger

    def filter_stop(self, words):
        return list(filter(lambda x: x not in stop_words, words))
Example #27
from textrank import TextRank
import sys
import os
from flask import Flask, request, render_template, send_file

app = Flask(__name__)


@app.route('/')
def main():
    return render_template('index.html')


@app.route('/analyze', methods=['POST'])
def analyze():
    if (request.method == 'POST'):
        payload = request.get_json()
        input_text = payload["text"]
        t = TextRank(input_text)
        t.analyze(50)
        t.generate_cloud().to_file("temp.png")
        return send_file("temp.png", mimetype='image/png')


if __name__ == '__main__':
    input_file = open(sys.argv[1])
    t = TextRank(input_file.read(), iterations=100)
    t.analyze(30)
    t.generate_cloud()
    # app.config['TEMPLATES_AUTO_RELOAD'] = True
    # app.run(host='0.0.0.0', port=8080)