Ejemplo n.º 1
0
def keyword(mongo, redis, tagger, data, bulk_op):
    """Extract keywords for every record in ``data`` and queue them for insertion.

    For each record a ``TextRank`` instance is built from the record's title
    and content, the stopwords returned by ``get_stopwords`` for its root
    domain, and tuning values read from ``load_config()``. At most 15
    keywords per record are passed to ``mongo.bulk_insert_keywords``.

    Args:
        mongo: Object exposing ``bulk_insert_keywords(bulk_op, URI, keywords)``.
        redis: Handle passed through to ``get_stopwords``.
        tagger: Tagger passed through to ``TextRank``.
        data: Sized iterable (``len()`` is called on it) of
            ``(URI, title, content, root_domain, wordcount)`` tuples.
        bulk_op: Bulk-operation handle passed through to
            ``mongo.bulk_insert_keywords``.
    """
    start_time = time.time()
    logging.debug("keyword extraction start time : %f", start_time)

    singlewords = get_singlewords()

    # Load the configuration once; every value below is loop-invariant.
    config = load_config()
    coef = config['coef']
    title_word_addition_multiplier = config['title_word_addition_multiplier']
    minimum_low_freq = config['minimum_low_freq']
    low_freq_word_subtraction_multiplier = config[
        'low_freq_word_subtraction_multiplier']
    # NOTE(review): the original also read config['nnp_addition_multiplier']
    # but never used it; confirm whether TextRank should receive it.

    total = len(data)
    # Count from 1 so the progress line ends at "total / total".
    for done, (URI, title, content, root_domain, _wordcount) in enumerate(
            data, start=1):
        # Stopwords are looked up per root domain.
        stopwords = get_stopwords(redis, root_domain)
        tr = TextRank(
            tagger=tagger,
            window=5,
            content=content,
            stopwords=stopwords,
            singlewords=singlewords,
            title=title,
            coef=coef,
            title_word_addition_multiplier=title_word_addition_multiplier,
            minimum_low_freq=minimum_low_freq,
            low_freq_word_subtraction_multiplier=
            low_freq_word_subtraction_multiplier)

        # Build the keyword graph.
        tr.keyword_rank()

        # Limit the result to at most 15 keywords per record.
        keywords = tr.keywords(num=15)
        sys.stdout.write("\rkeyword extracted: %d / %d" % (done, total))
        # The progress line has no newline, so flush to make it visible.
        sys.stdout.flush()
        mongo.bulk_insert_keywords(bulk_op, URI, keywords)

    end_time = time.time()
    logging.debug("keyword extraction end time : %f", end_time)
    logging.debug("total execution time : %f", end_time - start_time)
Ejemplo n.º 2
0
    def click_summary(self):
        """Run TextRank on the selected file and display the results in the UI.

        Reads the language and the two top-k settings from the combo boxes,
        passes the value returned by ``pdf_to_text(self.file_name)`` to
        TextRank, and writes the key sentences and keywords into the two
        text browsers.
        """
        ui = self.ui
        self.language = str(ui.comboBox.currentText())
        self.top_k_word = int(ui.comboBox_2.currentText())
        self.top_k_sent = int(ui.comboBox_3.currentText())

        sents = pdf_to_text(self.file_name)

        # Only "ko" gets the "mecab" tokenizer; other languages pass None.
        tokenizer = "mecab" if self.language == "ko" else None
        textrank = TextRank(language=self.language,
                            tokenizer=tokenizer,
                            stopwords=STOPWORDS)

        top_words = textrank.keywords(sents, topk=self.top_k_word)
        top_sentences = textrank.summarize(sents, topk=self.top_k_sent)

        ui.textBrowser.setText("\n".join(top_sentences))
        ui.textBrowser_2.setText(", ".join(top_words))
Ejemplo n.º 3
0
# coding: utf-8
from textrank import TextRank  #textrank 모듈 불러오기

# Read the input document; the context manager closes the file even if
# reading raises.
with open("text.txt", 'r', encoding='utf-8') as f:
    text = f.read()
tr = TextRank(text)  # run TextRank on the document text
# Print the summary sentences numbered from 1, followed by the keywords.
for i, row in enumerate(tr.summarize(3), start=1):
    print(str(i) + '. ' + row)
print('keywords :', tr.keywords())
Ejemplo n.º 4
0
    else:
        sents = get_data("data/sents.txt", "news")
        # stopwords of korean
        stopwords = ["뉴스", "기자", "그리고", "연합뉴스"]

    # initialize Textrank
    textrank = TextRank(
        min_count=args.min_count,
        min_sim=args.min_sim,
        tokenizer=args.tokenizer,
        noun=args.noun,
        similarity=args.similarity,
        df=args.df,
        max_iter=args.max_iter,
        method=args.method,
        stopwords=stopwords,
    )

    # extract sentences or keywords
    if args.mode == "sentences":
        results = textrank.summarize(sents, topk=args.topk)
        results = [sent for _, sent in results]
        results = "\n".join(results)
    else:
        args.mode = "words"
        results = textrank.keywords(sents, topk=args.topk)

    print(f"{args.mode}")
    print("=" * 20)
    print(f"{results}")