def keyword(mongo, redis, tagger, data, bulk_op):
    """Extract keywords for every document in *data* and queue bulk inserts.

    Args:
        mongo: wrapper exposing ``bulk_insert_keywords(bulk_op, URI, keywords)``.
        redis: connection handed to ``get_stopwords`` for per-domain stopwords.
        tagger: POS tagger passed through to ``TextRank``.
        data: sized sequence of ``(URI, title, content, root_domain, wordcount)``
            tuples (``len(data)`` is used for the progress display).
        bulk_op: bulk-operation handle forwarded to the mongo insert.
    """
    start_time = time.time()
    logging.debug("keyword extraction start time : %f" % (start_time))
    singlewords = get_singlewords()
    # Load the configuration once; the original called load_config() five
    # separate times for five settings.
    config = load_config()
    coef = config['coef']
    title_word_addition_multiplier = config['title_word_addition_multiplier']
    minimum_low_freq = config['minimum_low_freq']
    # NOTE(review): read from config but never forwarded to TextRank below —
    # confirm whether it should be passed as a keyword argument.
    nnp_addition_multiplier = config['nnp_addition_multiplier']
    low_freq_word_subtraction_multiplier = config[
        'low_freq_word_subtraction_multiplier']
    # Start counting at 1 so the progress line ends at "N / N" instead of
    # starting at "0 / N" and stopping one short.
    for idx, (URI, title, content, root_domain, wordcount) in enumerate(data, 1):
        # Stopwords are stored per root domain in redis.
        stopwords = get_stopwords(redis, root_domain)
        tr = TextRank(
            tagger=tagger,
            window=5,
            content=content,
            stopwords=stopwords,
            singlewords=singlewords,
            title=title,
            coef=coef,
            title_word_addition_multiplier=title_word_addition_multiplier,
            minimum_low_freq=minimum_low_freq,
            low_freq_word_subtraction_multiplier=low_freq_word_subtraction_multiplier)
        # Build the keyword graph, then keep at most 15 keywords.
        tr.keyword_rank()
        keywords = tr.keywords(num=15)
        sys.stdout.write("\rkeyword extracted: %d / %d" % (idx, len(data)))
        mongo.bulk_insert_keywords(bulk_op, URI, keywords)
    end_time = time.time()
    logging.debug("keyword extraction end time : %f" % (end_time))
    logging.debug("total execution time : %f" % (end_time - start_time))
def click_summary(self):
    """Summarize the loaded PDF and display results in the UI.

    Reads the language and the two top-k settings from the combo boxes,
    runs TextRank over the PDF text, then writes the key sentences into
    ``textBrowser`` and the keywords into ``textBrowser_2``.
    """
    self.language = str(self.ui.comboBox.currentText())
    self.top_k_word = int(self.ui.comboBox_2.currentText())
    self.top_k_sent = int(self.ui.comboBox_3.currentText())
    sents = pdf_to_text(self.file_name)
    # Korean needs the mecab tokenizer; every other language uses the
    # default. (Collapses the original's duplicated TextRank construction.)
    tokenizer = "mecab" if self.language == "ko" else None
    textrank = TextRank(language=self.language, tokenizer=tokenizer,
                        stopwords=STOPWORDS)
    keywords = textrank.keywords(sents, topk=self.top_k_word)
    keysents = textrank.summarize(sents, topk=self.top_k_sent)
    self.ui.textBrowser.setText("\n".join(keysents))
    self.ui.textBrowser_2.setText(", ".join(keywords))
# coding: utf-8
from textrank import TextRank  # load the textrank module

# Read the input text. Using `with` guarantees the file is closed even if
# reading fails (the original only closed it manually after TextRank ran).
with open("text.txt", 'r', encoding='utf-8') as f:
    text = f.read()

tr = TextRank(text)  # run textrank over the text

# Print the summarized sentences, numbered from 1, then the keywords.
for i, row in enumerate(tr.summarize(3), 1):
    print(str(i) + '. ' + row)
print('keywords :', tr.keywords())
else: sents = get_data("data/sents.txt", "news") # stopwords of korean stopwords = ["뉴스", "기자", "그리고", "연합뉴스"] # initialize Textrank textrank = TextRank( min_count=args.min_count, min_sim=args.min_sim, tokenizer=args.tokenizer, noun=args.noun, similarity=args.similarity, df=args.df, max_iter=args.max_iter, method=args.method, stopwords=stopwords, ) # extraction setences or keywords if args.mode == "sentences": results = textrank.summarize(sents, topk=args.topk) results = [sent for _, sent in results] results = "\n".join(results) else: args.mode = "words" results = textrank.keywords(sents, topk=args.topk) print(f"{args.mode}") print("=" * 20) print(f"{results}")