def youtube_to_summary(self, link, ts='tr', tr_option='krwordrank'):
    """Summarize the transcript of the YouTube video at *link*.

    For each transcript chunk returned by ``self.youtube_to_text``, prints a
    TextRank summary, the top KR-WordRank keywords as hashtags, a word cloud,
    and finally the full text, separated by horizontal rules.

    Parameters:
        link: YouTube video URL, forwarded to ``self.youtube_to_text``.
        ts: summarizer selector; only 'tr' (TextRank) is active — the 'bs'
            (BERT) branch is commented out below.
        tr_option: ranking option forwarded to ``self.textrank_summary``.
    """
    self.link = link
    text_result = self.youtube_to_text(link)
    for x in text_result:
        # NOTE(review): if ts is neither 'tr' nor the commented-out 'bs',
        # summarize_result is never bound and the loop below raises NameError.
        if ts == 'tr':
            summarize_result = self.textrank_summary(x, tr_option)
        # elif ts == 'bs':
        #     summarize_result = self.bert_summary(x)
        print('-' * 109)
        # print(' ')
        print('{} 영상에 대한 요약 결과입니다.'.format(link))
        # print('-'*40+'요약 결과'+'-'*40)
        for summarize in summarize_result:
            print('> ' + summarize)
            # self.speak(summarize)
        # print(' ')
        print('-' * 109)
        # print(' ')
        count_na = self.count(x)  # word-frequency counter, reused for the word cloud
        # keyword = '#' + ' #'.join([x for x, y in count_na.most_common(6)])
        # top-5 KR-WordRank keywords of this chunk, rendered as hashtags
        keyword_dic = summarize_with_keywords(x.split(". "), num_keywords=5, min_count=3)
        keyword = '#' + ' #'.join([x for x in list(keyword_dic.keys())])
        print('{} 영상의 키워드: {}'.format(link, keyword))
        # print(' ')
        print('-' * 109)
        # print(' ')
        self.wordcloud(count_na)
        print('-' * 109)
        # print(' ')
        print('{} 영상의 전체 텍스트입니다.'.format(link))
        for sent in x.split('. '):
            print(sent + '.')
def find_keyword(file_name):
    """Rank keywords from the module-level ``texts``, render a word cloud,
    save the figure to *file_name*, then print the 30 strongest keywords.

    Relies on module globals: ``texts``, ``summarize_with_keywords``,
    ``WordCloud``, ``plt``.
    """
    font_path = './NanumFont/NanumBarunGothic.ttf'
    # noise tokens and common filler words excluded from the ranking
    stopwords = {
        "\',", "함수", "_flash_removeCallback()", "flash", "//", "\",",
        "\'\\n", "\'\\", "10", "11", "17", "20", "30", "28일", "오류를",
        "29일", "위한", "것으로", "있다.",
    }
    keywords = summarize_with_keywords(
        texts, min_count=5, max_length=10, beta=0.85, max_iter=10,
        stopwords=stopwords)

    # draw and persist the word cloud
    cloud = WordCloud(font_path=font_path, width=800, height=800,
                      background_color="white")
    rendered = cloud.generate_from_frequencies(keywords)
    fig = plt.figure(figsize=(10, 10))
    plt.imshow(rendered, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    fig.savefig(file_name)

    # report the 30 strongest keywords, best first
    ranked = sorted(keywords.items(), key=lambda item: item[1], reverse=True)
    for word, score in ranked[:30]:
        print('%8s:\t%.4f' % (word, score))
def create_wordcloud(list_of_words, add_stopwords=None, min_count=5, max_length=10,
                     beta=0.85, max_iter=20, verbose=False):
    """Extract KR-WordRank keywords from *list_of_words* and show a word cloud.

    Parameters:
        list_of_words: iterable of documents/sentences fed to KR-WordRank.
        add_stopwords: extra stopwords merged into the JSON stopword list.
            Default is now ``None`` (fresh empty list) instead of the original
            mutable default ``[]`` — an anti-pattern, even though this one was
            never mutated. Callers are unaffected.
        min_count, max_length, beta, max_iter, verbose: forwarded to
            ``summarize_with_keywords``.

    Returns:
        The ``WordCloud`` object (as in the original, the base object rather
        than the generated image).
    """
    if add_stopwords is None:
        add_stopwords = []

    # load keywords from json file and create set; explicit encoding so the
    # Korean stopword list decodes correctly regardless of the platform locale
    with open('stopwords-ko.json', encoding='utf-8') as json_file:
        stopwords = set(json.load(json_file))
    stopwords.update(add_stopwords)

    keywords = summarize_with_keywords(list_of_words, min_count=min_count,
                                       max_length=max_length, beta=beta,
                                       max_iter=max_iter, stopwords=stopwords,
                                       verbose=verbose)

    # creating wordcloud
    wordcloud = WordCloud(font_path='NanumBarunGothic.ttf', width=800,
                          height=800, background_color="white")
    cloud = wordcloud.generate_from_frequencies(keywords)
    plt.figure(figsize=(16, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    return wordcloud
def tag_extlist(request):
    """Django view: suggest up to 10 tags for a POSTed video description.

    Expects a JSON body with a 'description' field; responds with a JSON
    object mapping tag word -> KR-WordRank score. Non-POST requests get 405.
    """
    if request.method == 'POST':
        # min_count = 1
        # max_length = 10
        # wordrank_extractor = KRWordRank(min_count, max_length)
        req_data = json.loads(request.body.decode())
        response_tags = req_data['description']
        # NOTE(review): stopwords={} is an empty dict; it behaves as an empty
        # stopword collection here, but set() would state the intent better.
        keywords = summarize_with_keywords([response_tags], min_count=5,
                                           max_length=10, beta=0.85,
                                           max_iter=20, stopwords={},
                                           verbose=True)
        # top-10 keywords, highest score first
        arr = sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]
        response_dict = {}
        word = list(map(lambda a: a[0], arr))
        num = list(map(lambda a: a[1], arr))
        # first select tag with highest value
        for i in range(0, 10):
            if len(word) == i:
                break
            # drop tag if it exists in list and selected ratio is lower than 0.5
            # NOTE(review): nesting reconstructed from a collapsed source —
            # the else below is taken to pair with the exists() check, which
            # matches the comment above; confirm against version control.
            if Tag.objects.filter(name=word[i]).exists():
                tag = Tag.objects.get(name=word[i])
                # keep a known tag only when it was selected often enough
                # relative to how often it was suggested (ratio >= 0.5)
                if tag.selected != 0 and tag.suggested < tag.selected * 2:
                    response_dict[word[i]] = num[i]
            else:
                # unseen tag: always include it
                response_dict[word[i]] = num[i]
        return JsonResponse(response_dict, safe=False)
    else:
        return HttpResponse(status=405)
def find_keyword(fname):
    """Print the top keywords of the module-level ``texts`` and save a
    word-cloud image as ``<fname>.png``.

    Relies on module globals: ``texts``, ``stopwords``, ``font_path``,
    ``summarize_with_keywords``, ``WordCloud``, ``plt``.
    """
    keywords = summarize_with_keywords(
        texts, min_count=5, max_length=10, beta=0.85, max_iter=10,
        stopwords=stopwords)

    # analysis header followed by the 30 strongest keywords, best first
    print('\n\n ' + fname + '키워드 분석 결과')
    top30 = sorted(keywords.items(), key=lambda kv: kv[1], reverse=True)[:30]
    for word, score in top30:
        print('%8s:\t%.4f' % (word, score))

    # render the word cloud and persist it next to the given name
    cloud = WordCloud(font_path=font_path, width=800, height=800,
                      background_color="white")
    image = cloud.generate_from_frequencies(keywords)
    fig = plt.figure(figsize=(10, 10))
    plt.imshow(image, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    fig.savefig(fname + '.png')
# temp = np.array(w['content']).flatten().tolist() # temp_list.extend(temp) else: # print(type(w['content'][0])) # print(w['content']) temp_list.extend(w['content']) return temp_list json_path = './server_project/search_app/result/iGate Introduction.json' contents_list = json_2_list_Contents(json_path) stopwords = {'Service', '처리', '[그림', '있다.', 'ID', '다양한'} beta = 0.85 # PageRank의 decaying factor beta max_iter = 10 #keywords, rank, graph = wordrank_extractor.extract(temp_list, beta, max_iter) #passwords = {word:score for word, score in sorted(keywords.items(), key=lambda x:-x[1])[:300] if not (word in stopwords)} keywords = summarize_with_keywords(contents_list, min_count=5, max_length=10, beta=0.85, max_iter=10, stopwords=stopwords, verbose=True) print(keywords) # for word, r in sorted(keywords.items(), key=lambda x:x[1], reverse=True)[:30]: # print('%8s:\t%.4f' % (word, r))
# --- tail of a review-scraping loop (its head is outside this view) ---
# NOTE(review): indentation reconstructed from a collapsed source; the
# elif chain buckets each comment by its star rating.
                elif ratings == "4":
                    texts_4.append(comment)
                elif ratings == "5":
                    texts_5.append(comment)
            except:
                # NOTE(review): bare except silently drops malformed rows
                pass
            else:
                # only rows that parsed cleanly advance the progress counter
                reviewIdx = reviewIdx + 1
                print("["+str(reviewIdx)+"/"+str(reviewCount)+"]", end='\r')
print("Done. ["+str(reviewIdx)+"/"+str(reviewCount)+"]")
file.close()
print("\n")

# For each rating bucket, print its 30 strongest KR-WordRank keywords.
print("5 ====")
keywords = summarize_with_keywords(texts_5, stopwords=stopwords, verbose=True)
for word, r in sorted(keywords.items(), key=lambda x:x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

print("4 ====")
keywords = summarize_with_keywords(texts_4, stopwords=stopwords, verbose=True)
for word, r in sorted(keywords.items(), key=lambda x:x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

print("3 ====")
keywords = summarize_with_keywords(texts_3, stopwords=stopwords, verbose=True)
for word, r in sorted(keywords.items(), key=lambda x:x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

print("2 ====")
keywords = summarize_with_keywords(texts_2, stopwords=stopwords, verbose=True)
# NOTE(review): the "2" bucket's print loop is truncated in this view.
import csv

import pandas as pd
from krwordrank.word import summarize_with_keywords

# Script: read review texts from data.csv, extract KR-WordRank keywords,
# print the top 50 and save them as word,score rows in keyword.csv.
keyword_list = []

data = pd.read_csv('data.csv', names=['texts', 'scores'], encoding='UTF-8')
# Series.tolist() directly — the texts_data/texts_val intermediates were noise
texts = data['texts'].tolist()

# stopwords: words to be removed from the keyword set
#stopwords = {'자료:', '20', '따른', '수는', '경우', '-1', '주:', '것으로', '대한'}
keywords = summarize_with_keywords(texts, min_count=5, max_length=10,
                                   beta=0.85, max_iter=10)  #, stopwords=stopwords)
#print(keywords)

# BUGFIX: sort descending so the TOP 50 keywords are kept; the original
# ascending sort (key=lambda x: x[1]) selected the 50 *weakest* keywords,
# unlike every sibling script which sorts with reverse=True / -score.
for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:50]:
    print('%8s:\t%.4f' % (word, r))
    keyword_list.append([word, r])

# utf-8-sig BOM keeps Korean readable when the CSV is opened in Excel
with open('keyword.csv', 'w', newline='', encoding='utf-8-sig') as f:
    makewrite = csv.writer(f)
    for value in keyword_list:
        makewrite.writerow(value)
# --- tail of a comment-collection loop (its head is outside this view) ---
# NOTE(review): indentation reconstructed from a collapsed source.
    else:
        texts.append(delete_word(comment_text))
    time.sleep(1)  # throttle between requests

print(texts)

# NOTE(review): this extractor is constructed but never used below —
# summarize_with_keywords(texts) runs with its own defaults instead.
wordrank_extractor = KRWordRank(
    min_count=3,    # minimum word frequency (for graph construction)
    max_length=10,  # maximum word length
    verbose=True)

beta = 0.85    # PageRank decaying factor beta
max_iter = 10
# keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
keywords = summarize_with_keywords(texts)

# print the 30 strongest keywords, best first
for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

krwordrank_cloud = WordCloud(font_path=font_path, width=800, height=800,
                             background_color="white")

popstrings = "내가 그냥 근데 너무"  # filler words once removed via pop_keyword
# keywords = pop_keyword(keywords, popstrings)
krwordrank_cloud = krwordrank_cloud.generate_from_frequencies(keywords)
fig = plt.figure(figsize=(10, 10))
plt.imshow(krwordrank_cloud, interpolation="bilinear")
plt.axis("off")
import math
import pandas as pd
import numpy as np
from krwordrank.word import summarize_with_keywords

# Script: read pipe-delimited reviews, keep only Korean rows (language code
# in column 1), rank keywords with KR-WordRank, and append "word|score"
# lines to tagsforall.txt.
text = pd.read_csv('review.txt', header=None, delimiter='|')
idx_foreign = text[text[1] != 'ko'].index
ko_text = text.drop(idx_foreign)
# Column 3 holds the review body. Series.tolist() already yields a plain
# list of str; the original list(np.array(...tolist())) round-trip was
# redundant and produced numpy.str_ elements.
reviews = ko_text[3].tolist()

# wordrank_extractor = KRWordRank(min_count=5, max_length=10)
# keywords, rank, graph = wordrank_extractor.extract(reviews, num_keywords=10)
keywords = summarize_with_keywords(reviews, min_count=5, max_length=10,
                                   num_keywords=2366, beta=0.85, max_iter=10,
                                   verbose=True)

# explicit encoding so Korean keywords round-trip regardless of locale
with open('tagsforall.txt', 'a', encoding='utf-8') as file:
    for k in keywords:
        file.write(str(k) + '|' + str(keywords[k]) + '\n')
# Keep only the 300 strongest keywords that are not stopwords.
stopwords = {'com', 'kr', 'http', 'products'}
passwords = {
    word: score
    for word, score in sorted(keywords.items(), key=lambda x: -x[1])[:300]
    if not (word in stopwords)
}

# In[14]:

from krwordrank.word import summarize_with_keywords

keywords = summarize_with_keywords(texts, min_count=5, max_length=10, beta=0.85,
                                   max_iter=10, stopwords=stopwords, verbose=True)
# NOTE(review): this second call overwrites the stopword-filtered result above.
keywords = summarize_with_keywords(texts)  # with default arguments

# In[15]:

from wordcloud import WordCloud

# Set your font path
font_path = 'YOUR_FONT_DIR/truetype/nanum/NanumBarunGothic.ttf'
# NOTE(review): the statement below is truncated in the visible source and
# continues beyond this view.
krwordrank_cloud = WordCloud(font_path=font_path, width=800, height=800,
# --- body of a loop over false_tags (its `for` header is outside this view) ---
# NOTE(review): indentation reconstructed from a collapsed source.
    content = false_tag[1]
    sentences = kkma.sentences(content)  # split the extracted content into a list of sentences
    nouns = []
    for sentence in sentences:
        # NOTE(review): `is not ''` is an identity test against a str literal
        # (SyntaxWarning since Python 3.8); `!= ''` is almost certainly meant.
        if sentence is not '':
            nouns.append(' '.join([
                noun for noun in komoran.nouns(str(sentence))
                #nouns.append(' '.join([noun for noun in kkma.nouns(str(sentence))
                if noun not in stopwords and len(noun) > 1
            ]))

    keywords = summarize_with_keywords(nouns, min_count=3, max_length=10,
                                       beta=0.85, max_iter=10,
                                       stopwords=stopwords, verbose=True)
    print(keywords)

#========================= extract only one tag at a time
from krwordrank.sentence import summarize_with_sentences

stopwords = {
    '영화', '관람객', '너무', '정말', '보고', '우리', '아니', '대상', '것이다',
    '있는', '것으로', '웨어'
}

for false_tag in false_tags:
    doc_id = false_tag[0]  # extract the document id
    content = false_tag[1]
    # NOTE(review): the loop body continues beyond this view.