Example #1
0
 def youtube_to_summary(self, link, ts='tr', tr_option='krwordrank'):
     """Print a summary, keywords, word cloud and full text for a video link.

     For every text returned by ``self.youtube_to_text(link)`` this prints
     the TextRank summary lines, a hashtag-style keyword line built from
     ``summarize_with_keywords``, draws a word cloud from ``self.count`` and
     finally prints the text sentence by sentence.

     Args:
         link: Video link; stored on ``self.link`` and echoed in the output.
         ts: Summarizer selector. Only ``'tr'`` (TextRank) is implemented.
         tr_option: Passed through to ``self.textrank_summary``.

     Raises:
         ValueError: If ``ts`` is not ``'tr'``. Previously this surfaced as
             an ``UnboundLocalError`` after part of the output had already
             been printed.
     """
     # Fail fast: no other summarizer branch exists, so any other value
     # would leave the summary undefined inside the loop below.
     if ts != 'tr':
         raise ValueError(
             "unsupported summarizer {!r}; only 'tr' is available".format(ts))

     self.link = link
     separator = '-' * 109

     for text in self.youtube_to_text(link):
         summarize_result = self.textrank_summary(text, tr_option)

         print(separator)
         print('{} 영상에 대한 요약 결과입니다.'.format(link))
         for summarize in summarize_result:
             print('> ' + summarize)
         print(separator)

         count_na = self.count(text)
         # The same sentence split feeds both the keyword extraction and
         # the full-text dump at the end, so compute it once.
         sentences = text.split('. ')
         keyword_dic = summarize_with_keywords(sentences,
                                               num_keywords=5,
                                               min_count=3)
         keyword = '#' + ' #'.join(keyword_dic)
         print('{} 영상의 키워드: {}'.format(link, keyword))
         print(separator)

         self.wordcloud(count_na)
         print(separator)

         print('{} 영상의 전체 텍스트입니다.'.format(link))
         for sent in sentences:
             print(sent + '.')
Example #2
0
def find_keyword(file_name):
    """Render a keyword word cloud for the global ``texts`` and save it.

    Keywords are extracted with ``summarize_with_keywords``, drawn as an
    800x800 word cloud, shown, saved to ``file_name``, and the 30
    highest-scoring keywords are printed.
    """
    excluded = {
        "\',", "함수", "_flash_removeCallback()", "flash", "//", "\",", "\'\\n",
        "\'\\", "10", "11", "17", "20", "30", "28일", "오류를", "29일", "위한", "것으로",
        "있다."
    }
    keywords = summarize_with_keywords(texts,
                                       min_count=5,
                                       max_length=10,
                                       beta=0.85,
                                       max_iter=10,
                                       stopwords=excluded)

    cloud = WordCloud(font_path='./NanumFont/NanumBarunGothic.ttf',
                      width=800,
                      height=800,
                      background_color="white")
    cloud = cloud.generate_from_frequencies(keywords)

    fig = plt.figure(figsize=(10, 10))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    fig.savefig(file_name)

    # Highest score first; only the top 30 are reported.
    ranked = sorted(keywords.items(), key=lambda kv: kv[1], reverse=True)
    for word, score in ranked[:30]:
        print('%8s:\t%.4f' % (word, score))
Example #3
0
def create_wordcloud(list_of_words,
                     add_stopwords=None,
                     min_count=5,
                     max_length=10,
                     beta=0.85,
                     max_iter=20,
                     verbose=False):
    """Extract keywords from ``list_of_words`` and show them as a word cloud.

    Args:
        list_of_words: Documents handed to ``summarize_with_keywords``.
        add_stopwords: Optional iterable of extra stopwords merged with the
            ones loaded from ``stopwords-ko.json``. ``None`` (the default)
            adds nothing.
        min_count: Forwarded to ``summarize_with_keywords``.
        max_length: Forwarded to ``summarize_with_keywords``.
        beta: Forwarded to ``summarize_with_keywords``.
        max_iter: Forwarded to ``summarize_with_keywords``.
        verbose: Forwarded to ``summarize_with_keywords``.

    Returns:
        The ``WordCloud`` object on which ``generate_from_frequencies`` was
        called.
    """
    # Decode explicitly instead of relying on the platform locale.
    # NOTE(review): assumes stopwords-ko.json is stored as UTF-8 -- confirm.
    with open('stopwords-ko.json', encoding='utf-8') as json_file:
        stopwords = set(json.load(json_file))

    if add_stopwords is not None:
        stopwords.update(add_stopwords)

    keywords = summarize_with_keywords(list_of_words,
                                       min_count=min_count,
                                       max_length=max_length,
                                       beta=beta,
                                       max_iter=max_iter,
                                       stopwords=stopwords,
                                       verbose=verbose)

    wordcloud = WordCloud(font_path='NanumBarunGothic.ttf',
                          width=800,
                          height=800,
                          background_color="white")

    cloud = wordcloud.generate_from_frequencies(keywords)
    plt.figure(figsize=(16, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    return wordcloud
Example #4
0
def tag_extlist(request):
    """Suggest tags for the ``description`` field of a POSTed JSON body.

    Up to ten of the highest-scoring keywords are considered. A keyword with
    no matching ``Tag`` row is always returned; one that already exists is
    returned only when ``tag.selected`` is non-zero and ``tag.suggested`` is
    less than twice ``tag.selected``. Non-POST requests get a 405 response.
    """
    if request.method != 'POST':
        return HttpResponse(status=405)

    payload = json.loads(request.body.decode())
    description = payload['description']
    keywords = summarize_with_keywords([description],
                                       min_count=5,
                                       max_length=10,
                                       beta=0.85,
                                       max_iter=20,
                                       stopwords={},
                                       verbose=True)

    # Highest score first, capped at ten candidates.
    candidates = sorted(keywords.items(),
                        key=lambda item: item[1],
                        reverse=True)[:10]

    response_dict = {}
    for name, score in candidates:
        if Tag.objects.filter(name=name).exists():
            tag = Tag.objects.get(name=name)
            keep = tag.selected != 0 and tag.suggested < tag.selected * 2
        else:
            keep = True
        if keep:
            response_dict[name] = score
    return JsonResponse(response_dict, safe=False)
Example #5
0
def find_keyword(fname):
    """Print the top keywords of the global ``texts`` and save a word cloud.

    Uses the module-level ``texts``, ``stopwords`` and ``font_path``. The 30
    highest-scoring keywords are printed, then an 800x800 word cloud is
    shown and written to ``fname + '.png'``.
    """
    keywords = summarize_with_keywords(texts,
                                       min_count=5,
                                       max_length=10,
                                       beta=0.85,
                                       max_iter=10,
                                       stopwords=stopwords)

    print('\n\n ' + fname + '키워드 분석 결과')
    ranked = sorted(keywords.items(), key=lambda kv: kv[1], reverse=True)
    for word, score in ranked[:30]:
        print('%8s:\t%.4f' % (word, score))

    cloud = WordCloud(font_path=font_path,
                      width=800,
                      height=800,
                      background_color="white")
    cloud = cloud.generate_from_frequencies(keywords)

    fig = plt.figure(figsize=(10, 10))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    fig.savefig(fname + '.png')
                # temp = np.array(w['content']).flatten().tolist()
                # temp_list.extend(temp)

            else:
                # print(type(w['content'][0]))
                # print(w['content'])
                temp_list.extend(w['content'])
        return temp_list


# Load the document contents and print the keywords extracted from them.
json_path = './server_project/search_app/result/iGate Introduction.json'
contents_list = json_2_list_Contents(json_path)

stopwords = {'Service', '처리', '[그림', '있다.', 'ID', '다양한'}

beta = 0.85  # PageRank decaying factor
max_iter = 10

keywords = summarize_with_keywords(
    contents_list,
    min_count=5,
    max_length=10,
    beta=beta,
    max_iter=max_iter,
    stopwords=stopwords,
    verbose=True,
)
print(keywords)
            elif ratings == "4":
                texts_4.append(comment)
            elif ratings == "5":
                texts_5.append(comment)
        except:
            pass
        else:
            reviewIdx=reviewIdx+1
            print("["+str(reviewIdx)+"/"+str(reviewCount)+"]", end='\r')
    print("Done. ["+str(reviewIdx)+"/"+str(reviewCount)+"]")

file.close()

print("\n")
# Print the 30 highest-scoring keywords for each rating bucket in turn.
for heading, rating_texts in (("5 ====", texts_5),
                              ("4 ====", texts_4),
                              ("3 ====", texts_3)):
    print(heading)
    keywords = summarize_with_keywords(rating_texts,
                                       stopwords=stopwords,
                                       verbose=True)
    ranked = sorted(keywords.items(), key=lambda kv: kv[1], reverse=True)
    for word, r in ranked[:30]:
        print('%8s:\t%.4f' % (word, r))

print("2 ====")
keywords = summarize_with_keywords(texts_2, stopwords=stopwords, verbose=True)
import csv
import pandas as pd
from krwordrank.word import summarize_with_keywords

# Extract keywords from the 'texts' column of data.csv, print the 50
# highest-scoring ones and write them to keyword.csv.
keyword_list = []
data = pd.read_csv('data.csv', names=['texts', 'scores'], encoding='UTF-8')

texts_data = data['texts']
texts_val = texts_data.values
texts = texts_val.tolist()

# stopwords: words to be removed from the keywords (currently not passed).
keywords = summarize_with_keywords(texts,
                                   min_count=5,
                                   max_length=10,
                                   beta=0.85,
                                   max_iter=10)

# Sort descending so the slice keeps the 50 best keywords; an ascending sort
# would keep the 50 lowest-scoring ones instead.
for word, r in sorted(keywords.items(), key=lambda x: x[1],
                      reverse=True)[:50]:
    print('%8s:\t%.4f' % (word, r))
    keyword_list.append([word, r])

with open('keyword.csv', 'w', newline='', encoding='utf-8-sig') as f:
    makewrite = csv.writer(f)
    makewrite.writerows(keyword_list)
Example #9
0
            else:
                texts.append(delete_word(comment_text))
    time.sleep(1)

# Print the collected texts, list the 30 best keywords and draw a word cloud.
print(texts)

wordrank_extractor = KRWordRank(
    min_count=3,  # minimum word frequency (when building the graph)
    max_length=10,  # maximum word length
    verbose=True)

beta = 0.85  # PageRank decaying factor
max_iter = 10

# Keywords come from summarize_with_keywords with its default arguments;
# the extractor, beta and max_iter above are not passed to it.
keywords = summarize_with_keywords(texts)
for word, r in sorted(keywords.items(), key=lambda kv: -kv[1])[:30]:
    print('%8s:\t%.4f' % (word, r))

krwordrank_cloud = WordCloud(font_path=font_path,
                             width=800,
                             height=800,
                             background_color="white")
popstrings = "내가 그냥 근데 너무"

krwordrank_cloud = krwordrank_cloud.generate_from_frequencies(keywords)

fig = plt.figure(figsize=(10, 10))
plt.imshow(krwordrank_cloud, interpolation="bilinear")
plt.axis("off")
Example #10
0
import math
import pandas as pd
import numpy as np
from krwordrank.word import summarize_with_keywords

# Keep only the rows whose column 1 equals 'ko', extract keywords from
# column 3 and append "keyword|score" lines to tagsforall.txt.
text = pd.read_csv('review.txt', header=None, delimiter='|')
idx_foreign = text.index[text[1] != 'ko']
ko_text = text.drop(idx_foreign)
reviews = list(np.array(ko_text[3].tolist()))

keywords = summarize_with_keywords(
    reviews,
    min_count=5,
    max_length=10,
    num_keywords=2366,
    beta=0.85,
    max_iter=10,
    verbose=True,
)
with open('tagsforall.txt', 'a', encoding='utf-8') as file:
    for k, score in keywords.items():
        temptext = '|'.join([str(k), str(score)]) + '\n'
        file.write(temptext)
Example #11
0
stopwords = {'com', 'kr', 'http', 'products'}
# The 300 highest-scoring keywords, minus the stopwords.
passwords = {
    word: score
    for word, score in sorted(
        keywords.items(), key=lambda kv: kv[1], reverse=True)[:300]
    if word not in stopwords
}

# In[14]:

from krwordrank.word import summarize_with_keywords

# First call spells out every argument; the second call uses the defaults
# and its result replaces the first one.
keywords = summarize_with_keywords(
    texts,
    min_count=5,
    max_length=10,
    beta=0.85,
    max_iter=10,
    stopwords=stopwords,
    verbose=True,
)
keywords = summarize_with_keywords(texts)

# In[15]:

from wordcloud import WordCloud

# Set your font path
font_path = 'YOUR_FONT_DIR/truetype/nanum/NanumBarunGothic.ttf'

krwordrank_cloud = WordCloud(font_path=font_path,
                             width=800,
                             height=800,
Example #12
0
    content = false_tag[1]
    # Split the extracted content into a list of sentences.
    sentences = kkma.sentences(content)

    # One space-joined string of filtered nouns per non-empty sentence.
    nouns = []
    for sentence in sentences:
        # Compare by value: `is not ''` tests object identity against a
        # literal, which is implementation-dependent and raises a
        # SyntaxWarning on Python 3.8+.
        if sentence != '':
            # Nouns come from komoran; stopwords and single-character
            # nouns are dropped.
            nouns.append(' '.join(
                noun for noun in komoran.nouns(str(sentence))
                if noun not in stopwords and len(noun) > 1))

    keywords = summarize_with_keywords(nouns,
                                       min_count=3,
                                       max_length=10,
                                       beta=0.85,
                                       max_iter=10,
                                       stopwords=stopwords,
                                       verbose=True)
    print(keywords)

#=========================태그 하나씩만 추출
from krwordrank.sentence import summarize_with_sentences

# Set of Korean words bound to the module-level name `stopwords`.
# NOTE(review): presumably consumed by the sentence summarizer in the loop
# below -- that call is not visible in this excerpt; confirm.
stopwords = {
    '영화', '관람객', '너무', '정말', '보고', '우리', '아니', '대상', '것이다', '있는', '것으로', '웨어'
}

for false_tag in false_tags:
    doc_id = false_tag[0]  #도큐먼트 id 추출
    content = false_tag[1]