Example #1
def keyword_extract():
    okt = Okt()
    data = request.get_json()
    texts = data
    wordrank_extractor = KRWordRank(
        min_count=4,  # minimum word frequency (when building the graph)
        max_length=12,  # maximum word length
        verbose=True)

    beta = 0.5  # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
    if request.method == 'POST':
        word_list = []
        test = {}
        r_list = []
        for word, r in sorted(keywords.items(),
                              key=lambda x: x[1],
                              reverse=True)[:30]:
            # print('%8s:\t%.4f' % (word, r))
            word_list.append(word)
            r_list.append(r)
            test[word] = r
        new_word_list = [' '.join(okt.nouns(word)) for word in test]
        while '' in new_word_list:
            new_word_list.remove('')
        print(new_word_list)
        print(test.keys())
        # print(test)
        # print(word_list)
        return json.dumps(test, ensure_ascii=False)
    return 'wordExtract'
Example #2
def make_list():
    file_list = search('./Result_free/')  # change only the folder path as needed
    data = file_read(file_list)

    wordrank_extractor = KRWordRank(
        min_count=10,  # minimum word frequency (when building the graph)
        max_length=15,  # maximum word length
        verbose=True)

    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10

    input_text = ''
    text = []
    for content in data:
        text.append(content['text'])
        #text.append(content['text'])  # uncomment if the board posts have a title
        for comment in content['comment_text']:
            text.append(comment)

    input_text = ' '.join(text)

    keywords, rank, graph = wordrank_extractor.extract(text, beta, max_iter)

    return keywords, rank, graph
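
A minimal usage sketch (assuming the `search` and `file_read` helpers used above are importable):

keywords, rank, graph = make_list()
for word, score in sorted(keywords.items(), key=lambda x: -x[1])[:10]:
    print('%8s:\t%.4f' % (word, score))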
Example #3
def krwordrank_noun(sentence_list=[],
                    min_count=5,
                    max_length=10,
                    beta=0.85,
                    max_iter=10,
                    verbose=False):
    krword_rank_noun = []
    krword_rank_noun_score = {}

    wordrank_extractor = KRWordRank(min_count, max_length, verbose)
    try:
        keywords, rank, graph = wordrank_extractor.extract(
            sentence_list, beta, max_iter)
        for word, r in sorted(keywords.items(),
                              key=lambda x: x[1],
                              reverse=True):
            # print(r, word)
            word = re.sub(r"\s+", " ", word)
            if len(word) > 1:
                word_cleansing = re.sub(
                    r'[-=+,#/\?:^$.@*\"※~&%ㆍ!”』\\‘|\(\)\[\]\<\>`\'…》\^\)\(▶]',
                    '', word)
                if len(word_cleansing) == len(word):
                    krword_rank_noun.append(word)
                    krword_rank_noun_score[word] = r
        return sorted_dict(krword_rank_noun_score)
    except Exception:
        krword_rank_noun = []
        krword_rank_noun_score = {}
        return sorted_dict(krword_rank_noun_score)
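
Usage sketch with hypothetical input (assumes `sorted_dict` returns the score dict sorted by value, as used above):

sentences = ['크리스마스에 본 라라랜드', '라라랜드 음악이 정말 좋았다']
noun_scores = krwordrank_noun(sentences, min_count=2, verbose=False)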
Example #4
    def mapper(row: pd.Series):
        extractor = KRWordRank(
            min_count=7,  # Minimum word occurrence
            max_length=15,  # Maximum word length
            verbose=False,
        )
        beta = 0.85  # decaying factor beta of PageRank
        max_iter = 10

        sentences = generate_input(row["articles"])

        try:
            score, rank, graph = extractor.extract(sentences, beta, max_iter)
            score = dict(filter(filter_stopwords, score.items()))
        except Exception as e:
            print(e)
            return None

        return dict({
            "date": os.path.splitext(_f)[0],
            "press": row["press"],
            "category": row["category"],
            "size": len(" ".join(sentences).encode("utf8")),
            "score": score,
            "rank_size": len(rank),
            "graph_size": len(graph),
        })
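
The mapper is meant to be applied row-wise; a hedged sketch (hypothetical DataFrame with `articles`, `press`, and `category` columns; `_f`, `generate_input`, and `filter_stopwords` are assumed to exist in the surrounding module):

records = df.apply(mapper, axis=1).dropna().tolist()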
Example #5
def wordRank():
    # Retrieve the text from Elasticsearch
    results = es.get(index='nkdb',
                     doc_type='nkdb',
                     id='5dc9fc5033ec463330e97e94')
    texts = json.dumps(results['_source'], ensure_ascii=False)

    # split the text by sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', texts)

    # normalize the text
    texts = [normalize(text, number=True) for text in sentences]

    wordrank_extractor = KRWordRank(
        min_count=3,  # Minimum frequency of word
        max_length=10,  # Maximum length of word
        verbose=True)

    beta = 0.85  # Decaying factor beta of PageRank
    max_iter = 10

    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    result = []
    # Build a list of {"label": word, "y": weight} entries for the top 30 keywords
    for word, r in sorted(keywords.items(), key=lambda x: x[1],
                          reverse=True)[:30]:
        result.append({"y": r, "label": word})

    return json.dumps(result, ensure_ascii=False)
Example #6
def __get_keyword(splited_sentence, decay_factor, max_iteration_num):
    try:
        pick_keyword = KRWordRank(min_count=4, max_length=10, verbose=True)
        # decay_factor: probability that a word keeps being preferred (i.e. does
        # not die out); usually set to 0.85
        # max_iteration_num: maximum number of iterations
        keyword, _, _ = pick_keyword.extract(splited_sentence, decay_factor,
                                             max_iteration_num)  # extract keywords
    except ValueError:
        keyword = "NULL"
    finally:
        return keyword
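
Usage sketch (hypothetical sentences; with too little text the extractor raises ValueError and "NULL" is returned):

keyword = __get_keyword(['첫 번째 문장', '두 번째 문장'], 0.85, 10)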
Example #7
def mkKeywords(texts):

    SIZE = 5

    extractor = KRWordRank(min_count=1, max_length=120)

    keywords, rank, graph = extractor.extract(texts, beta=0.85, max_iter=30)

    outputs = []
    for w, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True):
        outputs.append(w)

    # return json.dumps( dict( zip(range(SIZE), outputs[:SIZE] ) ),ensure_ascii=False)
    return dict(zip(range(SIZE), outputs[:SIZE]))
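
Usage sketch (hypothetical review texts); the result maps positions 0..4 to the top five keywords:

texts = ['영화가 정말 좋았다', '음악이 정말 좋았다']
top5 = mkKeywords(texts)  # {0: best keyword, ..., 4: fifth keyword}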
Example #8
def test_keyword(test_config):
    data_path = test_config['data_path']
    with open(data_path, encoding='utf-8') as f:
        texts = [line.rsplit('\t')[0].strip() for line in f]

    wordrank_extractor = KRWordRank(min_count=5, max_length=10)
    keywords, rank, graph = wordrank_extractor.extract(texts,
                                                       beta=0.85,
                                                       max_iter=10)
    selected_keywords = [
        word for word, r in sorted(
            keywords.items(), key=lambda x: x[1], reverse=True)[:30]
    ]
    assert selected_keywords[:5] == ['영화', '너무', '정말', '음악', '마지막']
    print('\nKR-WordRank: top 30 keywords from La La Land movie reviews\n{}\n'.format(selected_keywords))
Example #9
    def __init__(self,
                 min_count=2,
                 max_length=10,
                 beta=0.85,
                 max_iter=10,
                 verbose=True,
                 num_words=20):
        self.min_count = min_count
        self.max_length = max_length
        self.beta = beta
        self.max_iter = max_iter
        self.verbose = verbose
        self.num_words = num_words

        self.inst = KRWordRank(min_count, max_length, self.verbose)
Example #10
class KeywordExtractionKorean(BaseKeywordExtraction):
    def __init__(self,
                 min_count=2,
                 max_length=10,
                 beta=0.85,
                 max_iter=10,
                 verbose=True,
                 num_words=20):
        self.min_count = min_count
        self.max_length = max_length
        self.beta = beta
        self.max_iter = max_iter
        self.verbose = verbose
        self.num_words = num_words

        self.inst = KRWordRank(min_count, max_length, self.verbose)

    def __call__(self, *args, **kwargs):
        #print(str(args[0]) + "\n")
        keywords, rank, graph = self.inst.extract(args[0], self.beta,
                                                  self.max_iter,
                                                  self.num_words)

        return keywords
Example #11
def get_keywords(title, text):
    """

    :param title: title of article
    :param text: body of article
    :return: key_words
    """
    texts = text
    texts = [texts]
    texts = [normalize(text, english=True, number=True) for text in texts]

    wordrank_extractor = KRWordRank(
        min_count=2,  # minimum word frequency (when building the graph)
        max_length=10,  # maximum word length
        verbose=True)

    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10

    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    # keep only noun forms of the ranked keywords (Komoran POS tagging)
    tagger = Komoran()
    stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV')])
    keyword_list = []
    for i in keywords:
        noun = tagger.nouns(i)
        if noun != []:
            keyword_list.append([noun[0], keywords[i]])

    keywords = []
    for i in keyword_list[:5]:
        keywords.append(i[0])

    title_keywords = []

    for j in keywords:
        if j in title:
            title_keywords.append(j)

    title_keywords = [w for w in title_keywords if w not in stopword_list]

    return title_keywords
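
Usage sketch (hypothetical article; assumes the module-level `stopword_list` referenced above is defined, and note that very short inputs may raise ValueError from the extractor):

title = '라라랜드 음악 리뷰'
body = '라라랜드의 음악은 정말 아름답다. 영화 속 음악이 오래 기억에 남는다.'
print(get_keywords(title, body))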
Example #12
def keyword_extract(lyrics):
    for l in lyrics:
        if len(l['lyric']) < 10:
            print(l['lyric'])
            continue
        # print(l['track_id'])

        wordrank_extractor = KRWordRank(
            min_count=5,  # minimum word frequency (when building the graph)
            max_length=10,  # maximum word length
            verbose=True)

        beta = 0.85  # decaying factor beta of PageRank
        max_iter = 10

        keywords, rank, graph = wordrank_extractor.extract(
            l['lyric'], beta, max_iter)
        l['keywords'] = keywords

    return lyrics
Example #13
def extract_keyword_textrank(input_list):
    min_count = 5
    max_length = 10
    wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)
    beta = 0.85
    max_iter = 10
    texts = input_list

    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

    stop_words = {'뉴스', '기자', '기사', '평점', '주연', '방송', '편성표'}

    filtered_words = {
        word: score
        for word, score in sorted(keywords.items(), key=lambda x: -x[1])[:100]
        if word not in stop_words
    }

    related_keyword = list(filtered_words.keys())

    return related_keyword[:15]
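
Usage sketch, following the document's own placeholder convention (the input must be large enough to satisfy min_count=5):

docs = []  # fill with a sufficiently large list of sentences
print(extract_keyword_textrank(docs))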
Example #14
def keywords_kor():

    from krwordrank.word import KRWordRank

    # using KR-WordRank
    min_count = 1  # minimum word frequency (when building the graph); originally 5
    max_length = 10  # maximum word length
    wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)

    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    # texts = ['an example sentence', 'several sentences as a list of str', ... ]
    keywords, rank, graph = wordrank_extractor.extract(
        [globalVariable.fullText_f], beta, max_iter)

    for word, r in sorted(keywords.items(), key=lambda x: x[1],
                          reverse=True)[:30]:
        globalVariable.keywordArr.append(word)
        # print('%8s:\t%.4f' % (word, r))

    globalVariable.keywordArr = globalVariable.keywordArr[:5]
Example #15
def kr_wordrank(load_version):
    # flat (1-D) list: one consultation record per element
    with open('data/revision/revision_' + load_version + '.txt',
              'r',
              encoding='utf-8',
              newline='\n') as file:
        list_corpus = [sentence.strip() for sentence in file]

    wordrank_extractor = KRWordRank(
        min_count=5,  # minimum word frequency (when building the graph)
        max_length=10,  # maximum word length
        verbose=True)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(list_corpus, beta,
                                                       max_iter)

    for word, r in sorted(keywords.items(), key=lambda x: x[1],
                          reverse=True)[:100]:
        print('%8s:\t%.4f' % (word, r))
Example #16
def make_wordcloud(movie_id_list, comments_list):
    ### build a keyword dict from word frequencies, then generate and save a word cloud
    for idx, texts in enumerate(comments_list):
        wordrank_extractor = KRWordRank(
            min_count=2,  # minimum word frequency (when building the graph)
            max_length=6,  # maximum word length
            verbose=True)

        beta = 0.85  # decaying factor beta of PageRank
        max_iter = 6
        keywords, rank, graph = wordrank_extractor.extract(
            texts, beta, max_iter)
        keywords = dict(
            sorted(keywords.items(), key=lambda x: x[1],
                   reverse=True)[:min(len(keywords.keys()), 610)])

        # drop unwanted words
        key_set = set(keywords.keys())
        for w in exclude_set & key_set:
            keywords.pop(w)

        wordcloud = WordCloud(font_path=font_path,
                              width=900,
                              height=300,
                              background_color="white",
                              mask=mask)

        wordcloud = wordcloud.generate_from_frequencies(keywords)
        wordcloud.recolor(color_func=color_func, random_state=1)

        #     plt.figure(figsize=(10, 10))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        # plt.show()
        wordcloud.to_file(
            "movieapp/static/movieapp/img/wordcloud/{}.png".format(
                movie_id_list[idx]))
        print('{} done'.format(movie_id_list[idx]))
Example #17
    def do_wr_keyword(self, video_name, video_description, comments, video_idx):
        min_count = 2  # minimum word frequency (when building the graph)
        max_length = 10  # maximum word length
        wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length, verbose=False)

        beta = 0.85  # decaying factor beta of PageRank
        max_iter = 10
        inputs = [video_name, video_description] + comments
        inputs = [v for v in inputs if v]

        if len(inputs) <= 3:
            print("No Korean")
            return []

        try:
            keywords, rank, graph = wordrank_extractor.extract(inputs, beta, max_iter)
        except ValueError:
            return []

        insert_list = []
        print("#### wordrank, 제목 및 설명 포함 키워드 목록 ####")
        for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True):
            if word in video_name or word in video_description:
                if self.do_sql:
                    if r > 1.0:
                        insert_list.append(f"({video_idx}, '{word[:99]}'),")
                else:
                    print("%8s:\t%.4f" % (word, r))

        print("#### wordrank, 전체 키워드 목록 ####")

        for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]:
            if self.do_sql:
                insert_list.append(f"({video_idx}, '{word[:99]}'),")
            else:
                print("%8s:\t%.4f" % (word, r))

        return insert_list
Example #18
def extract_krwordrank(normalized_lines, noun_count_df):
    import pandas as pd

    print("extract_krwordrank")
    from krwordrank.word import KRWordRank

    wordrank_extractor = KRWordRank(
        min_count=min(noun_count_df["count"]),  # minimum word frequency
        max_length=max(noun_count_df["tag"].str.len()),  # maximum word length
        verbose=True)

    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10

    # keywords: the L parts that survive filtering
    # rank: the rank score of each substring in the substring graph
    # graph: the substring graph itself
    keywords, rank, graph = wordrank_extractor.extract(normalized_lines, beta,
                                                       max_iter)

    keyword_df = pd.DataFrame(list(keywords.items()), columns=['word', 'rank'])

    return keyword_df
Example #19
def summarize(text, num_summaries, summary_ratio=0.2):
    text_split = text.split('. ')
    if len(text_split) < num_summaries:
        return 'SummarizationError: Number of sentences must be bigger than num_summaries'

    wordrank_extractor = KRWordRank(min_count=3,  # minimum word frequency (when building the graph)
                                    max_length=20,  # maximum word length
                                    verbose=False)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    try:
        keywords, rank, graph = wordrank_extractor.extract(text_split, beta, max_iter, num_keywords=100)

        stopwords = set.union(get_stopwords(), set(stopwords_ko))
        vocab_score = make_vocab_score(keywords, stopwords, scaling=lambda x: 1)
        tokenizer = MaxScoreTokenizer(vocab_score)
        # tokenizer_mecab = Mecab()

        # keep summarizing until the summary reaches the target length
        text_summary = ""
        iter_num = 1
        while len(text_summary) < len(text) * summary_ratio:
            if len(text_split) < num_summaries * iter_num:
                break

            sents = keysentence(vocab_score,
                                text_split,
                                tokenizer.tokenize,  #tokenizer_mecab.nouns
                                diversity=0.7,
                                topk=num_summaries * iter_num)
            text_summary = '. '.join(sents)
            iter_num += 1

    except ValueError as e:
        return "SummarizationError: " + str(e)

    return text_summary
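
Usage sketch (hypothetical input file; `get_stopwords`, `stopwords_ko`, `make_vocab_score`, `MaxScoreTokenizer`, and `keysentence` are assumed importable as above):

article_text = open('article.txt', encoding='utf-8').read()  # hypothetical file
print(summarize(article_text, num_summaries=3, summary_ratio=0.2))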
Example #20
def get_key_sentences(text, num):
    import re

    print(text)
    sentences = re.split("[.?!]", text)
    sentences = [s.strip() for s in sentences]
    if len(sentences) < num:
        return [
            "KeySentencesError: Number of sentences must be bigger than num"
        ]

    wordrank_extractor = KRWordRank(
        min_count=3,  # minimum word frequency (when building the graph)
        max_length=20,  # maximum word length
        verbose=False)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    try:
        keywords, rank, graph = wordrank_extractor.extract(sentences,
                                                           beta,
                                                           max_iter,
                                                           num_keywords=100)

        stopwords = get_stopwords()
        vocab_score = make_vocab_score(
            keywords, stopwords, scaling=lambda x: 1)  # scaling=lambda x: 1
        tokenizer = MaxScoreTokenizer(vocab_score)

        sents = keysentence(
            vocab_score,
            sentences,
            tokenizer.tokenize,  # tokenizer_mecab.nouns
            diversity=0.6,
            topk=num)
        return sents
    except ValueError as e:
        return ["KeySentencesError: " + str(e)]
Example #21
def Keyword(texts):
    SIZE = 5
    extractor = KRWordRank(min_count=1, max_length=120)
    keywords, rank, graph = extractor.extract(texts, beta=0.85, max_iter=30)

    okt = Okt()

    wanted_pos = [
        'Noun',
    ]

    outputs = []
    for w, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True):
        pos = [n[0] for n in okt.pos(w) if n[1] in wanted_pos]
        outputs.extend(pos)

    print(outputs[:5])

    jsonData = json.dumps(dict(zip(range(SIZE), outputs[:SIZE])),
                          ensure_ascii=False)
    # SIZE = 5
    # with open("tagsOfArticle.json", "w") as f:
    #     json.dump( dict( zip(range(SIZE), outputs[:SIZE] ) ), f, ensure_ascii=False)
    return jsonData
Example #22
# -*- coding: utf-8 -*-
# for testing soynlp
from krwordrank.word import KRWordRank
import json
from docx2python import docx2python
import numpy as np
from krwordrank.word import summarize_with_keywords

min_count = 5  # minimum word frequency (when building the graph)
max_length = 10  # maximum word length
wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)


def json_2_list_Contents(json_path):
    temp_list = []
    with open(json_path, 'r', encoding='UTF8') as data_file:
        json_data = json.load(data_file)
        for w in json_data:
            if isinstance(w['content'][0], list):
                # print(type(w['content'][0]))
                # print(w['content'])
                for wl in w['content']:
                    temp_list.extend(wl)
                # temp = np.array(w['content']).flatten().tolist()
                # temp_list.extend(temp)

            else:
                # print(type(w['content'][0]))
                # print(w['content'])
                temp_list.extend(w['content'])
        return temp_list
Example #23
def get_texts_scores(fname):
    with open(fname, encoding='utf-8') as f:
        docs = [doc.split('.') for doc in f]
        if not docs:
            return []
        # return only the last document, split into sentences
        return docs.pop()

texts = get_texts_scores(fname)
print(texts)

import sys
sys.path.append('../')
from krwordrank.word import KRWordRank
from krwordrank.hangle import normalize
import krwordrank

wordrank_extractor = KRWordRank(
    min_count=3,  # minimum word frequency (when building the graph)
    max_length=8,  # maximum word length
    verbose=True)

beta = 0.85  # decaying factor beta of PageRank
max_iter = 10

keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))
Example #24
# string length of the longest noun
print("max word length ", max(noun_count_df["tag"].str.len()))
# minimum occurrence count among the nouns
print("min occurrence count ", min(noun_count_df["count"]))


frequency_noun_list = noun_count_df[noun_count_df["count"] > 0]["tag"].tolist()


from krwordrank.word import KRWordRank

wordrank_extractor = KRWordRank(
    min_count=min(noun_count_df["count"]),  # minimum word frequency
    max_length=max(noun_count_df["tag"].str.len()),  # maximum word length
    verbose=True)

beta = 0.85  # decaying factor beta of PageRank
max_iter = 10

# keywords: the L parts that survive filtering
# rank: the rank score of each substring in the substring graph
# graph: the substring graph itself
keywords, rank, graph = wordrank_extractor.extract(normalized_lines, beta,
                                                   max_iter)


keyword_df = pd.DataFrame(list(keywords.items()), columns=['word', 'rank'])
Example #25
                                        i)  # number of the board to crawl
    for article_dict in articles:
        texts.append(article_dict['article']['title'])
        texts.append(article_dict['article']['text'])
        for comment in article_dict['comments']:
            comment_text = comment['text']
            if comment_text == "삭제된 댓글입니다.":  # "This is a deleted comment."
                continue
            else:
                texts.append(delete_word(comment_text))
    time.sleep(1)

print(texts)

wordrank_extractor = KRWordRank(
    min_count=3,  # minimum word frequency (when building the graph)
    max_length=10,  # maximum word length
    verbose=True)

beta = 0.85  # decaying factor beta of PageRank
max_iter = 10

# keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
keywords = summarize_with_keywords(texts)
for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

krwordrank_cloud = WordCloud(font_path=font_path,
                             width=800,
                             height=800,
                             background_color="white")
popstrings = "내가 그냥 근데 너무"  # filler words: "I", "just", "but", "too"
Example #26
    "가",
]

min_count = 5  # minimum word frequency (when building the graph)
max_length = 10  # maximum word length
beta = 0.85  # decaying factor beta of PageRank
max_iter = 10
verbose = True

# extract keywords one document at a time.
for false_tag in false_tags:
    doc_id = false_tag[0]  # extract the document id
    content = false_tag[1]
    sentences = kkma.sentences(content)  # split the extracted content into sentences.

    nouns = []
    for sentence in sentences:
        if sentence != '':
            nouns.append(' '.join([
                noun for noun in kkma.nouns(str(sentence))
                if noun not in stopwords and len(noun) > 1
            ]))

    wordrank_extractor = KRWordRank(min_count, max_length)
    keywords, rank, graph = wordrank_extractor.extract(nouns, beta, max_iter,
                                                       verbose)
    for word, r in sorted(keywords.items(), key=lambda x: x[1],
                          reverse=True)[:30]:
        print(doc_id)
        print('%8s:\t%.4f' % (word, r))
Example #27
import os, sys

cur_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(cur_dir)
os.chdir('..')
os.chdir('./public')

sentence_pattern = re.compile('\n+|[.?!]')

df = pd.read_csv('./contents.csv')
data = df[['title', 'body']].agg('\n'.join, axis=1)
split_data = [sentence_pattern.split(row) for row in data]

min_count = 4  # minimum word frequency (when building the graph)
max_length = 10  # maximum word length
wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)

beta = 0.85  # decaying factor beta of PageRank
max_iter = 10
verbose = True

df.tagList = df.tagList.astype(str)
for i, row in enumerate(split_data):
    try:
        keywords, rank, graph = wordrank_extractor.extract(row, beta, max_iter)
        print(
            f'[success] index: {i}, len: {len(row)}, keywords: {tuple(keywords.keys())}'
        )
        df.at[i, 'tagList'] = ' '.join(keywords.keys())
    except Exception:
        print(f'[fail] index: {i}, len: {len(row)}')
Example #28
keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

for word, r in sorted(keywords.items(), key=lambda x:x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

'''
'''
from krwordrank.sentence import summarize_with_sentences

#keywords, sents = summarize_with_sentences(texts, num_keywords=100, num_keysents=10)

penalty=lambda x:0 if (15 <= len(x) <= 80) else 1

keywords, sents = summarize_with_sentences(
    texts, penalty=penalty,
    diversity=0.5,
    num_keywords=100,
    num_keysents=2,
    verbose=False
)
print(keywords)
print(sents)
'''

from krwordrank.word import KRWordRank

#texts = [] # Comments about 'La La Land (2016)'
wordrank_extractor = KRWordRank(min_count=5, max_length=10)
keywords, rank, graph = wordrank_extractor.extract(texts, num_keywords=20)
print(keywords)
Example #29
# La La Land
fname = './data_analysis/fail_all.txt'
# fname = './data_analysis/pass_all.txt'
# texts, scores = get_texts_scores(fname)
texts = get_texts(fname)

from krwordrank.word import KRWordRank
import krwordrank
print(krwordrank.__version__)

# train KR-WordRank model

wordrank_extractor = KRWordRank(
    min_count = 1, # minimum word frequency (when building the graph)
    max_length = 10, # maximum word length
    verbose = True
    )

beta = 0.85    # decaying factor beta of PageRank
max_iter = 10

keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

# Check top 30 keywords with corresponding score

for word, r in sorted(keywords.items(), key=lambda x:-x[1])[:30]:
    print('%8s:\t%.4f' % (word, r))

# remove stopwords
Example #30
def insert_keyword():
    mongoDB = myMongoDB("CapstoneTest")

    okt = Okt()
    min_count = 1  # minimum word frequency (when building the graph)
    max_length = 10  # maximum word length
    string_idx = 0
    total_clean_sentence = []
    string_id = []

    stop_words = [
        '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람',
        '주', '섯알', '가운데', '보이', '아니', '등', '같', '우리', '때', '년', '가', '한', '지',
        '대하', '오', '말', '일', '김재', '종', '매사', '스스로', '하자', '그렇', '위하', '대한',
        '확', '관련', '이상', '미만', '경우', '텔레', '다시', '때문', '대규모', '뭔가', '디섐보',
        '퍼터', '제대로', '관', '지난', '비준', '지난해', '위해', '곳곳', '현재', '당일', '주요',
        '일대', '기', '날', '코로', '물이', '간사', '요즘', '거기', '내', '지금', '정도', '이번',
        '처음', '모두', '통해', '더욱', '앞서', '진짜', '거', '올레', '가가', '해도', '한번', '원래',
        '사실', '옆', '정말', '올해', '스', '민', '초', '최근', '앞', '역시', '이후', '군', '먼저',
        '노', '해당', '최고', '가장', '중', '양', '대해', '사이', '얼마', '아주', '대비', '셈',
        '각국', '실거주', '실수요자', '실', '대부분', '섯알', '셀', '내년', '유독', '언제', '문득',
        '늘', '다른', '동안', '덩', '역시', '당시', '최', '변', '살', '이번', '씨', '랄라블',
        '점차', '건수', '번', '쥴', '리', '상대로', '송', '이제', '매년', '곳', '오늘', '듯',
        '아무', '괜', '하나', '차지', '오히려', '순간', '속', '누군가', '밥주', '스마', '문하', '정유',
        '주얼', '좀더', '먼저', '디섐보', '일주', '것처', '에브리',
        '이전', '비대', '각종', '임', '누구', '일일', '필', '부', '트럼', '초등학', '이하', '에브리'
    ]

    for content in mongoDB.collected.find({}, {"_id": 1, "content": 1}):
        cleaned_sentence = []
        clean_sentence = []
        string_id.append(list(content.values())[0])
        string = list(content.values())[1]
        string = string.replace(u'\xa0', u' ')
        string = string.replace(u'\n', u' ')
        string = string.replace(u'\r', u' ')
        clean_sentence.append(sent_tokenize(string))
        for i in clean_sentence:
            for j in i:
                cleaned_sentence.append(j)
            total_clean_sentence.append(cleaned_sentence)

    for clean_sentence in total_clean_sentence:
        noun_keyword_list = []
        stop_keyword_list = []
        keyword_list = []
        wordrank_extractor = KRWordRank(min_count=min_count,
                                        max_length=max_length)
        beta = 0.85
        max_iter = 10

        try:
            keywords, rank, graph = wordrank_extractor.extract(
                clean_sentence, beta, max_iter)
        except ValueError:
            mongoDB.collected.update_one({'_id': string_id[string_idx]},
                                         {'$set': {
                                             'keyword': 'keywords'
                                         }})
            string_idx += 1
            continue

        for word, r in sorted(keywords.items(),
                              key=lambda x: x[1],
                              reverse=True):
            keyword_list.append(word)
        for i in keyword_list:
            a = okt.pos(i)
            if a[0][1] == 'Noun':
                noun_keyword_list.append(a[0][0])

        for i in noun_keyword_list:
            if i not in stop_words:
                stop_keyword_list.append(i)
        if len(stop_keyword_list) == 0:
            stop_keyword_list.append('')

        s1 = set(stop_keyword_list)
        s1_list = list(s1)
        s2_list = s1_list[:5]

        mongoDB.collected.update_one(
            {'_id': string_id[string_idx]},
            {'$set': {
                'keyword': s1_list,
                'point_keyword': s2_list
            }})
        string_idx += 1
Example #31
from krwordrank.word import KRWordRank
from krwordrank.sentence import make_vocab_score
from krwordrank.sentence import MaxScoreTokenizer
from krwordrank.sentence import keysentence
# read the text to analyze
fileName = 'kor_input.txt'
texts = []
with open(fileName, encoding='utf-8-sig') as file:
    for line in file:
        texts.append(line.split(',')[-1].rstrip())  # depends on your text structure.

# train the keyword extractor
wordrank_extractor = KRWordRank(
    min_count=5,  # minimum word frequency
    max_length=10,  # maximum word length
    verbose = True
)
beta = 0.85
max_iter = 10

keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter, num_keywords=100)

# keyword output section
with open('kor_word_output.txt',mode='w',encoding='utf-8-sig') as file:
    for word, r in sorted(keywords.items(), key=lambda x:x[1], reverse=True)[:10]:
        file.write('%8s:\t%.4f\n' % (word, r))

stopwords = set()  # no stopwords applied here
vocab_score = make_vocab_score(keywords, stopwords, scaling=lambda x : 1)
tokenizer = MaxScoreTokenizer(vocab_score)  # tokenizer that extracts words within sentences