Example 1
 def __init__(self):
     self.okt = Okt()
Example 2
m += str(now.month)
if (len(m) < 2):
    m = "0" + m
date += m
date += str(now.day)
print("date ", date)

url = 'https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=pnt&date=' + date

driver = webdriver.Chrome(r'C:\chromedriver')  ## path to the ChromeDriver executable
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
#-------------------------- currently showing page --------------------------#

twitter = Okt()

arr_rank_name = []
arr_rank_img = []
arr_rank_smr = []
arr_rank_words = []
arr_rank_score = []
arr_rank_director = []
arr_rank_actors = []


## Visit each movie page and extract its information
def get_inform():

    ## extract the title
    name = ""
Example 3
import codecs
from konlpy.tag import Okt
from gensim.models import word2vec

fp = codecs.open('hong.txt', 'r', encoding='utf-8')
text = fp.read()

twitter = Okt()
results = []
lines = text.split('\r\n')
for line in lines:
    # morphological analysis
    # use the base form of each word
    malist = twitter.pos(line, norm=True, stem=True)
    r = []
    for word in malist:
        # exclude endings, particles and punctuation
        if not word[1] in ['Josa', 'Eomi', 'Punctuation']:
            r.append(word[0])
    r1 = (' '.join(r)).strip()
    results.append(r1)
    print(r1)
# write the results to a file
wakati_file = 'hong.wakati'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(results))
# build the Word2Vec model
data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
model.save('hong.model')
print('\n\n========================== 분석 완료 =============================')
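# A short follow-up sketch (not in the original): reloading the saved model and
# querying it for similar words; '홍길동' is a hypothetical query word.
model = word2vec.Word2Vec.load('hong.model')
print(model.wv.most_similar('홍길동', topn=5))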
Example 4
        )[0].text
        contents += content_5
        content_6 = soup.select(
            'div[id=sub01_02_pop02_tab02] > div > ul > li:nth-of-type(5) > p'
        )[0].text
        contents += content_6
        content_7 = soup.select(
            'div[id=sub01_02_pop02_tab02] > div > ul > li:nth-of-type(6) > p'
        )[0].text
        contents += content_7
        content_8 = soup.select(
            'div[id=sub01_02_pop02_tab02] > div > ul > li:nth-of-type(7) > p'
        )[0].text
        contents += content_8

    okt = Okt()
    text_data = []
    ex_nouns = okt.nouns(contents)
    print(contents)

    for text in ex_nouns:
        if text != '및' and text != '를' and text != '함' and text != '등':
            text_data.append(text)

    word_count = {}

    for noun in text_data:
        word_count[noun] = word_count.get(noun, 0) + 1

    counter = Counter(word_count)
    top10 = counter.most_common(10)
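    # A follow-up sketch (not in the original): rendering the counts as a word cloud.
    # The font path is an assumption; Korean text needs a Korean-capable font.
    from wordcloud import WordCloud
    wc = WordCloud(font_path='NanumGothic.ttf', background_color='white', width=800, height=600)
    wc.generate_from_frequencies(word_count)
    wc.to_file('keywords.png')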
Example 5
class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.okt = Okt()
        # stopwords = meaningless words to exclude
        self.stopwords = [
            '중인',
            '만큼',
            '마찬가지',
            '꼬집었',
            "연합 뉴스",
            "데일리",
            "동아일보",
            "중앙일보",
            "조선일보",
            "기자",
            "아",
            "휴",
            "아이구",
            "아이쿠",
            "아이고",
            "어",
            "나",
            "연합",
            "자료사진",
            "우리",
            "저희",
            "따라",
            "의해",
            "을",
            "를",
            "에",
            "의",
            "가",
        ]

    def url2sentences(self, url):
        article = Article(url, language='ko')
        article.download()
        article.parse()
        sentences = self.kkma.sentences(article.text)

        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''

        return sentences

    def text2sentences(self, text):
        sentences = self.kkma.sentences(text)
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''

        return sentences

    def get_nouns(self, sentences):
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([
                    noun for noun in self.okt.nouns(str(sentence))
                    if noun not in self.stopwords and len(noun) > 1
                ]))

        return nouns
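# A minimal usage sketch (not part of the original); the URL is hypothetical and the
# imports this class relies on (Kkma, Okt and newspaper's Article) are assumed to be in place.
st = SentenceTokenizer()
sentences = st.url2sentences('https://news.example.com/some-article')
print(st.get_nouns(sentences)[:3])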
import re

import pandas as pd
import numpy as np

from wordcloud import STOPWORDS
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
from konlpy.tag import Komoran, Kkma, Okt, Hannanum, Mecab, Twitter

# nltk.download('punkt')

frame_list = []
lineSentence_list = []
token_set = set()

myframe = pd.read_csv('review_concat_csv02.csv', index_col=0)
imo_pattern = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
okt = Okt()
hangul_stopword = [
    x.strip()
    for x in open('hangle_stop_word.txt', 'r', encoding='utf-8').readlines()
]
# nltk.download('punkt')
total_len = 0
for idx in range(42995):
    # list of sentences for this review
    text = myframe['text'][idx]
    review_id = idx
    sent_list = sent_tokenize(text)
    # tokenize the words in each sentence
    one_review_sentence_list = []
    for sent in sent_list:
        # remove emojis that cannot be tokenized
Example 7
def draw_networkx(tmp_content):
    f = open('./stop_words.txt', 'r')
    stop_words = f.readlines()
    f.close()
    pattern2 = '[\n]'
    stop_words = list(
        map(lambda x: re.sub(pattern=pattern2, repl='', string=stop_words[x]),
            range(0, len(stop_words))))
    #print(stop_words)

    okt = Okt()
    tmp_nouns = []
    for i in range(0, np.shape(tmp_content)[0]):
        tmp_nouns.append(okt.nouns(tmp_content.content[i]))
    tmp_noun_list = []
    for sent in tmp_nouns:
        sent_noun = []
        for w in sent:
            if w not in stop_words:
                sent_noun.append(w)
        tmp_noun_list.append(sent_noun)
    bigram = Phrases(tmp_noun_list, min_count=1, threshold=10)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    docs = []
    for i in tmp_noun_list:
        tmp_i_ = bigram_mod[i]
        docs.append(tmp_i_)
    noun_list = list(itertools.chain.from_iterable(docs))
    noun_cnt = Counter(noun_list)
    noun_df = pd.DataFrame.from_dict(
        (dict(noun_cnt)), orient='index').reset_index().rename(columns={
            'index': 'words',
            0: 'freq'
        })
    noun_df = noun_df.groupby(['words']).sum().reset_index()
    uniq_noun = Counter(dict(zip(noun_df['words'], noun_df['freq'])))
    uniq_key = list(uniq_noun.keys())
    noun_index = {noun: i for i, noun in enumerate(uniq_noun)}
    occurs = np.zeros([len(docs), len(uniq_noun)])

    for i, sent in enumerate(docs):
        for w in sent:
            index = noun_index[w]
            occurs[i][index] = 1
    co_occur = occurs.T.dot(occurs)

    G1 = nx.Graph()
    tmp_G1_node = []
    for i in uniq_noun.keys():
        if uniq_noun[i] > 25:
            G1.add_node(i, size=uniq_noun[i])
            tmp_G1_node.append(i)
    for i in tmp_G1_node:  #G1.nodes():
        ind_ = noun_index[i]
        for j in range(0, occurs.shape[1]):
            if ind_ != j and co_occur[ind_][j] > 5:
                G1.add_edge(uniq_key[ind_],
                            uniq_key[j],
                            weight=co_occur[ind_][j])

    pos_ = nx.kamada_kawai_layout(G1)
    edge_trace = []
    for edge in G1.edges():
        char_1 = edge[0]
        char_2 = edge[1]
        x0, y0 = pos_[char_1]
        x1, y1 = pos_[char_2]
        text = str(char_1) + '--' + str(char_2) + ': ' + str(
            G1.edges()[edge]['weight'])
        trace = make_edge([x0, x1, None], [y0, y1, None], text,
                          width=1)  #0.3*G.edges()[edge]['weight']**1)
        edge_trace.append(trace)
    node_trace = go.Scatter(x=[],
                            y=[],
                            text=[],
                            textposition="top center",
                            textfont_size=10,
                            mode='markers+text',
                            hoverinfo='none',
                            marker=dict(color=[], size=[], line=None))
    for node in G1.nodes():
        x, y = pos_[node]
        node_trace['x'] += tuple([x])
        node_trace['y'] += tuple([y])
        node_trace['marker']['color'] += tuple(['#0000FF'])
        if 'size' in G1.nodes()[node]:
            node_trace['marker']['size'] += tuple([G1.nodes()[node]['size']])
        else:
            node_trace['marker']['size'] += tuple([0])
        node_trace['text'] += tuple(['<b>' + node + '</b>'])

    return node_trace, edge_trace
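# make_edge() is called above but not shown in this snippet; below is a minimal sketch of it
# and of how the returned traces could be drawn. tmp_content is assumed to be a DataFrame
# with a 'content' column, as draw_networkx() expects.
def make_edge(x, y, text, width):
    # a single Plotly line trace for one edge
    return go.Scatter(x=x, y=y, line=dict(width=width, color='#888'),
                      hoverinfo='text', text=[text], mode='lines')


node_trace, edge_trace = draw_networkx(tmp_content)
fig = go.Figure(data=edge_trace + [node_trace])
fig.update_layout(showlegend=False, xaxis=dict(visible=False), yaxis=dict(visible=False))
fig.show()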
Example 8
from konlpy.tag import Okt
import numpy as np
import pandas as pd
import openpyxl
# First, this code removes punctuation, foreign characters/hanja and other symbols, and particles from the data.

news_data = pd.read_excel("./all_necessary_xlsx")
np_news_data = np.array(news_data)

okt = Okt()
wb = openpyxl.Workbook()

sheet = wb.active

n_contain = ["Punctuation", "Foreign", "Josa"]
for i in range(np_news_data.shape[0]):
    title = okt.pos(np_news_data[i][3])
    content = okt.pos(np_news_data[i][4])
    temptitle = []
    tempcontent = []
    for word, tag in title:
        if tag not in n_contain:
            temptitle.append(word)
            temptitle.append(" ")

    for word, tag in content:
        if tag not in n_contain:
            tempcontent.append(word)
            tempcontent.append(" ")

    t = "".join(temptitle)
    # print(a_url)
    source_article = urllib.request.urlopen((a_url))
    soup = BeautifulSoup(source_article, 'lxml', from_encoding='utf-8')

    contents = soup.select('div.article_txt')
    for imsi in contents:
        item = str(imsi.find_all(text=True))
        # print(item)
        msg = msg + item

print(msg)

from konlpy.tag import Okt
from collections import Counter

nlp = Okt()
nouns = nlp.nouns(msg)
result = []
for imsi in nouns:
    if len(imsi) > 1:
        result.append(imsi)
print(result)
count = Counter(result)
print(count)

tag = count.most_common(50)  # only the top 50 words go into the word cloud

import pytagcloud

taglist = pytagcloud.make_tags(tag, maxsize=100)
print(taglist[:10])
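# A follow-up sketch (not in the original): rendering the tags to an image file.
# 'korean' is assumed to be a font already registered with pytagcloud.
pytagcloud.create_tag_image(taglist, 'wordcloud.png', size=(600, 600), fontname='korean', rectangular=False)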
Example 10
UNK = "[UNK]"
CLS = "[CLS]"
MASK = "[MASK]"
SEP = "[SEP]"
SEG_A = "[SEG_A]"
SEG_B = "[SEG_B]"
NUM = "<num>"

padding_token = PAD
cls_token = CLS
sep_token = SEP

special_tokens = [PAD, START_TOKEN, END_TOKEN, UNK, CLS, MASK, SEP, SEG_A, SEG_B, NUM]  

# set the library function used for tokenization
split_fn = Okt().morphs

# config for one-hot encoding the label data
ner_tag_size = 0

# list variables holding the preprocessing results of the loaded data
list_of_total_source_no, list_of_total_source_str, list_of_total_target_str = [], [], []


# preprocessing list variables
list_of_X_tokens = []
list_of_padded_X_token_ids_with_cls_sep = []
list_of_padded_ner_ids_with_cls_sep = []

# padding config
maxlen = 0
Example 11
import codecs
from konlpy.tag import Okt

fp = codecs.open('test.txt', 'r', encoding='utf-8')
text = fp.read()
# process the text line by line
twitter = Okt()
word_dic = {}
lines = text.split('\r\n')
for line in lines:
    malist = twitter.pos(line)
    for word in malist:
        if word[1] == 'Noun':
            if not (word[0] in word_dic):
                word_dic[word[0]] = 0
            word_dic[word[0]] += 1  # count it
# print the most frequently used nouns
keys = sorted(word_dic.items(), key=lambda x:x[1], reverse=True)
for word, count in keys[:50]:
    print('{0}({1}) '.format(word, count), end='')
print()
Example 12
def main(args):
    os.makedirs(args.save_path, exist_ok=True)
    df = pd.read_csv(args.data_path)
    df = df.iloc[:, 1:3]
    df = df.dropna()

    X = []
    for x in df['Comment']:
        X.append([x])

    y = []
    for i in df['Label']:
        y.append(i)

    train_X, test_X, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # remove emojis
    emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "]+", flags=re.UNICODE)

    # pattern for stray jamo, punctuation marks and control characters
    han = re.compile(r'[ㄱ-ㅎㅏ-ㅣ!?~,".\n\r#\ufeff\u200d]')



    train_X = preprocess(train_X)
    test_X = preprocess(test_X)

    stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
    okt = Okt()
    X_train = []
    for sentence in train_X:
        temp_X = []
        temp_X = okt.morphs(sentence[0], stem=True) # tokenize
        temp_X = [word for word in temp_X if not word in stopwords] # remove stopwords 
        print(temp_X)
        X_train.append(temp_X)

    X_test = []
    for sentence in test_X:
        temp_X = []
        temp_X = okt.morphs(sentence[0], stem=True) # tokenize
        temp_X = [word for word in temp_X if not word in stopwords] # remove stopwords
        X_test.append(temp_X)

    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # ============== Encoding =============== 

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    threshold = 3
    total_cnt = len(tokenizer.word_index) # number of words 
    rare_cnt = 0 # count of words with frequency less than the threshold
    total_freq = 0 # sum of all word frequencies in the train data
    rare_freq = 0 # summed frequency of words appearing less than the threshold

    # accept pair of word and frequency as key and value 
    for key, value in tokenizer.word_counts.items():
        total_freq = total_freq + value

        # if word frequency is less than threshold 
        if(value < threshold):
            rare_cnt = rare_cnt + 1
            rare_freq = rare_freq + value

    print('Size of vocabulary :',total_cnt)
    print('Words appearing %s times or fewer: %s' % (threshold - 1, rare_cnt))
    print("Percentage of rare words:", (rare_cnt / total_cnt)*100)
    print("Percentage of frequency of rare words:", (rare_freq / total_freq)*100)

    # Keep only words appearing at least threshold times; +2 accounts for the padding and OOV tokens
    vocab_size = total_cnt - rare_cnt + 2
    print('Size of vocabulary:',vocab_size)

    tokenizer = Tokenizer(vocab_size, oov_token = 'OOV') 
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)

    # Remove empty sample
    drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]
    X_train = np.delete(X_train, drop_train, axis=0)
    y_train = np.delete(y_train, drop_train, axis=0)
    print(len(X_train))
    print(len(y_train))

    # Padding
    print('Length of longest comment :',max(len(l) for l in X_train))
    print('Length of average comment :',sum(map(len, X_train))/len(X_train))
    plt.hist([len(s) for s in X_train], bins=50)
    plt.xlabel('length of samples')
    plt.ylabel('number of samples')
    plt.show()

    max_len = 30
    below_threshold_len(max_len, X_train)
    X_train = pad_sequences(X_train, maxlen = max_len)
    X_test = pad_sequences(X_test, maxlen = max_len)

    # =============== Train model ================

    # define model
    model = Sequential()
    model.add(Embedding(vocab_size, 100))
    model.add(LSTM(128))
    model.add(Dense(3, activation='softmax'))

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
    mc = ModelCheckpoint(os.path.join(args.save_path, 'best_model_raw_v2.h5'), monitor='val_acc', mode='max', verbose=1, save_best_only=True)

    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()])
    history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)

    # ============== Save results ==================
    plt.plot(history.history['recall'], label = 'recall')
    plt.plot(history.history['acc'], label = 'accuracy')
    plt.plot(history.history['precision'], label = 'precision')
    plt.legend(['recall','accuracy','precision'])
    plt.title('LSTM metrics')
    plt.savefig(os.path.join(args.save_path, 'LSTM metrics'))

    plt.figure()
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.legend(['loss', 'validation loss'])
    plt.title('LSTM loss')
    plt.savefig(os.path.join(args.save_path, 'loss'))

    plt.figure()
    plt.plot(history.history['val_recall'])
    plt.plot(history.history['val_acc'])
    plt.plot(history.history['val_precision'])
    plt.legend(['recall','accuracy','precision'])
    plt.title('LSTM validation metrics')
    plt.savefig(os.path.join(args.save_path, 'LSTM validation metrics'))
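# below_threshold_len() is called in main() but not defined in this snippet; a minimal sketch,
# assuming it reports the share of samples whose length does not exceed max_len.
def below_threshold_len(max_len, nested_list):
    count = 0
    for sentence in nested_list:
        if len(sentence) <= max_len:
            count = count + 1
    print('Percentage of samples with length <= %s: %.2f%%' % (max_len, (count / len(nested_list)) * 100))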
Example 13
 def __init__(self):
     self.texts = []
     self.tokens = []
     self.okt = Okt()
     self.stopwords = []
     self.freqtxt = []
Example 14
class Service:
    def __init__(self):
        self.texts = []
        self.tokens = []
        self.okt = Okt()
        self.stopwords = []
        self.freqtxt = []
###########################################################################################
    def extract_token(self, payload):
        print('>>> text 문서에서 token 추출')
        filename = payload.context + payload.fname
        with open(filename, 'r', encoding='utf-8') as f:
            self.texts = f.read()
        print(f'{self.texts[:300]}')

    def extract_hanguel(self):
        print('>>> 한글만 추출')
        texts = self.texts.replace('\n', ' ')
        tokenizer = re.compile(r'[^ ㄱ-힣]')
        self.texts = tokenizer.sub('', texts)
        print(f'{self.texts[:300]}')

    def conversion_token(self):
        print('>>> 토큰으로 변환')
        self.tokens = word_tokenize(self.texts)
        print(f'{self.tokens[:300]}')

    def compound_noun(self):
        print('>>> 복합명사는 묶어서 fitering 으로 출력')
        print('>>> ex) 삼성전자의 스마트폰은 --> 삼성전자 스마트폰')
        noun_token = []
        for token in self.tokens:
            token_pos = self.okt.pos(token)
            temp = [txt_tag[0] for txt_tag in token_pos
                    if txt_tag[1] == 'Noun']
            if len("".join(temp)) > 1:
                noun_token.append("".join(temp))
        self.texts = " ".join(noun_token)
        print(f'{self.texts[:300]}')

    def extract_stopword(self, payload):
        print('>>> text 문서에서 token 추출')
        filename = payload.context + payload.fname
        with open(filename, 'r', encoding='utf-8') as f:
            self.stopwords = f.read()
        self.stopwords = self.stopwords.split(' ')
        print(f'{self.stopwords[:10]}')

    def filtering_text_with_stopword(self):
        print('>>> stopword 로 필터링 ')
        self.texts = word_tokenize(self.texts)
        self.texts = [text for text in self.texts
                      if text not in self.stopwords]
    def frequent_text(self):
        print('>>> 빈도수로 정렬 ')
        self.freqtxt = pd.Series(dict(FreqDist(self.texts)))\
            .sort_values(ascending=False)
        print(f'{self.freqtxt[:10]}')

    def draw_wordcloud(self, payload):
        print('>>> 워드크라우드 작성 ')
        filename = payload.context + payload.fname
        wcloud = WordCloud(filename,
                           relative_scaling=0.2,
                           background_color='white').generate(" ".join(self.texts))
        plt.figure(figsize=(12,12))
        plt.imshow(wcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
###########################################################################################
    # Takes a sentence and returns a list of (word, POS) tuples
    @staticmethod
    def sentence_pos(sentence):
        print('# before user dic')
        komo = Komoran()
        result = komo.pos(sentence)
        print('전체 확인하기')
        for myitem in result:
            somedata = '단어 : %s, 품사 : %s' % (myitem[0], myitem[1])
            print(somedata)
        print('-' * 30)
        return result

    '''
    Return value  # before user dic
    [('국정', 'NNG'), ('농', 'NNG'), 
    ('단', 'NNG'), ('태블릿 PC', 'NNP'), (',', 'SP'), 
    ('설', 'NNB'), ('진', 'NNP'), 
    ('욱', 'NA'), (',', 'SP'), ('가나', 'NNP'), ('다라', 'NNP')]
    '''

    @staticmethod
    def pos_to_noun(sentence):
        komo = Komoran()
        print('명사만 추출해보기')
        nouns = komo.nouns(sentence)
        print(nouns)
        return nouns
###########################################################################################
    # Creates a word cloud.
    @staticmethod
    def makeWordCloud(context,wordDict,imageFile,fontpath,filename):  # 워드 클라우드
        # Convert the image into a NumPy array.
        
        imageFile=context+imageFile
        fontpath =context+fontpath
        filename =context+filename
        alice_coloring = np.array(Image.open(imageFile))

        wordcloud = WordCloud(font_path=fontpath, mask=alice_coloring,
                              relative_scaling=0.2, background_color='lightyellow')
        wordcloud = wordcloud.generate_from_frequencies(wordDict)
        image_colors = ImageColorGenerator(alice_coloring)
        # random_state : fix the random seed
        newwc = wordcloud.recolor(color_func=image_colors, random_state=42)

        plt.figure(figsize=(16, 8))
        plt.imshow(newwc)
        plt.axis('off')
        plt.savefig(filename)

    @staticmethod
    def makeBarChart(context, wordlist, filename):  # bar chart
        filename = context+filename
        # Draw a bar chart using result.
        result = wordlist[0:10]  # first 10 items

        barcount = 10  # number of bars: draw only 10
        xlow, xhigh = - 0.5, barcount - 0.5

        result = wordlist[:barcount]
        chartdata = []  # chart values
        xdata = []  # tick labels
        mycolor = ['r', 'g', 'b', 'y', 'm', 'c', '#FFF0F0', '#CCFFBB', '#05CCFF', '#11CCFF']

        for idx in range(len(result)):
            chartdata.append(result[idx][1])
            xdata.append(result[idx][0])

            value = str(chartdata[idx]) + '건'  # e.g. '60건' (60 cases)
            # display the count above each bar
            plt.text(x=idx, y=chartdata[idx] - 20, s=value, fontsize=8, horizontalalignment='center')

        plt.xticks(range(barcount), xdata, rotation=45)
        plt.bar(range(barcount), chartdata, align='center', color=mycolor)

        plt.title('상위 ' + str(barcount) + '빈도수')
        plt.xlim([xlow, xhigh])
        plt.xlabel('주요 키워드')
        plt.ylabel('빈도수')

        plt.savefig(filename, dpi=400, bbox_inches='tight')
        print(filename + ' 파일이 저장되었습니다.')

    @staticmethod
    def make_wordlist(context,txt,stopwordTxt):

        filename = context +txt
        ko_con_text = open(filename, 'rt', encoding='utf-8').read()

        okt = Okt()
        token_ko = okt.nouns(ko_con_text)

        # stopwords : words excluded from the analysis regardless of their frequency
        stop_word_file = context +stopwordTxt
        stop_file = open(stop_word_file, 'rt', encoding='utf-8')
        stop_words = [word.strip() for word in stop_file.readlines()]
        # print(stop_words)

        token_ko = [each_word for each_word in token_ko if each_word not in stop_words]

        # nltk : natural language toolkit
        # token : a small segment of text
        ko = nltk.Text(tokens=token_ko)

        wordlist = list()  # list for (word, count) tuples
        # extract only the 500 most frequent words
        data = ko.vocab().most_common(500)
        # print(data)
        for word, count in data:
            if (count >= 50 and len(word) >= 2):
                wordlist.append((word, count))
        return wordlist

    @staticmethod
    def create_word2vec(context,filename,prepro_file,model_filename):
        myencoding = 'utf-8'
        filename = context +filename
        myfile = open(filename, 'rt', encoding=myencoding)
        soup = BeautifulSoup(myfile, 'html.parser')
        mydata = soup.text
        # print(mydata)

        results = []  # container for the results
        okt = Okt()

        datalines = mydata.split('\n')
        print(len(datalines))

        for oneline in datalines:
            mypos = okt.pos(oneline, norm=True, stem=True)
            # print(mypos)

            imsi = []  # temporary list
            for word in mypos:
                if not word[1] in ['Josa', 'Eomi', 'Punctuation', 'Verb']:
                    if len(word[0]) >= 2:
                        imsi.append(word[0])

            temp = (' '.join(imsi)).strip()
            results.append(temp)
            # break  # to be removed later

        # print(results)

        # save the cleaned text to a file
        with open(prepro_file, 'wt', encoding=myencoding) as myfile:
            myfile.write('\n'.join(results))

        print(prepro_file + ' 파일 생성됨')

        # word2vec : an algorithm that turns words into vectors
        # vector : a quantity that has both magnitude and direction
        # scalar : a value only
        # word similarity : cosine, Euclidean or Manhattan similarity

        # LineSentence : a helper that builds the sentences used for the analysis
        data = word2vec.LineSentence(prepro_file)
        print(type(data))

        # Word2Vec : builds a word2vec model from the given sentences.
        # size : vector dimensionality, window : window size, min_count : minimum frequency below which words are dropped
        # sg : 1 (skip-gram), 0 (CBOW)
        model = word2vec.Word2Vec(data, size=200, window=10, min_count=2, sg=1)
        print(type(model))

        # Use the save function to store the model.
        # The model file is a binary file.
        model.save(model_filename)
        print(model_filename + ' 파일 생성됨')

        print('finished')
        
    @staticmethod
    def showGraph(bargraph):
        length = len(bargraph)  # 요소 갯수
        # labels shown on the x axis
        myticks = list(mydata[0] for mydata in bargraph)
        # numeric data to be plotted
        chartdata = list(mydata[1] for mydata in bargraph)
        mycolor = ['b', 'g', 'r', 'c', 'm', 'y', 'k', '#56FFCC', '#00CCFF', '#CCDDEE']

        plt.figure()
        plt.barh(myticks, chartdata, color=mycolor, align='center')
        plt.yticks(range(length), myticks, rotation='10')
        plt.xlim(min(chartdata) - 0.02, max(chartdata) + 0.02)
        filename = 'word2vec_model_01.png'
        plt.savefig(filename)
        print(filename + ' 파일 저장됨')

    @staticmethod
    def makePie(piegraph):
        myticks = list(mydata[0] for mydata in piegraph)
        chartdata = list(mydata[1] for mydata in piegraph)
        mycolor = ['b', 'g', 'r', 'c', 'm']

        plt.figure()
        plt.pie(chartdata, colors=mycolor, labels=myticks, startangle=90, shadow=False,
                explode=(0, 0.05, 0, 0, 0), autopct='%1.2f%%', normalize=True)
        filename = 'word2vec_model_02.png'
        plt.savefig(filename)
        print(filename + ' 파일 저장됨')

        print('finished')
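# A minimal usage sketch (not part of the original); the context path and file names are
# hypothetical, and the nltk/matplotlib/Okt imports this class relies on are assumed to be in place.
if __name__ == '__main__':
    wordlist = Service.make_wordlist('./data/', 'kr-Report_2018.txt', 'stopwords.txt')
    Service.makeBarChart('./data/', wordlist, 'bar_chart.png')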
Example 15
from konlpy.tag import Okt
from gensim.models import Word2Vec
import gensim
import pandas as pd

okt = Okt()

df = pd.read_csv('CSV파일명.csv', encoding='utf-8')


def arrToStr(arr):
    return arr.str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", " ")


result = []
for i, rcp in enumerate(df['recipe']):
    tokenlist = okt.pos(rcp, stem=True, norm=True)
    temp = []
    for word in tokenlist:
        if word[1] in ["Noun"]:  # only when it is a noun
            temp.append((word[0]))
    temp = temp + df.loc[i, 'cleand'].split()
    result.append(temp)  # store in the results
    df.loc[i, 'cleand'] = str(' '.join(temp))
    if i % 500 == 0:
        print("%d번째 While문." % i)
        print(temp)

df.to_csv('CSV파일명.csv', encoding='utf-8')
print('총 샘플의 개수 : {}'.format(len(result)))
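# A follow-up sketch (not in the original): Word2Vec is imported above but never used, so this
# trains a model on the tokenized recipes. The hyperparameters are assumptions; gensim>=4 uses
# vector_size= where older versions use size=.
model = Word2Vec(sentences=result, vector_size=100, window=5, min_count=5, sg=0)
model.save('recipe_w2v.model')  # hypothetical file name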
Example 16
from konlpy.tag import Okt
from gensim.models import word2vec
# Load the Word2Vec model and prepare the morphological analyzer
model = word2vec.Word2Vec.load("./wiki.model") 
okt = Okt()
def print_emargency(text):
  print(text)
  # morphologically analyze the given sentence
  node = okt.pos(text, norm=True, stem=True)
  for word, form in node:
    # keep only the parts of speech we need
    if form == 'Noun' or form == 'Verb' or form == 'Adjective' or form == 'Adverb':
      # similarity to the word '급하다' (urgent)
      print("-", word, ":", model.wv.similarity(word, '급하다'))

print_emargency("컴퓨터에 문제가 생겼어요. 빨리 해결해야 하는 문제가 있어서 지원 요청합니다.")
print_emargency("사용 방법을 잘 모르겠습니다.")
Example 17
# plt.rcParams['axes.unicode_minus'] = False
plt.rcParams["figure.figsize"] = [12, 6]

# %%

df.tail()

# %%

from konlpy.tag import Kkma

kkma = Kkma()

from konlpy.tag import Okt

okt = Okt()

df["kkma"] = ''
df["okt"] = ''

for i in range(0, len(df)):
    # df['kkma'][i] = kkma.morphs(df["질문"][i])
    # df['okt'][i] = okt.morphs(df["질문"][i])
    df['kkma'][i] = kkma.pos(df["질문"][i])
    df['okt'][i] = okt.pos(df["질문"][i])

df.head()

# %%

import re
Example 18
def morphs_convert():

    target = 'all'

    # -------------------------------------

    target_name_txt = './data/text/' + target + '_names_kor.txt'
    target_name = './data/hexcolor_vf/kor_' + target + '_names.pkl'
    target_name_okt = './data/hexcolor_vf/kor_' + target + '_names_okt.pkl'
    target_name_mecab = './data/hexcolor_vf/kor_' + target + '_names_mecab.pkl'

    if target == 'all':
        target_palette_name = './data/hexcolor_vf/train_palettes_rgb.pkl'
    else:
        target_palette_name = './data/hexcolor_vf/' + target + '_palettes_rgb.pkl'

    okt = Okt()
    mecab = Mecab()
    name_seqs = []
    name_seqs_m = []

    # preprocess the text file
    txt_file = open(target_name_txt, mode='rt', encoding='utf-8')
    file = txt_file.readlines()
    file = list(map(lambda s: s.strip(), file))
    print(f'----텍스트 라인 수 : {len(file)}----')

    # create the plain-text pickle file
    with open(target_name, 'wb') as pkl:
        pickle.dump(file, pkl)

    with open(target_name, "rb") as pkl:
        pkl_load = pickle.load(pkl)
        print(f'----일반 pkl 라인 수 : {len(pkl_load)}----')

    # # OKT processing
    # for i, tmp in enumerate(file):
    #     tmp = okt.morphs(tmp)
    #     name_seqs.append(tmp)
    #
    # with open(target_name_okt,'wb') as pkl :
    #     pickle.dump(name_seqs, pkl)
    #
    # with open(target_name_okt, "rb") as pkl:
    #     pkl_load = pickle.load(pkl)
    #     print(f'----OKT pkl 라인 수 : {len(pkl_load)}----')
    #
    # # Mecab processing
    # for j, tmp_m in enumerate(file):
    #     tmp_m = mecab.morphs(tmp_m)
    #     name_seqs_m.append(tmp_m)
    #
    # with open(target_name_mecab,'wb') as pkl :
    #     pickle.dump(name_seqs_m, pkl)
    #
    # with open(target_name_mecab, "rb") as pkl:
    #     pkl_load = pickle.load(pkl)
    #     print(f'----Mecab pkl 라인 수 : {len(pkl_load)}----')

    # compare against the number of palette lines
    with open(target_palette_name, 'rb') as f:
        palette_data = pickle.load(f)
        print(f'----팔레트 라인 수 : {len(palette_data)}----')
Example 19
        print("%d번째 뉴스기사 크롤링 실패" % (i + 1))
    else:  #가져올 내용이 있다면
        print("%d번째 뉴스기사 크롤링 성공" % (i + 1))
        #remove unnecessary HTML tags from the collected result
        for item in news_html:
            crawler.remove(item, "script")
            crawler.remove(item, "a")
            crawler.remove(item, "br")
            crawler.remove(item, "span", {"class": "end_photo_org"})

            #accumulate only the whitespace-stripped text into the variable prepared earlier
            news_content += item.text.strip()

#4) morphological analysis based on the collected results
#extract only the nouns from the collected news body with the morphological analyzer
nlp = Okt()
nouns = nlp.nouns(news_content)

#check the frequency of the nouns
count = Counter(nouns)

#extract the 100 most frequently used words
most = count.most_common(100)

#reshape the result into the format the word cloud expects
#   → {"word": count, "word": count, ...}
tags = {}
lists = []
for n, c in most:
    if len(n) > 1:
        tags[n] = c
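# A follow-up sketch (not in the original): feeding the frequency dict into a word cloud.
# The font path is an assumption; Korean text needs a Korean-capable font.
from wordcloud import WordCloud

wc = WordCloud(font_path='NanumGothic.ttf', background_color='white', width=800, height=600)
wc.generate_from_frequencies(tags)
wc.to_file('news_wordcloud.png')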
Example 20
def insert_keyword():
    mongoDB = myMongoDB("CapstoneTest")

    okt = Okt()
    min_count = 1  # minimum word frequency (when building the graph)
    max_length = 10  # maximum word length
    string_idx = 0
    total_clean_sentence = []
    string_id = []

    stop_words = [
        '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람',
        '주', '섯알', '가운데', '보이', '아니', '등', '같', '우리', '때', '년', '가', '한', '지',
        '대하', '오', '말', '일', '김재', '종', '매사', '스스로', '하자', '그렇', '위하', '대한',
        '확', '관련', '이상', '미만', '경우', '텔레', '다시', '때문', '대규모', '뭔가', '디섐보',
        '퍼터', '제대로', '관', '지난', '비준', '지난해', '위해', '곳곳', '현재', '당일', '주요',
        '일대', '기', '날', '코로', '물이', '간사', '요즘', '거기', '내', '지금', '정도', '이번',
        '처음', '모두', '통해', '더욱', '앞서', '진짜', '거', '올레', '가가', '해도', '한번', '원래',
        '사실', '옆', '정말', '올해', '스', '민', '초', '최근', '앞', '역시', '이후', '군', '먼저',
        '노', '해당', '최고', '가장', '중', '양', '대해', '사이', '얼마', '아주', '대비', '셈',
        '각국', '실거주', '실수요자', '실', '대부분', '섯알', '셀', '내년', '유독', '언제', '문득',
        '늘', '다른', '동안', '덩', '역시', '당시', '최', '변', '살', '이번', '씨', '랄라블',
        '점차', '건수', '번', '쥴', '리', '상대로', '송', '이제', '매년', '곳', '오늘', '듯',
        '아무', '괜', '하나', '차지', '오히려', '순간', '속', '누군가', '밥주', '스마', '문하', '정유',
        '주얼', '좀더', '먼저', '디섐보', '일주', '것처', '에브리',
        '이전', '비대', '각종', '임', '누구', '일일', '필', '부', '트럼', '초등학', '이하', '에브리'
    ]

    for content in mongoDB.collected.find({}, {"_id": 1, "content": 1}):
        cleaned_sentence = []
        clean_sentence = []
        string_id.append(list(content.values())[0])
        string = list(content.values())[1]
        string = string.replace(u'\xa0', u' ')
        string = string.replace(u'\n', u' ')
        string = string.replace(u'\r', u' ')
        clean_sentence.append(sent_tokenize(string))
        for i in clean_sentence:
            for j in i:
                cleaned_sentence.append(j)
            total_clean_sentence.append(cleaned_sentence)

    for clean_sentence in total_clean_sentence:
        noun_keyword_list = []
        stop_keyword_list = []
        keyword_list = []
        wordrank_extractor = KRWordRank(min_count=min_count,
                                        max_length=max_length)
        beta = 0.85
        max_iter = 10

        try:
            keywords, rank, graph = wordrank_extractor.extract(
                clean_sentence, beta, max_iter)
        except ValueError:
            mongoDB.collected.update_one({'_id': string_id[string_idx]},
                                         {'$set': {
                                             'keyword': 'keywords'
                                         }})
            string_idx += 1
            continue

        for word, r in sorted(keywords.items(),
                              key=lambda x: x[1],
                              reverse=True)[:]:
            keyword_list.append(word)
        for i in keyword_list:
            a = okt.pos(i)
            if a[0][1] == 'Noun':
                noun_keyword_list.append(a[0][0])

        for i in noun_keyword_list:
            if i not in stop_words:
                stop_keyword_list.append(i)
        if len(stop_keyword_list) == 0:
            stop_keyword_list.append('')

        s1 = set(stop_keyword_list)
        s1_list = list(s1)
        s2_list = s1_list[:5]

        mongoDB.collected.update_one(
            {'_id': string_id[string_idx]},
            {'$set': {
                'keyword': s1_list,
                'point_keyword': s2_list
            }})
        string_idx += 1

# pip install konlpy


# In[1]:


from konlpy.tag import Okt  # the fastest tagger among those usable on Windows
from konlpy.utils import pprint


# In[2]:


okt = Okt()


# In[3]:


pprint(okt.morphs(u'장대한 생물 진화사를 치킨으로 정리해버리는 씹좆간쉨ㅋㅋㅋㅋ'))


# In[4]:


pprint(okt.nouns(u'장대한 생물 진화사를 치킨으로 정리해버리는 씹좆간쉨ㅋㅋㅋㅋ'))


# In[5]:
Example 22
# Corpus: a bundle (dictionary) of language samples
from konlpy.corpus import kobill  # politics-related corpus

# Read the file list (1)
files_ko = kobill.fileids()
# print(files_ko)  # inspect the documents

# Read a file (2)
doc_ko = kobill.open(r'mynews.txt').read()
# print(doc_ko)
# Extract meaningful tokens (Tokenize)
from konlpy.tag import Okt

t = Okt()

tokens_ko = t.morphs(doc_ko)  # split the document into tokens
print(tokens_ko)

import nltk
ko = nltk.Text(tokens_ko, name='원내대표')  # nltk.Text is used to search whether a given word occurs

print(ko)
print('토큰 정보 확인-----')
print(len(ko.tokens))  # 608 tokens
print(len(set(ko.tokens)))  # 293 unique tokens
fre_dist = ko.vocab()
print(fre_dist)  # <FreqDist with 293 samples and 608 outcomes>

# from matplotlib import rc
# rc('font', family='malgun gothic')
# ko.plot(50)
Example 23
from konlpy.tag import Okt
import re
#import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
import pandas as pd
from nltk import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np

okt = Okt()
ctx = '../data/'
filename = ctx + 'kr-Report_2018.txt'
with open(filename, 'r', encoding='utf-8') as f:
    texts = f.read()
print(texts[:300])

texts = texts.replace('\n', '')
tokenizer = re.compile('[^ ㄱ-힣]+')
texts = tokenizer.sub('', texts)
tokens = word_tokenize(texts)
noun_token = []
for token in tokens:
    token_pos = okt.pos(token)
    temp = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == "Noun"]
    if len(''.join(temp)) > 1:
        noun_token.append("".join(temp))
texts = " ".join(noun_token)

with open(ctx + 'stopwords.txt', 'r', encoding='UTF-8') as f:
Example 24
import unittest
from konlpy.tag import Okt
'''class MyTestCase(unittest.TestCase):
    def test(self):
        okt = Okt()
        self.assertEqual(u'잃' in okt.morphs(u'잃어버리다'), True)


if __name__ == '__main__':
    unittest.main()'''

okt = Okt()
print(
    okt.morphs(u'9 월 22 일에 아이패드를 잃어버렸어요..... 찾게 도와주세요 ㅠㅠㅠㅠㅠㅠ',
               norm=True,
               stem=True))
Example 25
from bs4 import BeautifulSoup as bs
from konlpy.tag import Okt
import requests

url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=161967&type=after&onlyActualPointYn=N&onlySpoilerPointYn=N&order=newest"
html = bs(requests.get(url).content, "html.parser", from_encoding="utf-8")
cnt = html.select(
    "body > div > div > div.score_total > strong > em")[0].contents[0].replace(
        ',', '')

lst = []
dict = {}

okt = Okt()  # instantiate the konlpy tagger as okt

for x in range(1, int(cnt) // 10 + 2):  # from page 1 onward, collect the comments posted after the award date
    html = bs(requests.get(url + "&page=" + str(x)).content,
              "html.parser",
              from_encoding="utf-8")
    size = len(
        html.select(
            "body > div > div > div.score_result > ul > li > div.score_reple > p"
        ))  # loop once per comment on the page (the last page may have fewer than 10 comments)

    for i in range(1, size + 1):

        # code that crawls the date the user posted the review
        date = html.select(
            "body > div > div > div.score_result > ul > li:nth-of-type(" +
            str(i) +
            ") > div.score_reple > dl > dt > em:nth-of-type(2)")[0].contents[0]
Example 26
def _add_tokens(df):
    okt = Okt()
    df['tokens'] = df['review'].map(lambda x: _get_tokens(okt, x))
    return df
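# _get_tokens() is referenced above but not shown; a minimal sketch, assuming it returns the
# normalized, stemmed morphemes of a review.
def _get_tokens(okt, text):
    return okt.morphs(str(text), norm=True, stem=True)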
Example 27
import nltk, re, pprint
from nltk.tokenize import word_tokenize
from urllib import request
from bs4 import BeautifulSoup
import pickle
from urllib.parse import urlencode, quote
import konlpy
from konlpy.tag import Okt

okt = Okt()

ROOT_URL = "http://korlex.pusan.ac.kr/search/WebApplication2/KorLex_SearchPage.aspx"

easy_vocab_list = []
with open('./TOPIC_vocab_list.csv', 'r') as csvfile:
    for line in csvfile.readlines():
        array = line.split(',')
        easy = array[1].split('0')[0]
        easy = easy.split('1')[0]
        easy_vocab_list.append(easy)

text_file = open("Textbook_middle.txt", "r")
text = text_file.read()
easy_corpus_list = text.split()


def preprocess_word(word):
    return okt.pos(word, stem=True)


def score_with_easy_list(word):
Example 28
import numpy as np
import pandas as pd
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from pprint import pprint
import pickle

data = pd.read_excel('./dataset.xlsx')

X = data['input']
Y = data['output']

X = X.str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # remove special characters
stopwords=['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다'] # stopwords

okt = Okt()
x = []
for sentence in X:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True) # tokenize
    temp_X = [word for word in temp_X if not word in stopwords] # remove stopwords
    x.append(temp_X)
X = x
with open('sentences.txt', 'wb') as f:
    pickle.dump(X, f)

max_words = 3000
t = Tokenizer(num_words=max_words) # keep only the top 3,000 words
t.fit_on_texts(X)
X = t.texts_to_sequences(X)
print(X[:5])
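# A follow-up sketch (not in the original): pad_sequences is imported above but unused, so this
# pads the encoded sentences to a fixed length; max_len = 30 is an assumption.
max_len = 30
X = pad_sequences(X, maxlen=max_len)
print(X.shape)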
Example 29
import pymysql
import jieba
import re
import konlpy
from konlpy.tag import Okt
from collections import Counter
# encoding: utf-8
conn = pymysql.connect(  # create the database connection
    host='gujiakai.softether.net',  # host of the database to connect to
    user='******',  # database user name
    password='******',  # password
    database='library',  # database name; it can also be selected later with cursor.execute('use test_db')
    charset='utf8mb4'  # charset; note it cannot be written as 'utf-8'
)
t = Okt()
cursor = conn.cursor()
cursor.execute("select question from qna")
res = cursor.fetchall()
cursor.execute("delete from wc")
cut_words = ""
res = str(res)
nouns = t.nouns(res)
cut_words = ""
for con in nouns:
    scon = str(con)
    print(scon)
    cursor.execute("INSERT INTO wc (world) VALUES('%s')" % (scon))
    conn.commit()
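# A follow-up sketch (not in the original): Counter is imported above but never used, so this
# tallies the extracted nouns and prints the 20 most common ones.
count = Counter(nouns)
print(count.most_common(20))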
Example 30
class UserData:
    def __init__(self):
        self.okt = Okt()

    def read_file(self):
        self.okt.pos("morph", stem=True)
        cheese_data = pd.read_csv("com_cheese_api/user/data/users.csv")
        cheese_lists = list(cheese_data['cheese_name'])
        texts = ''.join(cheese_lists)
        #print(texts)
        #print(type(texts))
        return texts
        

    @staticmethod
    def extract_hangeul(texts):
        temp = texts.replace('\n', ' ')
        tokenizer = re.compile(r'[^ㄱ-힣]+')
        temp = tokenizer.sub('', temp)
        return temp

    @staticmethod
    def change_token(texts):
        tokens = word_tokenize(texts)
        return tokens

    def extract_noun(self):
        noun_tokens = []
        tokens = self.change_token(self.extract_hangeul(self.read_file()))
        for token in tokens:
            token_pos = self.okt.pos(token)
            temp = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == 'Noun']
            if len(''.join(temp)) > 1:
                noun_tokens.append("".join(temp))
        texts = " ".join(noun_tokens)
        return texts


    @staticmethod
    def download():
        nltk.download()

    @staticmethod
    def read_stopword():
        with open('com_cheese_api/user/data/stopword.txt', 'r') as file:
            lines = file.readlines()
            stop_str = ''.join(lines)
            stopword = stop_str.replace('\n', ' ')
            stopwords = stopword.split(' ')
        return stopwords       


    def remove_stopword(self):
        texts = self.extract_noun()
        tokens = self.change_token(texts)
        stopwords = self.read_stopword()
        texts = [text for text in tokens
                    if text not in stopwords]
        return texts


    def hook(self):
        texts = self.remove_stopword()
        freqtxt = pd.Series(dict(FreqDist(texts))).sort_values(ascending=False)
        print(freqtxt[:100])
        return freqtxt


    def draw_wordcloud(self):
        texts = self.remove_stopword()
        wcloud = WordCloud('/usr/share/fonts/truetype/nanum/NanumBarunGothicBold.ttf', background_color='white', width=800, height=600)
        cloud = wcloud.generate_from_frequencies(dict(FreqDist(texts)))
        plt.figure(figsize=(10, 8))
        plt.axis('off')
        plt.imshow(cloud)
        plt.show()
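# A minimal usage sketch (not part of the original); the pandas/nltk/wordcloud imports this
# class relies on are assumed to be in place.
if __name__ == '__main__':
    user_data = UserData()
    user_data.hook()
    user_data.draw_wordcloud()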