def __init__(self): self.okt = Okt()
m += str(now.month)
if (len(m) < 2):
    m = "0" + m
date += m
date += str(now.day)  # note: unlike the month, the day is not zero-padded here
print("date ", date)

url = 'https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=pnt&date=' + date
driver = webdriver.Chrome(r'C:\chromedriver')  # path to the chromedriver binary
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

# -------------------------- currently-showing page --------------------------#
twitter = Okt()
arr_rank_name = []
arr_rank_img = []
arr_rank_smr = []
arr_rank_words = []
arr_rank_score = []
arr_rank_director = []
arr_rank_actors = []


## visit each movie's page and extract its information
def get_inform():
    ## extract the title
    name = ""
import codecs
from konlpy.tag import Okt
from gensim.models import word2vec

fp = codecs.open('hong.txt', 'r', encoding='utf-8')
text = fp.read()

twitter = Okt()
results = []
lines = text.split('\r\n')
for line in lines:
    # morphological analysis, using the base form of each word
    malist = twitter.pos(line, norm=True, stem=True)
    r = []
    for word in malist:
        # skip particles, verb endings, and punctuation
        if not word[1] in ['Josa', 'Eomi', 'Punctuation']:
            r.append(word[0])
    r1 = (' '.join(r)).strip()
    results.append(r1)
    print(r1)

# write the preprocessed lines to a file
wakati_file = 'hong.wakati'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(results))

# build the Word2Vec model (gensim < 4.0 API; gensim >= 4.0 renames `size` to `vector_size`)
data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
model.save('hong.model')

print('\n\n========================== 분석 완료 =============================')
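# A minimal follow-up sketch (not in the original script): load the model saved above
# and query it. '홍길동' is only a placeholder query word; any token that survived the
# min_count=2 cutoff would work.
loaded = word2vec.Word2Vec.load('hong.model')
print(loaded.wv.most_similar('홍길동', topn=5))  # five most similar words by cosine similarity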
)[0].text
contents += content_5
content_6 = soup.select(
    'div[id=sub01_02_pop02_tab02] > div > ul > li:nth-of-type(5) > p')[0].text
contents += content_6
content_7 = soup.select(
    'div[id=sub01_02_pop02_tab02] > div > ul > li:nth-of-type(6) > p')[0].text
contents += content_7
content_8 = soup.select(
    'div[id=sub01_02_pop02_tab02] > div > ul > li:nth-of-type(7) > p')[0].text
contents += content_8

okt = Okt()
text_data = []
ex_nouns = okt.nouns(contents)
print(contents)
for text in ex_nouns:
    # drop a few filler tokens that slip through noun extraction
    if text != '및' and text != '를' and text != '함' and text != '등':
        text_data.append(text)

word_count = {}
for noun in text_data:
    word_count[noun] = word_count.get(noun, 0) + 1

counter = Counter(word_count)
top10 = counter.most_common(10)
class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.okt = Okt()
        # stopwords: words with no analytical value
        self.stopwords = [
            '중인', '만큼', '마찬가지', '꼬집었', "연합 뉴스", "데일리", "동아일보",
            "중앙일보", "조선일보", "기자", "아", "휴", "아이구", "아이쿠", "아이고",
            "어", "나", "연합", "자료사진", "우리", "저희", "따라", "의해", "을",
            "를", "에", "의", "가",
        ]

    def url2sentences(self, url):
        article = Article(url, language='ko')
        article.download()
        article.parse()
        sentences = self.kkma.sentences(article.text)
        for idx in range(0, len(sentences)):
            # merge very short sentences into the previous one
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''
        return sentences

    def text2sentences(self, text):
        sentences = self.kkma.sentences(text)
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''
        return sentences

    def get_nouns(self, sentences):
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([
                    noun for noun in self.okt.nouns(str(sentence))
                    if noun not in self.stopwords and len(noun) > 1
                ]))
        return nouns
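# A minimal usage sketch (assumed, not from the source): the class above relies on
# newspaper's Article plus Kkma/Okt, so those imports are presumed to exist.
# The URL below is a placeholder.
tokenizer = SentenceTokenizer()
sentences = tokenizer.url2sentences('https://news.example.com/some-article')
nouns = tokenizer.get_nouns(sentences)
print(nouns[:5])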
import re
import pandas as pd
import numpy as np
from wordcloud import STOPWORDS
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
from konlpy.tag import Komoran, Kkma, Okt, Hannanum, Mecab, Twitter
# nltk.download('punkt')

frame_list = []
lineSentence_list = []
token_set = set()

myframe = pd.read_csv('review_concat_csv02.csv', index_col=0)
imo_pattern = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
okt = Okt()
hangul_stopword = [
    x.strip()
    for x in open('hangle_stop_word.txt', 'r', encoding='utf-8').readlines()
]
# nltk.download('punkt')

total_len = 0
for idx in range(42995):
    # split each review into sentences
    text = myframe['text'][idx]
    review_id = idx
    sent_list = sent_tokenize(text)
    # then tokenize every sentence into words
    one_review_sentence_list = []
    for sent in sent_list:
        # strip emoji that the tokenizer cannot handle
def draw_networkx(tmp_content):
    f = open('./stop_words.txt', 'r')
    stop_words = f.readlines()
    f.close()
    pattern2 = '[\n]'
    stop_words = list(
        map(lambda x: re.sub(pattern=pattern2, repl='', string=stop_words[x]),
            range(0, len(stop_words))))
    # print(stop_words)

    okt = Okt()
    tmp_nouns = []
    for i in range(0, np.shape(tmp_content)[0]):
        tmp_nouns.append(okt.nouns(tmp_content.content[i]))

    tmp_noun_list = []
    for sent in tmp_nouns:
        sent_noun = []
        for w in sent:
            if w not in stop_words:
                sent_noun.append(w)
        tmp_noun_list.append(sent_noun)

    bigram = Phrases(tmp_noun_list, min_count=1, threshold=10)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    docs = []
    for i in tmp_noun_list:
        tmp_i_ = bigram_mod[i]
        docs.append(tmp_i_)

    noun_list = list(itertools.chain.from_iterable(docs))
    noun_cnt = Counter(noun_list)
    noun_df = pd.DataFrame.from_dict(
        (dict(noun_cnt)),
        orient='index').reset_index().rename(columns={
            'index': 'words',
            0: 'freq'
        })
    noun_df = noun_df.groupby(['words']).sum().reset_index()
    uniq_noun = Counter(dict(zip(noun_df['words'], noun_df['freq'])))
    uniq_key = list(uniq_noun.keys())
    noun_index = {noun: i for i, noun in enumerate(uniq_noun)}

    # document-term occurrence matrix and word co-occurrence matrix
    occurs = np.zeros([len(docs), len(uniq_noun)])
    for i, sent in enumerate(docs):
        for w in sent:
            index = noun_index[w]
            occurs[i][index] = 1
    co_occur = occurs.T.dot(occurs)

    G1 = nx.Graph()
    tmp_G1_node = []
    for i in uniq_noun.keys():
        if uniq_noun[i] > 25:
            G1.add_node(i, size=uniq_noun[i])
            tmp_G1_node.append(i)
    for i in tmp_G1_node:  # G1.nodes():
        ind_ = noun_index[i]
        for j in range(0, occurs.shape[1]):
            if ind_ != j and co_occur[ind_][j] > 5:
                G1.add_edge(uniq_key[ind_], uniq_key[j],
                            weight=co_occur[ind_][j])

    pos_ = nx.kamada_kawai_layout(G1)
    edge_trace = []
    for edge in G1.edges():
        char_1 = edge[0]
        char_2 = edge[1]
        x0, y0 = pos_[char_1]
        x1, y1 = pos_[char_2]
        text = str(char_1) + '--' + str(char_2) + ': ' + str(
            G1.edges()[edge]['weight'])
        trace = make_edge([x0, x1, None], [y0, y1, None], text,
                          width=1)  # 0.3*G.edges()[edge]['weight']**1)
        edge_trace.append(trace)

    node_trace = go.Scatter(x=[], y=[], text=[],
                            textposition="top center",
                            textfont_size=10,
                            mode='markers+text',
                            hoverinfo='none',
                            marker=dict(color=[], size=[], line=None))
    for node in G1.nodes():
        x, y = pos_[node]
        node_trace['x'] += tuple([x])
        node_trace['y'] += tuple([y])
        node_trace['marker']['color'] += tuple(['#0000FF'])
        if 'size' in G1.nodes()[node]:
            node_trace['marker']['size'] += tuple([G1.nodes()[node]['size']])
        else:
            node_trace['marker']['size'] += tuple([0])
        node_trace['text'] += tuple(['<b>' + node + '</b>'])

    return node_trace, edge_trace
from konlpy.tag import Okt
import numpy as np
import pandas as pd
import openpyxl

# First, remove punctuation, foreign words/hanja and other symbols, and particles from the data.
# (read_excel takes no 'sep' argument; the original passed sep='delimiter', dropped here)
news_data = pd.read_excel("./all_necessary_xlsx")
np_news_data = np.array(news_data)
okt = Okt()

wb = openpyxl.Workbook()
sheet = wb.active
n_contain = ["Punctuation", "Foreign", "Josa"]

for i in range(np_news_data.shape[0]):
    title = okt.pos(np_news_data[i][3])
    content = okt.pos(np_news_data[i][4])
    temptitle = []
    tempcontent = []
    for word, tag in title:
        if tag not in n_contain:
            temptitle.append(word)
            temptitle.append(" ")
    for word, tag in content:
        if tag not in n_contain:
            tempcontent.append(word)
            tempcontent.append(" ")
    t = "".join(temptitle)
# print(a_url)
source_article = urllib.request.urlopen(a_url)
soup = BeautifulSoup(source_article, 'lxml', from_encoding='utf-8')
contents = soup.select('div.article_txt')
for imsi in contents:
    item = str(imsi.find_all(text=True))
    # print(item)
    msg = msg + item

print(msg)

from konlpy.tag import Okt
from collections import Counter

nlp = Okt()
nouns = nlp.nouns(msg)

result = []
for imsi in nouns:
    if len(imsi) > 1:
        result.append(imsi)
print(result)

count = Counter(result)
print(count)
tag = count.most_common(50)  # only the top 50 words go into the word cloud

import pytagcloud
taglist = pytagcloud.make_tags(tag, maxsize=100)
print(taglist[:10])
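# A possible next step (assumed, not shown in the source): render the tag list with
# pytagcloud. Drawing Hangul correctly requires a Korean TTF registered in pytagcloud's
# fonts.json; 'korean' is an assumed registered font name, not a default.
pytagcloud.create_tag_image(taglist, 'wordcloud.png',
                            size=(900, 600), fontname='korean', rectangular=False)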
UNK = "[UNK]"
CLS = "[CLS]"
MASK = "[MASK]"
SEP = "[SEP]"
SEG_A = "[SEG_A]"
SEG_B = "[SEG_B]"
NUM = "<num>"

padding_token = PAD
cls_token = CLS
sep_token = SEP
special_tokens = [PAD, START_TOKEN, END_TOKEN, UNK, CLS, MASK, SEP, SEG_A, SEG_B, NUM]

# library function used to split text into tokens (morphemes)
split_fn = Okt().morphs

# config for one-hot encoding the label (NER tag) data
ner_tag_size = 0

# lists holding the loaded, preprocessed data
list_of_total_source_no, list_of_total_source_str, list_of_total_target_str = [], [], []

# preprocessing list variables
list_of_X_tokens = []
list_of_padded_X_token_ids_with_cls_sep = []
list_of_padded_ner_ids_with_cls_sep = []

# padding config
maxlen = 0
import codecs
from konlpy.tag import Okt

fp = codecs.open('test.txt', 'r', encoding='utf-8')
text = fp.read()

# process the text line by line
twitter = Okt()
word_dic = {}
lines = text.split('\r\n')
for line in lines:
    malist = twitter.pos(line)
    for word in malist:
        if word[1] == 'Noun':
            if not (word[0] in word_dic):
                word_dic[word[0]] = 0
            word_dic[word[0]] += 1  # count the noun

# print the most frequently used nouns
keys = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)
for word, count in keys[:50]:
    print('{0}({1}) '.format(word, count), end='')
print()
def main(args):
    os.makedirs(args.save_path)  # the original called os.mkdirs(), which does not exist
    df = pd.read_csv(args.data_path)
    df = df.iloc[:, 1:3]
    df = df.dropna()

    X = []
    for x in df['Comment']:
        X.append([x])
    y = []
    for i in df['Label']:
        y.append(i)
    train_X, test_X, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # remove emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    # remove stopwords
    han = re.compile(r'[ㄱ-ㅎㅏ-ㅣ!?~,".\n\r#\ufeff\u200d]')
    train_X = preprocess(train_X)
    test_X = preprocess(test_X)

    stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도',
                 '를', '으로', '자', '에', '와', '한', '하다']
    okt = Okt()
    X_train = []
    for sentence in train_X:
        temp_X = []
        temp_X = okt.morphs(sentence[0], stem=True)  # tokenize
        temp_X = [word for word in temp_X if not word in stopwords]  # remove stopwords
        print(temp_X)
        X_train.append(temp_X)
    X_test = []
    for sentence in test_X:
        temp_X = []
        temp_X = okt.morphs(sentence[0], stem=True)  # tokenize
        temp_X = [word for word in temp_X if not word in stopwords]  # remove stopwords
        X_test.append(temp_X)

    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # ============== Encoding ===============
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    threshold = 3
    total_cnt = len(tokenizer.word_index)  # number of words
    rare_cnt = 0    # number of words with frequency below the threshold
    total_freq = 0  # sum of all word frequencies in the train data
    rare_freq = 0   # sum of frequencies of words below the threshold

    # iterate over (word, frequency) pairs
    for key, value in tokenizer.word_counts.items():
        total_freq = total_freq + value
        # if the word frequency is below the threshold
        if (value < threshold):
            rare_cnt = rare_cnt + 1
            rare_freq = rare_freq + value

    print('Size of vocabulary :', total_cnt)
    print('Words with frequency less than threshold %s: %s' % (threshold - 1, rare_cnt))
    print("Percentage of rare words:", (rare_cnt / total_cnt) * 100)
    print("Percentage of frequency of rare words:", (rare_freq / total_freq) * 100)

    # remove words with frequency less than 2
    vocab_size = total_cnt - rare_cnt + 2
    print('Size of vocabulary:', vocab_size)

    tokenizer = Tokenizer(vocab_size, oov_token='OOV')
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)

    # remove empty samples
    drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]
    X_train = np.delete(X_train, drop_train, axis=0)
    y_train = np.delete(y_train, drop_train, axis=0)
    print(len(X_train))
    print(len(y_train))

    # Padding
    print('Length of longest comment :', max(len(l) for l in X_train))
    print('Length of average comment :', sum(map(len, X_train)) / len(X_train))
    plt.hist([len(s) for s in X_train], bins=50)
    plt.xlabel('length of samples')
    plt.ylabel('number of samples')
    plt.show()

    max_len = 30
    below_threshold_len(max_len, X_train)
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_test = pad_sequences(X_test, maxlen=max_len)

    # =============== Train model ================
    # define model
    model = Sequential()
    model.add(Embedding(vocab_size, 100))
    model.add(LSTM(128))
    model.add(Dense(3, activation='softmax'))
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
    mc = ModelCheckpoint(os.path.join(args.save_path, 'best_model_raw_v2.h5'),
                         monitor='val_acc', mode='max', verbose=1, save_best_only=True)
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['acc', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()])
    history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc],
                        batch_size=60, validation_split=0.2)

    # ============== Save results ==================
    plt.figure()  # fresh figure so the histogram above is not carried into this file
    plt.plot(history.history['recall'], label='recall')
    plt.plot(history.history['acc'], label='accuracy')
    plt.plot(history.history['precision'], label='precision')
    plt.legend(['recall', 'accuracy', 'precision'])
    plt.title('LSTM metrics')
    plt.savefig(os.path.join(args.save_path, 'LSTM metrices'))

    plt.figure()  # new figure so the metric curves are not drawn into the loss plot
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.legend(['loss', 'validation loss'])
    plt.title('LSTM loss')
    plt.savefig(os.path.join(args.save_path, 'loss'))

    plt.figure()  # new figure for the validation metrics
    plt.plot(history.history['val_recall'])
    plt.plot(history.history['val_acc'])
    plt.plot(history.history['val_precision'])
    plt.legend(['recall', 'accuracy', 'precision'])
    plt.title('LSTM validation metrics')
    plt.savefig(os.path.join(args.save_path, 'LSTM validation metrics'))
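# below_threshold_len() is called inside main() but never defined in this snippet.
# A plausible sketch under that assumption (it would need to sit above main() in the
# original file): report what fraction of samples fit within max_len.
def below_threshold_len(max_len, nested_list):
    count = 0
    for sentence in nested_list:
        if len(sentence) <= max_len:
            count += 1
    print('Proportion of samples with length <= %s: %.2f%%'
          % (max_len, (count / len(nested_list)) * 100))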
def __init__(self):
    self.texts = []
    self.tokens = []
    self.okt = Okt()
    self.stopwords = []
    self.freqtxt = []
class Service:
    def __init__(self):
        self.texts = []
        self.tokens = []
        self.okt = Okt()
        self.stopwords = []
        self.freqtxt = []

    #######################################################################################

    def extract_token(self, payload):
        print('>>> text 문서에서 token 추출')
        filename = payload.context + payload.fname
        with open(filename, 'r', encoding='utf-8') as f:
            self.texts = f.read()
        print(f'{self.texts[:300]}')

    def extract_hanguel(self):
        print('>>> 한글만 추출')
        texts = self.texts.replace('\n', ' ')
        tokenizer = re.compile(r'[^ ㄱ-힣]')
        self.texts = tokenizer.sub('', texts)
        print(f'{self.texts[:300]}')

    def conversion_token(self):
        print('>>> 토큰으로 변환')
        self.tokens = word_tokenize(self.texts)
        print(f'{self.tokens[:300]}')

    def compound_noun(self):
        print('>>> 복합명사는 묶어서 fitering 으로 출력')
        print('>>> ex) 삼성전자의 스마트폰은 --> 삼성전자 스마트폰')
        noun_token = []
        for token in self.tokens:
            token_pos = self.okt.pos(token)
            temp = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == 'Noun']
            if len("".join(temp)) > 1:
                noun_token.append("".join(temp))
        self.texts = " ".join(noun_token)
        print(f'{self.texts[:300]}')

    def extract_stopword(self, payload):
        print('>>> text 문서에서 token 추출')
        filename = payload.context + payload.fname
        with open(filename, 'r', encoding='utf-8') as f:
            self.stopwords = f.read()
        self.stopwords = self.stopwords.split(' ')
        print(f'{self.stopwords[:10]}')

    def filtering_text_with_stopword(self):
        print('>>> stopword 로 필터링 ')
        self.texts = word_tokenize(self.texts)
        self.texts = [text for text in self.texts if text not in self.stopwords]

    def frequent_text(self):
        print('>>> 빈도수로 정렬 ')
        self.freqtxt = pd.Series(dict(FreqDist(self.texts)))\
            .sort_values(ascending=False)
        print(f'{self.freqtxt[:10]}')

    def draw_wordcloud(self, payload):
        print('>>> 워드크라우드 작성 ')
        filename = payload.context + payload.fname
        wcloud = WordCloud(filename,
                           relative_scaling=0.2,
                           background_color='white').generate(" ".join(self.texts))
        plt.figure(figsize=(12, 12))
        plt.imshow(wcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()

    #######################################################################################

    # Take a sentence and return a list of (word, POS tag) tuples
    @staticmethod
    def sentence_pos(sentence):
        print('# before user dic')
        komo = Komoran()
        result = komo.pos(sentence)
        print('전체 확인하기')
        for myitem in result:
            somedata = '단어 : %s, 품사 : %s' % (myitem[0], myitem[1])
            print(somedata)
        print('-' * 30)
        return result

    '''
    Return value
    # before user dic
    [('국정', 'NNG'), ('농', 'NNG'), ('단', 'NNG'), ('태블릿 PC', 'NNP'), (',', 'SP'),
     ('설', 'NNB'), ('진', 'NNP'), ('욱', 'NA'), (',', 'SP'), ('가나', 'NNP'), ('다라', 'NNP')]
    '''

    @staticmethod
    def pos_to_noun(sentence):
        komo = Komoran()
        print('명사만 추출해보기')
        nouns = komo.nouns(sentence)
        print(nouns)
        return nouns

    #######################################################################################

    # Build a word cloud image.
    @staticmethod
    def makeWordCloud(context, wordDict, imageFile, fontpath, filename):
        # word cloud: convert the mask image into a numpy array
        imageFile = context + imageFile
        fontpath = context + fontpath
        filename = context + filename
        alice_coloring = np.array(Image.open(imageFile))
        wordcloud = WordCloud(font_path=fontpath,
                              mask=alice_coloring,
                              relative_scaling=0.2,
                              background_color='lightyellow')
        wordcloud = wordcloud.generate_from_frequencies(wordDict)

        image_colors = ImageColorGenerator(alice_coloring)
        # random_state: fixed random seed
        newwc = wordcloud.recolor(color_func=image_colors, random_state=42)

        plt.imshow(newwc)
        plt.axis('off')
        plt.savefig(filename)
        plt.figure(figsize=(16, 8))

    @staticmethod
    def makeBarChart(context, wordlist, filename):
        # bar chart
        filename = context + filename
        # draw a bar chart from `result`
        result = wordlist[0:10]  # top-10 entries
        barcount = 10  # number of bars: draw only 10
        xlow, xhigh = -0.5, barcount - 0.5
        result = wordlist[:barcount]

        chartdata = []  # chart values
        xdata = []      # tick labels
        mycolor = ['r', 'g', 'b', 'y', 'm', 'c',
                   '#FFF0F0', '#CCFFBB', '#05CCFF', '#11CCFF']
        for idx in range(len(result)):
            chartdata.append(result[idx][1])
            xdata.append(result[idx][0])
            value = str(chartdata[idx]) + '건'  # e.g. '60건' (60 items)
            # print the count above each bar
            plt.text(x=idx, y=chartdata[idx] - 20, s=value,
                     fontsize=8, horizontalalignment='center')

        plt.xticks(range(barcount), xdata, rotation=45)
        plt.bar(range(barcount), chartdata, align='center', color=mycolor)
        plt.title('상위 ' + str(barcount) + '빈도수')
        plt.xlim([xlow, xhigh])
        plt.xlabel('주요 키워드')
        plt.ylabel('빈도수')
        plt.savefig(filename, dpi=400, bbox_inches='tight')
        print(filename + ' 파일이 저장되었습니다.')

    @staticmethod
    def make_wordlist(context, txt, stopwordTxt):
        filename = context + txt
        ko_con_text = open(filename, 'rt', encoding='utf-8').read()

        okt = Okt()
        token_ko = okt.nouns(ko_con_text)

        # stopwords: words excluded from the analysis regardless of their frequency
        stop_word_file = context + stopwordTxt
        stop_file = open(stop_word_file, 'rt', encoding='utf-8')
        stop_words = [word.strip() for word in stop_file.readlines()]
        # print(stop_words)

        token_ko = [each_word for each_word in token_ko
                    if each_word not in stop_words]

        # nltk: Natural Language Toolkit
        # token: a small fragment of text
        ko = nltk.Text(tokens=token_ko)

        wordlist = list()  # list of (word, frequency) tuples
        # keep only the 500 most frequent words
        data = ko.vocab().most_common(500)
        # print(data)
        for word, count in data:
            if (count >= 50 and len(word) >= 2):
                wordlist.append((word, count))
        return wordlist

    @staticmethod
    def create_word2vec(context, filename, prepro_file, model_filename):
        myencoding = 'utf-8'
        filename = context + filename
        myfile = open(filename, 'rt', encoding=myencoding)
        soup = BeautifulSoup(myfile, 'html.parser')
        mydata = soup.text
        # print(mydata)

        results = []  # holds the preprocessed sentences
        okt = Okt()
        datalines = mydata.split('\n')
        print(len(datalines))
        for oneline in datalines:
            mypos = okt.pos(oneline, norm=True, stem=True)
            # print(mypos)
            imsi = []  # temporary list
            for word in mypos:
                if not word[1] in ['Josa', 'Eomi', 'Punctuation', 'Verb']:
                    if len(word[0]) >= 2:
                        imsi.append(word[0])
            temp = (' '.join(imsi)).strip()
            results.append(temp)
            # break  # to be removed later
        # print(results)

        # save the cleaned text to a file
        with open(prepro_file, 'wt', encoding=myencoding) as myfile:
            myfile.write('\n'.join(results))
        print(prepro_file + ' 파일 생성됨')

        # word2vec: an algorithm that turns words into vectors
        # vector: a quantity with magnitude and direction (a scalar has only a value)
        # word similarity: cosine, Euclidean, or Manhattan similarity
        # LineSentence: builds the sentences used for the analysis
        data = word2vec.LineSentence(prepro_file)
        print(type(data))
        # Word2Vec: builds a word2vec model from those sentences.
        # size: vector dimensionality, window: context window size,
        # min_count: discard words below this frequency
        # sg: 1 = skip-gram, 0 = CBOW
        # (gensim < 4.0 API; gensim >= 4.0 renames `size` to `vector_size`)
        model = word2vec.Word2Vec(data, size=200, window=10, min_count=2, sg=1)
        print(type(model))

        # use save() to store the model; the model file is a binary file
        model.save(model_filename)
        print(model_filename + ' 파일 생성됨')
        print('finished')

    @staticmethod
    def showGraph(bargraph):
        length = len(bargraph)  # number of items

        # labels shown on the axis
        myticks = list(mydata[0] for mydata in bargraph)
        # numeric values to plot
        chartdata = list(mydata[1] for mydata in bargraph)

        mycolor = ['b', 'g', 'r', 'c', 'm', 'y', 'k',
                   '#56FFCC', '#00CCFF', '#CCDDEE']
        plt.figure()
        plt.barh(myticks, chartdata, color=mycolor, align='center')
        plt.yticks(range(length), myticks, rotation='10')
        plt.xlim(min(chartdata) - 0.02, max(chartdata) + 0.02)

        filename = 'word2vec_model_01.png'
        plt.savefig(filename)
        print(filename + ' 파일 저장됨')

    @staticmethod
    def makePie(piegraph):
        myticks = list(mydata[0] for mydata in piegraph)
        chartdata = list(mydata[1] for mydata in piegraph)
        mycolor = ['b', 'g', 'r', 'c', 'm']

        plt.figure()
        plt.pie(chartdata, colors=mycolor, labels=myticks,
                startangle=90, shadow=False, explode=(0, 0.05, 0, 0, 0),
                autopct='%1.2f%%', normalize=True)

        filename = 'word2vec_model_02.png'
        plt.savefig(filename)
        print(filename + ' 파일 저장됨')
        print('finished')
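# A minimal usage sketch (not part of the original class): load the model saved by
# create_word2vec() and feed the (word, similarity) pairs returned by most_similar()
# into showGraph()/makePie(). The file name 'word2vec.model' and the query word
# '인공지능' are placeholders, not values taken from the source.
from gensim.models import word2vec

model = word2vec.Word2Vec.load('word2vec.model')
bargraph = model.wv.most_similar('인공지능', topn=10)  # list of (word, score) tuples
Service.showGraph(bargraph)
Service.makePie(model.wv.most_similar('인공지능', topn=5))  # makePie's explode expects 5 slices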
from konlpy.tag import Okt
from gensim.models import Word2Vec
import gensim
import pandas as pd

okt = Okt()
df = pd.read_csv('CSV파일명.csv', encoding='utf-8')


def arrToStr(arr):
    # keep only Hangul characters and spaces
    return arr.str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", " ")


result = []
for i, rcp in enumerate(df['recipe']):
    tokenlist = okt.pos(rcp, stem=True, norm=True)
    temp = []
    for word in tokenlist:
        if word[1] in ["Noun"]:  # keep nouns only
            temp.append((word[0]))
    temp = temp + df.loc[i, 'cleand'].split()
    result.append(temp)  # store the tokens
    df.loc[i, 'cleand'] = str(' '.join(temp))
    if i % 500 == 0:
        print("%d번째 While문." % i)
        print(temp)

df.to_csv('CSV파일명.csv', encoding='utf-8')
print('총 샘플의 개수 : {}'.format(len(result)))
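# A hedged sketch of the step the Word2Vec import above points toward (the training
# call itself is not shown in the source): fit a model on the token lists collected
# in `result`. The hyper-parameters are illustrative, and gensim >= 4.0 renames
# `size` to `vector_size`.
model = Word2Vec(sentences=result, size=100, window=5, min_count=5, workers=4, sg=0)
model.save('recipe_word2vec.model')  # assumed output file name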
from konlpy.tag import Okt
from gensim.models import word2vec

# load the Word2Vec model and prepare the morphological analyzer
model = word2vec.Word2Vec.load("./wiki.model")
okt = Okt()


def print_emargency(text):
    print(text)
    # run morphological analysis on the given sentence
    node = okt.pos(text, norm=True, stem=True)
    for word, form in node:
        # keep only the POS classes we need
        if form == 'Noun' or form == 'Verb' or form == 'Adjective' or form == 'Adverb':
            # similarity to the word '급하다' ("urgent")
            print("-", word, ":", model.wv.similarity(word, '급하다'))


print_emargency("컴퓨터에 문제가 생겼어요. 빨리 해결해야 하는 문제가 있어서 지원 요청합니다.")
print_emargency("사용 방법을 잘 모르겠습니다.")
# plt.rcParams['axes.unicode_minus'] = False
plt.rcParams["figure.figsize"] = [12, 6]

# %%
df.tail()

# %%
from konlpy.tag import Kkma
kkma = Kkma()

from konlpy.tag import Okt
okt = Okt()

df["kkma"] = ''
df["okt"] = ''

for i in range(0, len(df)):
    # df['kkma'][i] = kkma.morphs(df["질문"][i])
    # df['okt'][i] = okt.morphs(df["질문"][i])
    df['kkma'][i] = kkma.pos(df["질문"][i])
    df['okt'][i] = okt.pos(df["질문"][i])

df.head()

# %%
import re
def morphs_convert():
    target = 'all'
    # -------------------------------------
    target_name_txt = './data/text/' + target + '_names_kor.txt'
    target_name = './data/hexcolor_vf/kor_' + target + '_names.pkl'
    target_name_okt = './data/hexcolor_vf/kor_' + target + '_names_okt.pkl'
    target_name_mecab = './data/hexcolor_vf/kor_' + target + '_names_mecab.pkl'
    if target == 'all':
        target_palette_name = './data/hexcolor_vf/train_palettes_rgb.pkl'
    else:
        target_palette_name = './data/hexcolor_vf/' + target + '_palettes_rgb.pkl'

    okt = Okt()
    mecab = Mecab()
    name_seqs = []
    name_seqs_m = []

    # preprocess the text file
    txt_file = open(target_name_txt, mode='rt', encoding='utf-8')
    file = txt_file.readlines()
    file = list(map(lambda s: s.strip(), file))
    print(f'----텍스트 라인 수 : {len(file)}----')

    # create the plain-text pickle file
    with open(target_name, 'wb') as pkl:
        pickle.dump(file, pkl)
    with open(target_name, "rb") as pkl:
        pkl_load = pickle.load(pkl)
    print(f'----일반 pkl 라인 수 : {len(pkl_load)}----')

    # # Okt tokenization
    # for i, tmp in enumerate(file):
    #     tmp = okt.morphs(tmp)
    #     name_seqs.append(tmp)
    #
    # with open(target_name_okt, 'wb') as pkl:
    #     pickle.dump(name_seqs, pkl)
    #
    # with open(target_name_okt, "rb") as pkl:
    #     pkl_load = pickle.load(pkl)
    # print(f'----OKT pkl 라인 수 : {len(pkl_load)}----')
    #
    # # Mecab tokenization
    # for j, tmp_m in enumerate(file):
    #     tmp_m = mecab.morphs(tmp_m)
    #     name_seqs_m.append(tmp_m)
    #
    # with open(target_name_mecab, 'wb') as pkl:
    #     pickle.dump(name_seqs_m, pkl)
    #
    # with open(target_name_mecab, "rb") as pkl:
    #     pkl_load = pickle.load(pkl)
    # print(f'----Mecab pkl 라인 수 : {len(pkl_load)}----')

    # compare with the number of palette lines
    with open(target_palette_name, 'rb') as f:
        palette_data = pickle.load(f)
    print(f'----팔레트 라인 수 : {len(palette_data)}----')
print("%d번째 뉴스기사 크롤링 실패" % (i + 1)) else: #가져올 내용이 있다면 print("%d번째 뉴스기사 크롤링 성공" % (i + 1)) #수집결과에서 불필요한 HTML 태그 제거 for item in news_html: crawler.remove(item, "script") crawler.remove(item, "a") crawler.remove(item, "br") crawler.remove(item, "span", {"class": "end_photo_org"}) #공백을 제거한 텍스트만 미리 준비한 변수에 누적 news_content += item.text.strip() #4) 수집결과를 기반으로 형태소 분석 #형태소 분석 객체를 통해 수집된 뉴스 본문에서 명사만 추출 nlp = Okt() nouns = nlp.nouns(news_content) #명사들에 대한 빈도수 검사 count = Counter(nouns) #가장 많이 사용된 단어 100개 추출 most = count.most_common(100) #추출 결과를 워드클라우드에서 요구하는 형식으로 재구성 # → {"단어":빈도수, "단어":빈도수, ...} tags = {} lists = [] for n, c in most: if len(n) > 1: tags[n] = c
def insert_keyword():
    mongoDB = myMongoDB("CapstoneTest")
    okt = Okt()
    min_count = 1    # minimum word frequency (when building the graph)
    max_length = 10  # maximum word length
    string_idx = 0
    total_clean_sentence = []
    string_id = []
    stop_words = [
        '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나',
        '사람', '주', '섯알', '가운데', '보이', '아니', '등', '같', '우리', '때', '년',
        '가', '한', '지', '대하', '오', '말', '일', '김재', '종', '매사', '스스로',
        '하자', '그렇', '위하', '대한', '확', '관련', '이상', '미만', '경우', '텔레',
        '다시', '때문', '대규모', '뭔가', '디섐보', '퍼터', '제대로', '관', '지난',
        '비준', '지난해', '위해', '곳곳', '현재', '당일', '주요', '일대', '기', '날',
        '코로', '물이', '간사', '요즘', '거기', '내', '지금', '정도', '이번', '처음',
        '모두', '통해', '더욱', '앞서', '진짜', '거', '올레', '가가', '해도', '한번',
        '원래', '사실', '옆', '정말', '올해', '스', '민', '초', '최근', '앞', '역시',
        '이후', '군', '먼저', '노', '해당', '최고', '가장', '중', '양', '대해', '사이',
        '얼마', '아주', '대비', '셈', '각국', '실거주', '실수요자', '실', '대부분',
        '섯알', '셀', '내년', '유독', '언제', '문득', '늘', '다른', '동안', '덩',
        '역시', '당시', '최', '변', '살', '이번', '씨', '랄라블', '점차', '건수',
        '번', '쥴', '리', '상대로', '송', '이제', '매년', '곳', '오늘', '듯', '아무',
        '괜', '하나', '차지', '오히려', '순간', '속', '누군가', '밥주', '스마',
        '문하', '정유', '주얼', '좀더', '먼저', '디섐보', '일주', '것처',
        '에브리', '이전',  # comma added: the original "'에브리' '이전'" concatenated into one word
        '비대', '각종', '임', '누구', '일일', '필', '부', '트럼', '초등학', '이하',
        '에브리'
    ]

    for content in mongoDB.collected.find({}, {"_id": 1, "content": 1}):
        cleaned_sentence = []
        clean_sentence = []
        string_id.append(list(content.values())[0])
        string = list(content.values())[1]
        string = string.replace(u'\xa0', u' ')
        string = string.replace(u'\n', u' ')
        string = string.replace(u'\r', u' ')
        clean_sentence.append(sent_tokenize(string))
        for i in clean_sentence:
            for j in i:
                cleaned_sentence.append(j)
        total_clean_sentence.append(cleaned_sentence)

    for clean_sentence in total_clean_sentence:
        noun_keyword_list = []
        stop_keyword_list = []
        keyword_list = []
        wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)
        beta = 0.85
        max_iter = 10
        try:
            keywords, rank, graph = wordrank_extractor.extract(
                clean_sentence, beta, max_iter)
        except ValueError:
            mongoDB.collected.update_one({'_id': string_id[string_idx]},
                                         {'$set': {'keyword': 'keywords'}})
            string_idx += 1
            continue

        for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:]:
            keyword_list.append(word)
        for i in keyword_list:
            a = okt.pos(i)
            if a[0][1] == 'Noun':
                noun_keyword_list.append(a[0][0])
        for i in noun_keyword_list:
            if i not in stop_words:
                stop_keyword_list.append(i)
        if len(stop_keyword_list) == 0:
            stop_keyword_list.append('')

        s1 = set(stop_keyword_list)
        s1_list = list(s1)
        s2_list = s1_list[:5]
        mongoDB.collected.update_one(
            {'_id': string_id[string_idx]},
            {'$set': {'keyword': s1_list, 'point_keyword': s2_list}})
        string_idx += 1
# pip install konlpy  (shell command; run it outside Python, or as "!pip install konlpy" in a notebook)

# In[1]:

from konlpy.tag import Okt  # works on Windows and is the fastest of the available taggers
from konlpy.utils import pprint

# In[2]:

okt = Okt()

# In[3]:

pprint(okt.morphs(u'장대한 생물 진화사를 치킨으로 정리해버리는 씹좆간쉨ㅋㅋㅋㅋ'))

# In[4]:

pprint(okt.nouns(u'장대한 생물 진화사를 치킨으로 정리해버리는 씹좆간쉨ㅋㅋㅋㅋ'))

# In[5]:
# Corpus: a bundled collection of language samples
from konlpy.corpus import kobill  # corpus of National Assembly bills (politics)

# reading the files (1)
files_ko = kobill.fileids()
# print(files_ko)  # inspect the available documents

# reading the files (2)
doc_ko = kobill.open(r'mynews.txt').read()
# print(doc_ko)

# extract meaningful tokens (tokenize)
from konlpy.tag import Okt

t = Okt()
tokens_ko = t.morphs(doc_ko)  # split the document into tokens
print(tokens_ko)

import nltk

# nltk.Text lets us search whether a given word occurs
ko = nltk.Text(tokens_ko, name='원내대표')
print(ko)

print('토큰 정보 확인-----')
print(len(ko.tokens))        # 608 tokens
print(len(set(ko.tokens)))   # 293 unique tokens
fre_dist = ko.vocab()
print(fre_dist)  # <FreqDist with 293 samples and 608 outcomes>

# from matplotlib import rc
# rc('font', family='malgun gothic')
# ko.plot(50)
from konlpy.tag import Okt
import re
# import nltk
# nltk.download()
from nltk.tokenize import word_tokenize
import pandas as pd
from nltk import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np

okt = Okt()
ctx = '../data/'
filename = ctx + 'kr-Report_2018.txt'
with open(filename, 'r', encoding='utf-8') as f:
    texts = f.read()
print(texts[:300])

texts = texts.replace('\n', '')
# keep spaces and Hangul only (the original pattern '[^ ㄱ.힣]+' looks like a typo for '[^ ㄱ-힣]+')
tokenizer = re.compile('[^ ㄱ-힣]+')
texts = tokenizer.sub('', texts)
tokens = word_tokenize(texts)

noun_token = []
for token in tokens:
    token_pos = okt.pos(token)
    temp = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == "Noun"]
    if len(''.join(temp)) > 1:
        noun_token.append("".join(temp))
texts = " ".join(noun_token)

with open(ctx + 'stopwords.txt', 'r', encoding='UTF-8') as f:
import unittest
from konlpy.tag import Okt

'''class MyTestCase(unittest.TestCase):
    def test(self):
        okt = Okt()
        self.assertEqual(u'잃' in okt.morphs(u'잃어버리다'), True)


if __name__ == '__main__':
    unittest.main()'''

okt = Okt()
print(
    okt.morphs(u'9 월 22 일에 아이패드를 잃어버렸어요..... 찾게 도와주세요 ㅠㅠㅠㅠㅠㅠ',
               norm=True,
               stem=True))
from bs4 import BeautifulSoup as bs
from konlpy.tag import Okt
import requests

url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=161967&type=after&onlyActualPointYn=N&onlySpoilerPointYn=N&order=newest"
html = bs(requests.get(url).content, "html.parser", from_encoding="utf-8")
# total number of reviews, with thousands separators removed
cnt = html.select(
    "body > div > div > div.score_total > strong > em")[0].contents[0].replace(',', '')

lst = []
dict = {}
okt = Okt()  # KoNLPy morphological analyzer

# walk every page starting from page 1 so we can collect reviews written after the award date
for x in range(1, int(cnt) // 10 + 2):
    html = bs(requests.get(url + "&page=" + str(x)).content,
              "html.parser",
              from_encoding="utf-8")
    # loop once per review on the page (the last page may hold fewer than 10)
    size = len(
        html.select(
            "body > div > div > div.score_result > ul > li > div.score_reple > p"
        ))
    for i in range(1, size + 1):
        # crawl the date on which the user posted the review
        date = html.select(
            "body > div > div > div.score_result > ul > li:nth-of-type(" +
            str(i) + ") > div.score_reple > dl > dt > em:nth-of-type(2)")[0].contents[0]
def _add_tokens(df):
    # tokenize every review with Okt and store the token list in a new column
    okt = Okt()
    df['tokens'] = df['review'].map(lambda x: _get_tokens(okt, x))
    return df
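# _get_tokens() is referenced above but not defined in this excerpt. A plausible
# sketch under that assumption: morph-level tokenization with normalization and
# stemming, dropping single-character tokens.
def _get_tokens(okt, text):
    return [t for t in okt.morphs(str(text), norm=True, stem=True) if len(t) > 1]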
import nltk, re, pprint
from nltk.tokenize import word_tokenize
from urllib import request
from bs4 import BeautifulSoup
import pickle
from urllib.parse import urlencode, quote
import konlpy
from konlpy.tag import Okt

okt = Okt()
ROOT_URL = "http://korlex.pusan.ac.kr/search/WebApplication2/KorLex_SearchPage.aspx"

easy_vocab_list = []
with open('./TOPIC_vocab_list.csv', 'r') as csvfile:
    for line in csvfile.readlines():
        array = line.split(',')
        easy = array[1].split('0')[0]
        easy = easy.split('1')[0]
        easy_vocab_list.append(easy)

text_file = open("Textbook_middle.txt", "r")
text = text_file.read()
easy_corpus_list = text.split()


def preprocess_word(word):
    return okt.pos(word, stem=True)


def score_with_easy_list(word):
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from konlpy.tag import Okt
import matplotlib.pyplot as plt
from pprint import pprint
import pickle

data = pd.read_excel('./dataset.xlsx')
X = data['input']
Y = data['output']

X = X.str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")  # remove special characters
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도',
             '를', '으로', '자', '에', '와', '한', '하다']  # stopwords

okt = Okt()
x = []
for sentence in X:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True)  # tokenize
    temp_X = [word for word in temp_X if not word in stopwords]  # remove stopwords
    x.append(temp_X)
X = x

with open('sentences.txt', 'wb') as f:
    pickle.dump(X, f)

max_words = 3000
t = Tokenizer(num_words=max_words)  # keep only the 3000 most frequent words
t.fit_on_texts(X)
X = t.texts_to_sequences(X)
print(X[:5])
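# A possible next step (assumed, not in the snippet): pad the integer sequences to a
# fixed length with the pad_sequences imported above. max_len = 20 is an illustrative
# value, not taken from the source.
max_len = 20
X = pad_sequences(X, maxlen=max_len)
print(X.shape)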
import pymysql
import jieba
import re
import konlpy
from konlpy.tag import Okt
from collections import Counter

# encoding: utf-8
conn = pymysql.connect(               # open the database connection
    host='gujiakai.softether.net',    # IP/host of the machine running the database
    user='******',                    # database login user name
    password='******',                # password for that user
    database='library',               # database name (can also be chosen later with cursor.execute('use test_db'))
    charset='utf8mb4'                 # encoding; note it must not be written as 'utf-8'
)
t = Okt()
cursor = conn.cursor()
cursor.execute("select question from qna")
res = cursor.fetchall()
cursor.execute("delete from wc")

cut_words = ""
res = str(res)
nouns = t.nouns(res)
cut_words = ""
for con in nouns:
    scon = str(con)
    print(scon)
    cursor.execute("INSERT INTO wc (world) VALUES('%s')" % (scon))
conn.commit()
# imports assumed by this snippet (they are not shown in the original excerpt)
import re
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from konlpy.tag import Okt
from wordcloud import WordCloud


class UserData:
    def __init__(self):
        self.okt = Okt()

    def read_file(self):
        self.okt.pos("morph", stem=True)
        cheese_data = pd.read_csv("com_cheese_api/user/data/users.csv")
        cheese_lists = list(cheese_data['cheese_name'])
        texts = ''.join(cheese_lists)  # the original assigned to `tests` but returned `texts`
        # print(cheese_str)
        # print(type(cheese_str))
        return texts

    @staticmethod
    def extract_hangeul(texts):
        temp = texts.replace('\n', ' ')
        # keep spaces and Hangul (the original pattern r'[^ㄱ-힣]+' also stripped spaces,
        # which would leave word_tokenize with one giant token)
        tokenizer = re.compile(r'[^ ㄱ-힣]+')
        temp = tokenizer.sub('', temp)
        return temp

    @staticmethod
    def change_token(texts):
        tokens = word_tokenize(texts)
        return tokens

    def extract_noun(self):
        noun_tokens = []
        tokens = self.change_token(self.extract_hangeul(self.read_file()))
        for token in tokens:
            token_pos = self.okt.pos(token)
            temp = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == 'Noun']
            if len(''.join(temp)) > 1:
                noun_tokens.append("".join(temp))
        texts = " ".join(noun_tokens)
        return texts

    @staticmethod
    def download():
        nltk.download()

    @staticmethod
    def read_stopword():
        with open('com_cheese_api/user/data/stopword.txt', 'r') as file:
            lines = file.readlines()
            stop_str = ''.join(lines)
            stopword = stop_str.replace('\n', ' ')
            stopwords = stopword.split(' ')
        return stopwords

    def remove_stopword(self):
        texts = self.extract_noun()
        tokens = self.change_token(texts)
        stopwords = self.read_stopword()
        texts = [text for text in tokens if text not in stopwords]
        return texts

    def hook(self):
        texts = self.remove_stopword()
        freqtxt = pd.Series(dict(FreqDist(texts))).sort_values(ascending=False)
        print(freqtxt[:100])
        return freqtxt

    def draw_wordcloud(self):
        texts = self.remove_stopword()
        wcloud = WordCloud('/usr/share/fonts/truetype/nanum/NanumBarunGothicBold.ttf',
                           background_color='white', width=800, height=600)
        # the original called wc.generate_from_frequencies(dict(tags)) with undefined
        # names; generate the cloud from the token frequencies instead
        cloud = wcloud.generate_from_frequencies(dict(FreqDist(texts)))
        plt.figure(figsize=(10, 8))
        plt.axis('off')
        plt.imshow(cloud)
        plt.show()