def __init__(self, filepath, tagger=None):
    """Remember the file path and sentence splitter; default to a Twitter tagger."""
    # Fall back to a ckonlpy Twitter instance when no tagger is injected.
    self.tagger = tagger if tagger else Twitter()
    self.filepath = filepath
    # Sentence-final punctuation followed by a quote, or not followed by a digit.
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
def tokenize_okt_noscreen(df):
    """Tokenize the 'content' and 'title' columns with a user-dictionary tagger.

    Adds 'content_token' and 'title_token' columns and returns the same frame.
    """
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    #stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)
    # Same tokenization for both columns; bind the column name as a default
    # to avoid late-binding surprises.
    for src_col, dst_col in (('content', 'content_token'), ('title', 'title_token')):
        df[dst_col] = df.progress_apply(
            lambda row, c=src_col: text_tokenize(row[c], okt, stopwords), axis=1)
    return df
def __init__(self): self.reg_reporter = re.compile('[가-힣]+\s[가-힣]*기자') # 기자 self.reg_email = re.compile( '[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$') # 이메일 self.reg_eng = re.compile('[a-z]+') # 소문자 알파벳, 이메일 제거용, 대문자는 남겨둔다 self.reg_chi = re.compile("[\u4e00-\u9fff]+") # 한자 self.reg_sc = re.compile( "·|…|◆+|◇+|▶+|●+|▲+|“|”|‘|’|\"|\'|\(|\)|\W+") # 특수문자 self.reg_date = re.compile( '\d+일|\d+월|\d+년|\d+시|\d+분|\(현지시간\)|\(현지시각\)|\d+') # 날짜,시간,숫자 self.twitter_obj = Twitter() self.stopwords = [] self.noun_list = []
def __init__(self, textIter, tagger=None):
    """Prepare the tagger (with supplementary dictionary words) and the text.

    textIter: a string (split on newlines) or any iterable of lines.
    tagger:   optional pre-built tagger; defaults to ckonlpy Twitter.
    """
    # Preprocessing 1: register words the morphological analyzer fails to
    # recognize on its own.  The context manager fixes the original's leaked
    # file handle.
    with open('형태소 보완.txt') as f:
        a = f.read().split('\n')
    self.tagger = tagger if tagger else Twitter()
    self.tagger.add_dictionary(a, 'Noun')
    if type(textIter) == str:
        self.textIter = textIter.split('\n')
    else:
        self.textIter = textIter
    # Sentence-final punctuation followed by a quote, or not followed by a digit.
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
def run_twitter(news):
    """Extract nouns from *news* with the Twitter tagger, print timing, and
    dump the nouns to 'twitter_noun.txt' via the module's write_list helper."""
    twitter = Twitter()
    start_time = time.time()
    print('twitter 시작')
    # twitter_morphs = twitter.morphs(news)
    twitter_nouns = twitter.nouns(news)
    # twitter_pos = twitter.pos(news)
    end_time = time.time()
    # print(twitter_pos)
    print('twitter 끝 - %s 초' % str(end_time - start_time))
    with open('twitter_noun.txt', 'w', encoding='utf-8') as fstream:
        # fstream.write('twitter time : %s s\n' % str(end_time - start_time) )
        # fstream.write('twitter_morphs\n')
        # write_list(twitter_morphs, fstream)
        # fstream.write('\n\n')
        # fstream.write('twitter_nouns\n')
        write_list(twitter_nouns, fstream)
        fstream.write('\n\n')
def Tokenizer(data):
    """Tokenize the 'hashtags' column of *data* and collapse consecutive
    duplicate tokens.

    Returns a list of token lists, one per row of data.hashtags.
    """
    import pandas as pd
    from ckonlpy.tag import Twitter
    twitter = Twitter()
    # Register user-dictionary nouns, one per line of the dictionary file.
    txt = pd.read_csv('사용자 사전.txt', sep='\n')
    txt = txt['<사용자 사전>']
    for line in txt:
        # BUG FIX: the original passed the whole series (txt) on every
        # iteration instead of the current line.
        twitter.add_dictionary(line, 'Noun')
    # Join each row's hashtag list into one space-separated string.
    new_hashtags = data.hashtags.copy()
    for i in range(len(new_hashtags)):
        new_hashtags[i] = ' '.join(new_hashtags[i])
    tokenized = [twitter.morphs(sentence) for sentence in new_hashtags]
    # Remove consecutive duplicates while keeping one token per run.
    new_tokenized = []
    for tokens in tokenized:
        deduped = []
        for tok in tokens:
            # BUG FIX: the original loop (range(len-1), forward comparison)
            # always dropped the final run of each token list.
            if not deduped or deduped[-1] != tok:
                deduped.append(tok)
        new_tokenized.append(deduped)
    return new_tokenized
class RawTagger:
    """Iterates over the sentences of *textIter*, yielding POS-tagged tokens."""

    def __init__(self, textIter, tagger=None):
        # Preprocessing 1: register words the morphological analyzer fails to
        # recognize on its own.  The context manager fixes the original's
        # leaked file handle.
        with open('형태소 보완.txt') as f:
            a = f.read().split('\n')
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
        self.tagger.add_dictionary(a, 'Noun')
        if type(textIter) == str:
            self.textIter = textIter.split('\n')
        else:
            self.textIter = textIter
        # Sentence-final punctuation followed by a quote, or not followed by a digit.
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            # Re-join each text chunk with its trailing punctuation mark.
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s:
                    continue
                yield self.tagger.pos(s)
class RawTaggerReader:
    """Reads *filepath* line by line and yields POS-tagged sentences."""

    def __init__(self, filepath, tagger=None):
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
        self.filepath = filepath
        # Sentence-final punctuation followed by a quote, or not followed by a digit.
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        # Context manager fixes the original's leaked file handle
        # (the bare `for line in open(...)` never closed the file).
        with open(self.filepath, encoding='utf-8') as f:
            for line in f:
                ch = self.rgxSplitter.split(line)
                # Re-join each text chunk with its trailing punctuation mark.
                for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                    if not s:
                        continue
                    yield self.tagger.pos(s)
def tokenize_okt_noscreen(df):
    """Tokenize 'content' and 'title': stemmed nouns/verbs/adjectives, minus
    stopwords and single-character tokens; adds *_token columns and returns df."""
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    #stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)

    def _tokens(text):
        # Keep stemmed content words that are not stopwords nor one char long.
        return [word for word, tag in okt.pos(text, stem=True)
                if tag in ['Noun', 'Verb', 'Adjective']
                and word not in stopwords and len(word) != 1]

    df['content_token'] = df.progress_apply(lambda x: _tokens(x['content']), axis=1)
    df['title_token'] = df.progress_apply(lambda x: _tokens(x['title']), axis=1)
    return df
import time
import string
import datetime
import csv
from ckonlpy.tag import Twitter
from selenium import webdriver
from bs4 import BeautifulSoup

# Scrape the Cheonan COVID-19 page and write the nouns of each notice item to
# a timestamped CSV (cp949 encoding for Korean Excel).
driver = webdriver.Chrome("c:/Users/yooat/Downloads/chromedriver/chromedriver")
driver.get('http://www.cheonan.go.kr/covid19/sub02_01.do')
time.sleep(1)
twitter = Twitter()
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
name = soup.find_all("dl", class_="item")
f1 = open('corona.txt', 'w+t')
for test in name:
    # Skip entries containing the "14일이" (14-day) boilerplate notice.
    if "14일이" in test.get_text():
        continue
    f1.write(test.get_text() + "\n")
f1.close();
f1 = open('corona.txt', 'r')
nowDate = datetime.datetime.now()
# NOTE(review): neither this CSV handle nor f1 is closed afterwards — confirm
# that relying on interpreter exit is acceptable here.
c = csv.writer(open(nowDate.strftime("result_" + "%Y-%m-%d_%H-%M-%S") + ".csv", "w", encoding="cp949"))
for l in f1:
    c.writerow(twitter.nouns(l))
time.sleep(3)
import urllib.request
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import MaxScoreTokenizer
from soynlp.tokenizer import LTokenizer
# -*- coding: utf-8 -*-
from ckonlpy.tag import Twitter
from pykospacing import spacing  # automatic Korean spacing restoration

# Demo: delete every space from a sentence, restore spacing with pykospacing,
# then tokenize the restored sentence with the customizable Twitter tagger.
sent = "위 인수들을 사용할 때 고려해야 될점이있습니다. audio 데이터의 어떤 시점에 하나의 단어가 언급되고 있다면 그 단어는 잘려서 이상하게 인식될 것입니다. 이 harvard 데이터는 실험 목적으로 녹음된 것이기 때문에 초 단위로 잘라도 단어가 잘리지 않은 것 입니다."
new_sent = sent.replace(" ", '')
print(new_sent)
kospacing_sent = spacing(new_sent)
print(sent)
print(kospacing_sent)
# Optionally register specific words as nouns before tokenizing.
twitter = Twitter()
#twitter.add_dictionary('띄어쓰기', 'Noun')
print(twitter.morphs(kospacing_sent))
def token(self, title, ccontent, creplies):
    """Tokenize article titles, bodies and replies with a user-dictionary
    tagger, dropping 'Punctuation' tuples.

    Returns six parallel lists: POS-tagged and plain-morph versions of
    titles, bodies and replies.
    """
    memory = psutil.Process(os.getpid())  # NOTE(review): assigned but unused in this view
    T_OR_title = []
    T_title = []
    T_OR_ccontent = []
    T_ccontent = []
    T_OR_creplies = []
    T_creplies = []
    twitter = Okt()  # initialize the morphological analyzer
    twitter.add_dictionary('백래시', 'Noun')
    twitter.add_dictionary('문재앙', 'Noun')
    #### title tokenization
    #print('1')
    for i in range(len(title)):
        a = twitter.pos(title[i])
        b = []
        #print('title[i]',i,title[i])
        for j in range(len(a)):
            if a[j][1] != 'Punctuation':  # drop 'Punctuation' tuples (error workaround)
                b.append(a[j])
                #print('3',j)
        T_OR_title.append(b)
        T_title.append(twitter.morphs(title[i]))
        #### body (ccontent) tokenization
        try:
            c = twitter.pos(str(ccontent[i]))
            d = []
            # print('ccontent[i]',i, ccontent[i])
            for w in range(len(c)):
                if c[w][1] != 'Punctuation':  # drop 'Punctuation' tuples (error workaround)
                    d.append(c[w])
                    #print('4',w)
            T_OR_ccontent.append(d)
            T_ccontent.append(twitter.morphs(str(ccontent[i])))
        except RuntimeError as e:
            # Keep list lengths aligned when tagging fails.
            T_OR_ccontent.append('')
            T_ccontent.append(twitter.morphs(''))
        ### reply tokenization
        #print('creplies[i]',i,creplies[i])
        if type(creplies[i]) == str:  # single string reply
            a = [creplies[i]]  # wrap the string in a list
            e = twitter.pos(str(a))
            f = []
            for u in range(len(e)):
                if e[u][1] != 'Punctuation':
                    f.append(e[u])
                elif e[u][1] != 'KoreanParticle':
                    # NOTE(review): this branch runs only when the tag IS
                    # 'Punctuation', and 'Punctuation' != 'KoreanParticle' is
                    # always true — so every token is appended and the break
                    # below is unreachable.  Confirm the intended filter.
                    f.append(e[u])
                else:
                    break
                #print('5',u)
            T_OR_creplies.append(f)
            # NOTE(review): a second, unfiltered append for the same reply —
            # T_OR_creplies gets two entries per string reply; confirm intent.
            T_OR_creplies.append(twitter.pos(str(a)))
            T_creplies.append(twitter.morphs(str(a)))
        else:
            temp = []
            temp2 = []
            x = []
            # NOTE(review): x is shared across all replies of this article, so
            # each temp entry accumulates earlier replies' tokens — confirm.
            for n in range(len(creplies[i])):  ### replies arriving as a list
                h = twitter.pos(creplies[i][n])
                #print('6',n)
                for z in range(len(h)):
                    if h[z][1] != 'Punctuation':
                        x.append(h[z])
                    elif h[z][1] != 'KoreanParticle':
                        # NOTE(review): same always-true condition as above.
                        x.append(h[z])
                    else:
                        break
                    # print('7',z)
                    # print('8',)
                    #print('h',z,h)
                temp.append(x)
                temp2.append(twitter.morphs(creplies[i][n]))
            T_OR_creplies.append(temp)
            T_creplies.append(temp2)
    return T_OR_title, T_title, T_OR_ccontent, T_ccontent, T_OR_creplies, T_creplies
# Spell-check demo: hanspell's spell_checker fixes the sentences, then a
# user-dictionary Twitter tagger tokenizes text containing new proper nouns.
spelled_sent = spell_checker.check(sent)
hanspell_sent = spelled_sent.checked
print(hanspell_sent)
test_sent = spell_checker.check(sent2)
test2 = test_sent.checked
print(test2)
# !pip install konlpy
# !pip install customized_konlpy
from ckonlpy.tag import Twitter
twitter = Twitter()
twitter.morphs('은경이는 사무실로 갔습니다.')
twitter.add_dictionary('은경이', 'Noun')
print(twitter.morphs('은경이는 사무실로 갔습니다.'))
# Final exercise of the day: grammar-check an ill-formed Korean sentence,
# then tokenize it into training data and labels ready for an RNN.
test = "RNN신경망은 너무어려워서 하나도모르 겠습니다.\n 외않되는지매 일공부해 봐도모르 겠습니다.\n 살려주세 요."
gram_test = spell_checker.check(test)
rs_test = gram_test.checked
print(rs_test)
from tensorflow.keras.utils import to_categorical
#%% import numpy as np import pandas as pd from collections import Counter from pathlib import Path import argparse from ckonlpy.tag import Twitter from ckonlpy.tag import Postprocessor twitter = Twitter() import ast # 각 attr keyword가 review에 포함되면 각 attr에 해당하는 review들을 분류 #%% attr_df = pd.read_excel(Path( "G:/공유 드라이브/속성사전/Data/Electronics/RiceCooker/2. Seed D_전기밥솥_copy_jk_20201130.xlsx" ), sheet_name='1B') review_df = pd.read_csv( Path("G:/공유 드라이브/속성사전/Data/Electronics/RiceCooker/review_20201129.csv")) review_list = [(body, rating) for body, rating in zip(review_df['body'], review_df['rating'])] # %% #attr별로 리뷰 모아주려고 만든 dict cooker_review = {} for attr in attr_df['attrs']: cooker_review[attr] = [] #attr별로 filter 1차로 걸 키워드 모아놓은 dict
import math
import pickle
from tqdm import tqdm
from datetime import datetime
from pymongo import MongoClient
from collections import Counter
from konlpy.tag import Okt; okt = Okt()
from ckonlpy.tag import Twitter; spliter = Twitter()

def AddWord(add_word, pos):
    """Register *add_word* under POS tag *pos* in the shared tagger."""
    spliter.add_dictionary(add_word, pos)

def Tokenize_list(docs_list):  # must input list [ , ] type
    """Return the noun tokens of every document in *docs_list*."""
    tokenize_ = []
    for doc_ in docs_list:
        tokenize_.append(spliter.nouns(doc_))
        # tokenize_.append(spliter.morphs(doc_))
    return tokenize_

def Tokenize_list_morphs(docs_list):  # must input list [ , ] type
    """NOTE(review): despite the name this also calls spliter.nouns(), making
    it identical to Tokenize_list — the commented-out line suggests it was
    meant to call spliter.morphs(); confirm and fix."""
    tokenize_ = []
    for doc_ in docs_list:
        tokenize_.append(spliter.nouns(doc_))
        # tokenize_.append(spliter.morphs(doc_))
    return tokenize_

# NOTE(review): the body of Tokenize_dict continues beyond this chunk.
def Tokenize_dict(docs_dict, keyname = 'content'):
def __init__(self):
    """Create the shared ckonlpy Twitter tagger used by this class."""
    self.twitter = Twitter()
class Social_analysis():
    """Loads social-media posts (pickle or MSSQL), tokenizes them with a
    customizable Twitter tagger, and provides list-shaping utilities."""

    # Maps every character outside the BMP (e.g. emoji) to U+FFFD.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        """Load a pickled table, clean hashtag/list columns, store an ndarray."""
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]  # drop the header row
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def DB_to_table(self, DBname='intake', keyword='intake'):
        """Fetch NaverBlogReview rows for *keyword* from MSSQL into raw_data.

        NOTE(review): the keyword is interpolated into the SQL string — use a
        parameterized query if keyword can ever come from untrusted input.
        """
        self.query = \
            """
            SELECT keyword, created_at, post_name, main_text, current_url
            FROM NaverBlogReview
            WHERE keyword = '{}'
            """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433",
            "gh", "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#',' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        # NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0 —
        # kept for the environment this was written for; .values is the
        # forward-compatible replacement.
        self.raw_data = df.as_matrix()

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        """Register keywords (strings, or (word, tag) tuples) in the tagger."""
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):
        """POS-tag each text, keeping Hangul tokens longer than one character
        (or whitelisted single characters), bucketed by coarse POS.

        NOTE(review): tokens tagged 'Verb' are collected into adj_list and
        'Adjective' into verb_list — the buckets look swapped; kept as-is to
        preserve downstream behavior, but confirm.
        """
        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []
            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            adj_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            verb_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))
            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
        nav_list = noun_list + adj_list + verb_list
        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        """Flatten a list of token lists into one list."""
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        """Join each token list into a single space-separated string."""
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        """Split each 'a/b/c' string back into a token list.

        BUG FIX: the original referenced the undefined names 'untokenized'
        and 'hastag_splited', raising NameError on every call.
        """
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        """Replace synonym tokens by their representative ('main') token."""
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]
        del sub_book
        gc.collect()
        return dataset

    def word_delete(self, dataset, del_list):
        """Drop every token in *del_list* from each document (deep-copied)."""
        dataset = copy.deepcopy(dataset)
        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]
        return dataset

    def isHangul(self, text):
        """Return True if *text* contains at least one Hangul character."""
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
from ckonlpy.tag import Twitter
from konlpy.tag import Hannanum, Kkma, Komoran, Okt
from eunjeon import Mecab

# Compare noun extraction across Korean morphological analyzers on one sentence.
test_text = "확진자와 접촉자는 다중이용시설 이용을 삼가하고, 사회적 거리두기 운동에 동참하며, 진료소와 마스크 착용을 자제해주시기 바랍니다."

# Customized Konlpy — register compound words the default tagger misses.
twitter = Twitter()
twitter.add_dictionary(["확진자", "접촉자", "다중이용시설", "사회적", "거리두기", "진료소"], "Noun")
twitter.add_dictionary(["드립니다", "하시기", "해주시고", "해주시기", "지켜주십시오"], "Verb")
print(f"Customized Konlpy : {twitter.nouns(test_text)}")
# Hannanum
hannanum = Hannanum()
print(f"Hannanum : {hannanum.nouns(test_text)}")
# Kkma
kkma = Kkma()
print(f"Kkma : {kkma.nouns(test_text)}")
# Komoran
komoran = Komoran()
print(f"Komoran : {komoran.nouns(test_text)}")
# Okt
okt = Okt()
print(f"Okt : {okt.nouns(test_text)}")
# Mecab
mecab = Mecab()
print(f"Mecab : {mecab.nouns(test_text)}")
def naver():
    """Scrape today's Naver ranking news, save the articles to CSV, and render
    a noun-frequency word cloud image.

    NOTE(review): relies on module-level names (Main, chromedriver, requests,
    BeautifulSoup, pd, cleanText, WordCloud helpers) defined elsewhere in the
    file.
    """
    from selenium import webdriver
    import re
    from selenium.webdriver.common.keys import Keys
    import time
    cr_name = 'naver'
    # Ensure the image save directory exists.
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)
    # Ensure the text save directory exists.
    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)
    # Fetch the Naver headline ranking page for today.
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))
    result = []
    res = []
    # Browser setup.
    chrome = chromedriver.generate_chrome(driver_path=Main.driver_path,
                                          headless=Main.headless,
                                          download_path=Main.DOWNLOAD_DIR)
    print("Naver 접속중")
    # driver = webdriver.Chrome(executable_path="./chromedriver.exe")
    # driver.implicitly_wait(30)
    url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date={}'.format(
        date)
    chrome.get(url)
    time.sleep(2)
    # scroll(3)
    # Collect anchor elements from ranking sections 4..9.
    for sun in range(4, 10):
        pr = chrome.find_elements_by_xpath(
            '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
        for p in pr:
            result.append(p.find_elements_by_tag_name('a'))
    # print(result)
    for i, q in enumerate(result):
        for e in q:
            res.append(e.get_attribute('href'))
    http = list(set(res))  # de-duplicate the links
    len(http)  # NOTE(review): no-op expression, result unused
    # Keep only article links (drop the ranking pages themselves).
    https = []
    for idx in range(len(http)):
        if http[idx].find('popularDay') >= 0:
            continue
        else:
            https.append(http[idx])
    # Download each article and collect title / cleaned body / link rows.
    files = pd.DataFrame()
    for i in range(len(https)):
        res = requests.get(https[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('._article_body_contents')
        files = files.append(
            pd.DataFrame(
                {
                    'Title': soup.find('div', attrs={
                        'class': 'article_info'
                    }).h3.text,
                    'Contents': re.sub(
                        ' ', '',
                        re.sub(
                            ' ', '',
                            re.sub(
                                '\t', '',
                                cleanText(body[0].text)
                                [(cleanText(body[0].text)).find('{}') + 2:]))),
                    'link': https[i]
                },
                index=[i]))
    text2 = files.Contents
    # Save the articles as CSV.
    files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(date2),
                 index=False, encoding='utf-8')
    # -------------------------------------
    # Build the user dictionary and extract nouns.
    from ckonlpy.tag import Twitter
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')
    import nltk
    tokens_ko = []
    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])
    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)
    # Keep only tokens at least two characters long.
    data_1 = []
    for i in range(len(data)):
        for q in range(0, 1, 1):  # NOTE(review): single-iteration loop, no effect
            if len(data[i][0]) >= 2:
                data_1.append(data[i])
    # Render and save the word cloud.
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    import time
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))
    tmp_data = dict(data_1)
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    # NOTE(review): 'bbox_inces' looks like a typo for 'bbox_inches' — confirm
    # against the matplotlib savefig signature.
    plt.savefig(save_path + "/naver_{}.png".format(date),
                bbox_inces='tight', dpi=400, pad_inches=0)
class PreprocessingText:
    """Cleans raw Korean news text with compiled regexes and tokenizes it into
    nouns using a user-extendable Twitter tagger."""

    def help(self):
        """Print a short Korean usage guide for this class."""
        print("******PreprocessingText******")
        print("1) make_content_re(df['컬럼이름'](Series)) : 입력받은 열을 전처리 후 시리즈로 반환")
        print("2) add_noun_dict('list') : 명사 사전에 단어 추가")
        print("3) add_stopwords('list') : 불용어 사전에 단어 추가")
        print("4) tokenize(df['컬럼이름'](Series)) : 입력받은 열을 토큰화한 후 시리즈로 반환")
        print(
            "5) change_similar_words(토큰화된 문서(Series), 유의어 사전(dictionary)) : 유의어 사전을 기반으로 문서 내 유의어를 대표어로 변환하고, 변환된 문서를 시리즈로 반환한다."
        )
        print("*****************************")

    def __init__(self):
        """Compile the cleaning regexes and set up the tagger and word lists."""
        self.reg_reporter = re.compile('[가-힣]+\s[가-힣]*기자')  # reporter by-line
        self.reg_email = re.compile(
            '[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')  # e-mail address
        self.reg_eng = re.compile('[a-z]+')  # lowercase latin; uppercase is kept
        self.reg_chi = re.compile("[\u4e00-\u9fff]+")  # Chinese characters
        self.reg_sc = re.compile(
            "·|…|◆+|◇+|▶+|●+|▲+|“|”|‘|’|\"|\'|\(|\)|\W+")  # special characters
        self.reg_date = re.compile(
            '\d+일|\d+월|\d+년|\d+시|\d+분|\(현지시간\)|\(현지시각\)|\d+')  # dates/times/numbers
        self.twitter_obj = Twitter()
        self.stopwords = []
        self.noun_list = []

    def preprocessing(self, doc):
        """Strip by-lines, e-mails, lowercase latin, hanja, special characters
        and date/number tokens from one document, in that order."""
        cleaned = doc
        for pattern, repl in ((self.reg_reporter, ''),
                              (self.reg_email, ''),
                              (self.reg_eng, ''),
                              (self.reg_chi, ''),
                              (self.reg_sc, ' '),
                              (self.reg_date, '')):
            cleaned = re.sub(pattern, repl, cleaned)
        return cleaned

    def make_content_re(self, data):
        """Apply preprocessing() to every row of the input Series."""
        return data.apply(self.preprocessing)

    def add_noun_dict(self, noun_list):
        """Register extra nouns in the tagger dictionary and remember them."""
        self.twitter_obj.add_dictionary(noun_list, 'Noun')
        self.noun_list.extend(noun_list)
        print("추가한 명사")
        print(noun_list)

    def add_stopwords(self, stopword_list):
        """Extend the stopword list used by tokenize()."""
        self.stopwords.extend(stopword_list)
        print("추가한 불용어")
        print(stopword_list)

    def change_similar_words(self, tokenized_docs, similar_words_dict):
        """Map every token to its representative word where the synonym
        dictionary has an entry; unknown tokens pass through unchanged."""
        return [[similar_words_dict.get(word, word) for word in doc]
                for doc in tokenized_docs]

    def tokenize(self, data):
        """Extract nouns from each document, lowercase them, and drop
        stopwords; returns a pandas Series of token lists."""
        print('추가한 명사:', self.noun_list)
        print('불용어: ', self.stopwords)
        tokens = data.apply(self.twitter_obj.nouns)
        filtered = tokens.apply(
            lambda doc: [tok.lower() for tok in doc if tok not in self.stopwords])
        return pd.Series(filtered)
idx2word = {idx: word for word, idx in word2idx.items()} # 가장 긴 샘플의 길이 story_max_len = np.max(story_len) question_max_len = np.max(question_len) return word2idx, idx2word, story_max_len, question_max_len # %% word2idx, idx2word, story_max_len, question_max_len = preprocess_data( train_data, test_data) print(word2idx) twitter = Twitter() twitter.add_dictionary('은경이', 'Noun') twitter.add_dictionary('경임이', 'Noun') twitter.add_dictionary('수종이', 'Noun') # print(twitter.morphs('은경이는 화장실로 이동했습니다.')) # print(twitter.morphs('경임이는 정원으로 가버렸습니다.')) # print(twitter.morphs('수종이는 복도로 뛰어갔습니다.')) # print(twitter.morphs('필웅이는 부엌으로 복귀했습니다.')) # print(twitter.morphs('수종이는 사무실로 갔습니다.')) # print(twitter.morphs('은경이는 침실로 갔습니다.')) def tokenize(sent): return twitter.morphs(sent)
import string
import csv
from ckonlpy.tag import Twitter

# Tokenize each line of Han.txt and write the morphemes as CSV rows.
twitter = Twitter()
# Context managers ensure both handles are flushed and closed (the original
# leaked both); newline='' is the csv-module documented mode for writers.
with open("Han.txt", mode="r", encoding="utf-8") as f, \
        open("HanKeoRyeKonlPy.csv", "w", encoding="utf-8", newline="") as out:
    writer = csv.writer(out)
    for t in f:
        writer.writerow(twitter.morphs(t))
class Social_analysis():
    """Loads pickled social-media posts, tokenizes them with a customizable
    Twitter tagger, and provides list-shaping utilities."""

    # Maps every character outside the BMP (e.g. emoji) to U+FFFD.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        """Load a pickled table, clean hashtag/list columns, store an ndarray."""
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]  # drop the header row
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        """Split '/'-joined hashtag strings and keep only Hangul tags."""
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))
        hashtags_list = []
        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    t_hashtags = j.translate(self.non_bmp_map)
                    temp.append(t_hashtags)
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list
        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        """Register keywords (strings, or (word, tag) tuples) in the tagger."""
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):
        """POS-tag each text, keeping Hangul tokens longer than one character
        (or whitelisted single characters), bucketed by coarse POS.

        NOTE(review): tokens tagged 'Verb' go into adj_list and 'Adjective'
        into verb_list — the buckets look swapped; kept as-is to preserve
        downstream behavior, but confirm.
        """
        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        for j in text_list:
            parsed = self.twitter.pos(j)
            temp = []
            n_temp = []
            adj_temp = []
            verb_temp = []
            for i in parsed:
                if self.isHangul(i[0]):
                    if ((len(i[0]) > 1) or (i[0] in exception_list)):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            adj_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            verb_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))
            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
        nav_list = noun_list + adj_list + verb_list
        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        """Flatten a list of token lists into one list."""
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        """Join each token list into a single space-separated string."""
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        """Split each 'a/b/c' string back into a token list.

        BUG FIX: the original referenced the undefined names 'untokenized'
        and 'hastag_splited', raising NameError on every call.
        """
        hashtag_splited = []
        for idx, i in enumerate(untokenized_list):
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        """Replace synonym tokens by their representative ('main') token."""
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]
        del sub_book
        gc.collect()
        return dataset

    def word_delete(self, dataset, del_list):
        """Drop every token in *del_list* from each document (deep-copied)."""
        dataset = copy.deepcopy(dataset)
        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]
        return dataset

    def isHangul(self, text):
        """Return True if *text* contains at least one Hangul character."""
        encText = text
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', encText))
        return hanCount > 0
def kor_preprocessing(q, q3, df):
    """Normalize, space-correct and spell-check Korean reviews, then tokenize
    them with Okt minus stopwords; results are pushed onto queues q and q3.

    NOTE(review): relies on module-level names (Okt, Twitter, Spacing,
    spell_checker, only_hangle, emoticon_normalize, pd) from elsewhere.
    """
    data = df.copy().reset_index(drop=True)
    temp = []
    data = data.str.join('').str.replace(r"\n", "")
    data = data.str.replace(pat=r'[^\w]', repl=r'', regex=True)
    for i in range(len(data)):
        okt = Okt()
        new = okt.normalize(data[i])  # normalization
        new = only_hangle(new)
        new = emoticon_normalize(new, num_repeats=2)  # ㅋㅋㅋㅋㅋㅋ -> ㅋㅋ, ㅠㅠㅠㅠ -> ㅠㅠ
        data[i] = data[i].replace(" ", '')
        spacing = Spacing()
        new = spacing(data[i])  # apply spacing correction
        try:
            new = spell_checker.check(new).checked  # typo correction
        except:
            print(new)
        temp.append(new)
    data = pd.Series(temp)
    # Register slang / neologisms in the user dictionary.
    token = Twitter()
    adding_noun = [
        '식후감', '존맛', '개존맛', '꿀맛', '짱맛', '요기요', 'ㅈㅁㅌ', 'ㅃㄲ', '소확행', '민초',
        '치밥', '소맥', '넘사벽', '순삭', '빛삭', '광삭', '반반무', '반반무마니', '솔까말', '스압',
        '썸남', '썸녀', 'jmt', 'jmtg', 'jmtgr', 'JMT', 'JMTG', 'JMTGR', '배불띠', '돈쭐',
        '쿨타임', '닥추', '강추', '유튜버', '홧팅', '팟팅', '단짠단짠', '단짠', '맵단', '맵달',
        '맛도리', '부조캐', '밍밍쓰', '노맛', '존노맛', '최애', '차애', '섭스', '서빗', '프레젠또',
        '존맛탱', '개존맛탱', '존맛탱구리', '킹맛', '댕맛', '뿌링클', '로제', '오레오', '로투스',
        '사장님', '싸장님',
        # BUG FIX: the original wrote '사장뉨' '소소한' (missing comma), which
        # silently concatenated them into the single entry '사장뉨소소한'.
        '사장뉨', '소소한',
        # NOTE(review): ' 프라푸치노' keeps its leading space from the original — confirm.
        '프라프치노', ' 프라푸치노', '갓성비', '커엽', '굳잡', '굿잡', '굳굳', '이벵트', '이벵'
    ]
    for i in adding_noun:
        token.add_dictionary(i, 'Noun')  # add nouns
    # NOTE(review): the comment in the original said "add verbs" but these are
    # registered as 'Noun' — kept as-is to preserve behavior; confirm intent.
    adding_verb = ['맛나', '마이쩡', '마이쪙', '마시땅', '마시쩡', '마시쪙']
    for i in adding_verb:
        token.add_dictionary(i, 'Noun')
    token.add_dictionary('잘', 'Noun')
    # NOTE(review): token is immediately rebound to Okt(), so the custom
    # dictionary built above is never used by the tokenization below — confirm.
    token = Okt()
    # Load the stopword list.
    with open('stop.txt', 'rt', encoding='UTF8') as f:
        stopwords = f.read().replace('\n', ' ')
    stopwords = stopwords.split(' ')
    result = []
    for i in range(len(data)):
        review = data[i]
        temp = (token.morphs(review, norm=True, stem=True))
        stopwords_removed_sentence = [
            word for word in temp if not word in stopwords
        ]  # drop stopwords
        sentence = ''
        for s in stopwords_removed_sentence:
            sentence = sentence + ' ' + s
        result.append(sentence)
    q.put(result)
    q3.put(df)
from itertools import repeat
import re
from fileIO import openJsonFile, closeJsonFile, saveError
from dbIO import readDB, insertDB
import nltk
from nltk.corpus import stopwords
from konlpy.tag import Okt
from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_wordset, load_ngram

# nltk.download('punkt')
# nltk.download('stopwords')

# Module-level tokenizers and cleansing resources.
okt = Okt()
twitter = Twitter()
stopwordsKR = load_wordset('cleansing_data/korean_stopwords.txt', encoding='ANSI')
customStopwordsEN = load_wordset('cleansing_data/english_stopwords.txt', encoding='ANSI')
stopwordsEN = customStopwordsEN.union(set(stopwords.words('english')))
ngrams = load_ngram('cleansing_data/korean_ngram.txt')
userdicts = load_wordset('cleansing_data/korean_user_dict.txt')
twitter.add_dictionary(list(userdicts), 'Noun', force=True)

def getJobGroups():
    """Fetch the Wanted.co.kr job-listing page HTML.

    NOTE(review): the function body continues beyond this chunk, and
    'requests' is not imported in the visible code — confirm.
    """
    res = requests.get(
        'https://www.wanted.co.kr/wdlist/518?country=kr&job_sort=job.latest_order&years=-1&locations=all'
    )
    html = res.text
def comm_date(comm_name, dates_array):
    """Tokenize community posts per date and pickle the accumulated nouns.

    NOTE(review): this fragment appears corrupted/redacted — the MongoClient
    URI (credentials masked with *****) runs directly into what looks like a
    regex character class, and 'text', 'i', 'tokened_texts' and
    'idate_with_all' are used without being defined in the visible code.
    Reproduced as received.
    """
    for dates in dates_array:
        client = MongoClient('mongodb://*****:*****@\·\"\"\%\,\(\)\&]+', ' ', text)
        text = re.sub('[\n\xa0\r]+', ' ', text)
        # tokenization: nouns only
        token = twitter.nouns(text)
        if token != []:
            tokened_texts.extend(token)
        print(dates, i, '/', len(idate_with_all))
        pickle_name = str(comm_name) + str(dates)
        with open(pickle_name, "wb") as fw:
            pickle.dump(tokened_texts, fw)
        print('저장완료')
import pandas as pd #from konlpy.tag import Okt from ckonlpy.tag import Twitter from collections import Counter import os import re print(os.getcwd()) client = MongoClient('mongodb://13.125.221.134:9046') db = client.mongodb # 컬렉션 객체 가져오기 # ilbe_coll = db['cleaned_ilbe'] coll = db['realnavernews'] twitter = Twitter() keyword = '회담' #cursor = coll.find({'cno' : {'$regex' : keyword}}).limit(1) cursor = coll.find({}).sort([('_id', 1)]) def news_check(): for text in cursor: yield text gen = news_check()
def twitter():
    """Scrape tweets for the Main keyword, save the raw text, and render a
    noun-frequency word cloud image.

    NOTE(review): relies on module-level names (Main, chromedriver, Keys,
    tqdm, re, os, Twitter, WordCloud, plt) defined elsewhere in the file.
    """
    cr_name = 'twitter'
    # Ensure the image save directory exists.
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)
    # Ensure the text save directory exists.
    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)
    import time
    import nltk
    keyword = Main.text()
    # Browser setup and search-page load.
    chrome = chromedriver.generate_chrome(
        driver_path=Main.driver_path,
        headless=Main.headless,
        download_path=Main.DOWNLOAD_DIR)
    print("Twitter 접속중")
    url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
    chrome.get(url)
    time.sleep(3)
    body = chrome.find_element_by_css_selector('body')
    text2 = chrome.find_elements_by_css_selector('#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div')
    # BUG FIX: 'result' was used below without ever being initialized, which
    # raised NameError at runtime.
    result = []
    # Scroll to load more tweets, then collect their text.
    for i in range(10):
        for q in range(3):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
    for ttt in tqdm(text2):
        result.append(re.sub('\n', '', ttt.text))
    # Build the user dictionary and extract nouns.
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')
    tokens_ko = []
    for i in range(len(result)):
        tokens_ko.append(t.nouns(result[i]))
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])
    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))
    # Save the raw tweets; the context manager fixes the original's explicit
    # open/close pair.
    with open(text_save_path + '/twitter{}.txt'.format(date2), 'w',
              encoding='utf-8') as file:
        for review in result:
            file.write(review + '\n')
    # Render and save the word cloud.
    tmp_data = dict(data)
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off'), plt.xticks([]), plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    # NOTE(review): 'bbox_inces' looks like a typo for 'bbox_inches' — kept to
    # preserve current output; confirm against matplotlib's savefig signature.
    plt.savefig(save_path + "/twitter_{}.png".format(date),
                bbox_inces='tight', dpi=400, pad_inches=0)
# NOTE(review): read_data, TEST_FILE and the train_* variables are defined
# earlier in the file, outside this chunk.
test_stories, test_questions, test_answers = read_data(TEST_FILE)
print('훈련용 스토리의 개수 :', len(train_stories))
print('훈련용 질문의 개수 :', len(train_questions))
print('훈련용 답변의 개수 :', len(train_answers))
print('테스트용 스토리의 개수 :', len(test_stories))
print('테스트용 질문의 개수 :', len(test_questions))
print('테스트용 답변의 개수 :', len(test_answers))
# Peek at one sample (no-op expressions outside a REPL/notebook).
train_stories[3572]
train_questions[3572]
train_answers[3572]
# Demonstrate tokenization before and after adding names to the dictionary.
twitter = Twitter()
print(twitter.morphs('은경이는 화장실로 이동했습니다.'))
print(twitter.morphs('경임이는 정원으로 가버렸습니다.'))
print(twitter.morphs('수종이는 복도로 뛰어갔습니다.'))
print(twitter.morphs('필웅이는 부엌으로 복귀했습니다.'))
print(twitter.morphs('수종이는 사무실로 갔습니다.'))
print(twitter.morphs('은경이는 침실로 갔습니다.'))
twitter.add_dictionary('은경이', 'Noun')
twitter.add_dictionary('경임이', 'Noun')
twitter.add_dictionary('수종이', 'Noun')
print(twitter.morphs('은경이는 화장실로 이동했습니다.'))
print(twitter.morphs('경임이는 정원으로 가버렸습니다.'))
print(twitter.morphs('수종이는 복도로 뛰어갔습니다.'))
# Morphological analysis: tokenize a news article and dump a numbered
# vocabulary to JSON.
import os
import json
#from konlpy.tag import Okt
from ckonlpy.tag import Twitter

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Context manager replaces the original's explicit open/close pair.
with open(os.path.join(BASE_DIR + '/t05/news1.txt'), 'r', encoding='UTF8') as file:
    text = file.read()

#okt = Okt()
twitter = Twitter()
twitter.add_dictionary('K리그', 'Noun')
content = twitter.morphs(text)

# Number tokens starting at 1, replacing the original manual counter.
voca_dict = dict(enumerate(content, start=1))

with open(os.path.join(BASE_DIR + '/t06', 'vocab.json'), 'w+',
          encoding='UTF-8-sig') as json_file:
    json.dump(voca_dict, json_file, ensure_ascii=False)