import os
import shutil
import sqlite3
from collections import Counter

import matplotlib.pyplot as plt
from django.shortcuts import render
from kiwipiepy import Kiwi
from wordcloud import WordCloud


def index(request):
    context = {}

    # Build the path to the Chrome History database
    homepath = os.path.expanduser("~")
    abs_chrome_path = os.path.join(homepath, 'AppData', 'Local', 'Google',
                                   'Chrome', 'User Data', 'Default', 'History')

    # Copy the History file (the live database may be locked while Chrome is running)
    shutil.copyfile(abs_chrome_path, abs_chrome_path + "_sample")

    # Extract search terms from the copy
    con = sqlite3.connect(abs_chrome_path + "_sample")
    cursor = con.cursor()
    cursor.execute("SELECT term FROM keyword_search_terms")
    term_data = cursor.fetchall()

    # Morphological analysis
    kiwi = Kiwi()
    kiwi.prepare()
    word_list = []
    for term in term_data:
        for word, tag, _, _ in kiwi.analyze(term[0], top_n=1)[0][0]:
            if tag in ['NNG', 'NNP', 'NNB', 'SL']:
                word_list.append(word)

    # Count word frequencies
    counts = Counter(word_list)
    tags = counts.most_common()

    # Word cloud
    mask = plt.imread("./static/images/mask.jpg")
    wc = WordCloud(font_path='./static/webfonts/NanumBarunGothicBold.ttf',
                   background_color='white', width=800, height=800, mask=mask)
    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud, interpolation="bilinear")
    plt.savefig("./static/images/wordcloud_keyword.png", dpi=300, bbox_inches='tight')

    # Top 9 words
    top9_list = []
    for rank in range(9):
        top9 = {}
        top9['rank'] = rank + 1
        top9['word'] = tags[rank][0]
        top9['count'] = tags[rank][1]
        top9_list.append(top9)
    context['top9'] = top9_list

    return render(request, 'mainapp/index.html', context)
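A minimal sketch of wiring this view into a Django URLconf. The app name "mainapp" follows the template path above, but this urls.py layout is an assumption for illustration, not taken from the source.

# mainapp/urls.py (hypothetical wiring for the index view above)
from django.urls import path
from . import views

urlpatterns = [
    path('', views.index, name='index'),  # serves the word-cloud page
]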
from kiwipiepy import Kiwi

# EXAMPLES and the environ fixture come from the surrounding test module
def test_kiwi(environ):
    _, tagger = environ
    kiwi = Kiwi()
    kiwi.prepare()
    for _, line in EXAMPLES:
        res1 = tagger.tagSentence(line)[0]
        res2 = kiwi.analyze(line)
        res1 = [(m.surface, m.originalTag) for w in res1 for m in w]
        res2 = [m[:2] for m in res2[0][0]]
        assert res1 == res2
from kiwipiepy import Kiwi

# ReaderExam is a project-local corpus reader
def prepare_kiwi(train_file):
    """
    input: train file, i.e. corpora
    output: kiwi model
    """
    numThread = 4
    kiwi = Kiwi(numThread)
    reader = ReaderExam(train_file)
    minCount = 5
    maxWordLength = 6
    minScore = 0.25
    kiwi.extractWords(reader.read, minCount, maxWordLength, minScore)
    kiwi.prepare()
    return kiwi
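A hedged usage sketch for prepare_kiwi; the corpus path and the sample sentence are placeholders, and the follow-up analyze call assumes the same pre-0.10 kiwipiepy API used throughout these snippets.

# Hypothetical usage: extract words from a corpus, then analyze a sentence
kiwi = prepare_kiwi('corpus.txt')  # 'corpus.txt' is a placeholder path
tokens = kiwi.analyze('분석할 문장입니다.', 1)[0][0]  # best analysis candidate
print(tokens)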
from kiwipiepy import Kiwi, Option


class kiwi_dictionary_n_fuction:
    def __init__(self, path):
        self.kiwi = Kiwi(options=Option.LOAD_DEFAULT_DICTIONARY | Option.INTEGRATE_ALLOMORPH)
        self.kiwi.load_user_dictionary(path)
        self.kiwi.prepare()
        self.josa = [
            'JK', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC'
        ]

    def get_noun(self, sen):
        _, nn_list, _, _ = self.generate_morp_word(sen, 1)
        return nn_list

    # Return the whole sentence as a list, one space-separated token string per candidate
    def get_all_token(self, sen):
        morp_list, _, _, _ = self.generate_morp_word(sen, 1)
        return morp_list

    # Tokenize the whole sentence and return it as a string
    def get_token_str(self, sen):
        morp_list, _, _, _ = self.generate_morp_word(sen, 1)
        string = ''.join(morp_list)
        # if '\\' in self.string:
        #     self.string = self.string.translate({ord('\\'): '\\\\'})
        return string

    def get_vv(self, sen):
        _, _, vv_list, _ = self.generate_morp_word(sen, 1)
        return vv_list

    def get_nn_list(self, sen):
        _, nn_list, _, _ = self.generate_morp_word(sen, 1)
        return nn_list

    # Strip josa (particles) and return the rest as a string.
    # EX) 관찰 가능 하 고 처리 가능 하 ᆫ 범위 내 문장 입력 받 어 정해진 형태 출력 제한 되 ᆫ 시간 내 출력 하 어야 하 ᆫ다는 제약 적 용도 고려 하 ᆫ 관점 이 다 .
    def get_no_josa_token(self, sen):
        _, _, _, nosa_list = self.generate_morp_word(sen, 1)
        string = ''.join(nosa_list)
        return string

    # Return a list of (word, tag) tuples.
    # EX) [('관찰', 'NNG'), ('가능', 'NNG'), ('하', 'XSA'), ('고', 'EC'), ('처리', 'NNG'), ('가능', 'NNG'), ('하', 'XSA'), ('ᆫ', 'ETM'), ('범위', 'NNG')]
    def k_pos(self, sentence):
        tuple_list = []
        result = self.kiwi.analyze(sentence, 1)
        for i in result[0][0]:
            word, pos = i[0], i[1]
            tuple_list.append((word, pos))
        return tuple_list

    def k_analyze(self, sentence):
        return self.kiwi.analyze(sentence, 1)

    # Return just the surface forms as a list.
    # EX) ['관찰', '가능', '하', '고', '처리', '가능', '하' ... ..]
    def k_morphs(self, sen):
        token_list = []
        result = self.kiwi.analyze(sen, 1)
        for i in result[0][0]:
            token_list.append(i[0])
        return token_list

    # Extract morphemes from a sentence
    def generate_morp_word(self, sentence, analyze_num):
        try:
            result = self.kiwi.analyze(sentence, analyze_num)
            morp_word_list = []
            morp_nn_list = []
            morp_vv_list = []
            morp_not_josa_list = []
            for i in range(0, analyze_num):
                morp_word = ''
                morp_nn = ''
                morp_vv = ''
                morp_not_josa = ''
                nn = []
                for word in result[i][0]:
                    morp_word += word[0]
                    morp_word += ' '
                    if word[1] not in self.josa:
                        morp_not_josa += word[0]
                        morp_not_josa += ' '
                    if word[1] in ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL']:
                        morp_nn += word[0]
                        morp_nn += ' '
                        nn.append(word[0])
                    elif word[1] in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                        morp_vv += word[0]
                        morp_vv += ' '
                morp_word_list.append(morp_word)
                morp_nn_list.append(morp_nn)
                morp_vv_list.append(morp_vv)
                morp_not_josa_list.append(morp_not_josa)
            return morp_word_list, morp_nn_list, morp_vv_list, morp_not_josa_list
        except Exception as e:
            print(e)
            print("### ERROR: something seems wrong in the morphological analysis step ###")
            return [], [], [], []  # keep callers' four-way unpacking from crashing

    def __del__(self):
        print("EXIT kiwi")
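A minimal usage sketch for the class above; the user-dictionary path and sample sentence are placeholders.

# Hypothetical usage of kiwi_dictionary_n_fuction
kd = kiwi_dictionary_n_fuction('./userDict.txt')  # placeholder dictionary path
print(kd.k_pos('관찰 가능한 범위'))      # [(word, tag), ...]
print(kd.get_noun('관찰 가능한 범위'))   # space-joined noun strings, one per candidate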
import re

import hgtk
from kiwipiepy import Kiwi

# formaldic, informaldic, abnormaldic and Utils are project-local helpers
# (the dictionaries appear to live in dictionary.py, which addData() below rewrites)


class Changer(object):
    def __init__(self):
        try:
            self.kiwi = Kiwi()
            self.kiwi.prepare()
        except:
            print("[INFO] please install kiwipiepy ")
        self.replace = formaldic()
        self.utils = Utils()

    def dechanger(self, stc):
        """
        Change formal speech to informal.
        Args : stc (str)
        """
        pattern = r'하세요|이예요|이에요|에요|예요|시겠어요|죠|합니까|습니까'
        pattern = re.compile(pattern)
        result = []
        stc = self.utils._remove_blank(stc)
        stc = self.utils._clean_up_tokenization(stc)
        if len(re.findall(pattern, stc)) > 0:
            tokens = self.kiwi.analyze(stc.replace(" ", "|"))
            key = informaldic().keys()
            lk = list(key)
            key2 = abnormaldic().keys()
            ak = list(key2)
            tmp = []
            for token in tokens[0][0]:
                if token[:2] in lk:  # look up the value by key
                    token = informaldic().get(token[:2])
                if token[:2] in ak:
                    token = abnormaldic().get(token[:2])
                tmp.append(token)
            changed = ''
            for t in tmp:
                if isinstance(t[0], tuple):
                    for i in range(len(t[0])):
                        changed += hgtk.text.decompose(t[i][0])
                else:
                    changed += hgtk.text.decompose(t[0])
            one_char = re.compile('ᴥ[ㅂㄴㄹ]ᴥ')
            if one_char.search(changed):
                words = changed.split('ᴥ')
                for idx in range(1, len(words)):
                    # the preceding syllable has no final consonant (jongseong)
                    if len(words[idx]) == 1 and len(words[idx - 1].replace('|', "")) == 2:
                        # merge into the preceding syllable
                        words[idx - 1] = words[idx - 1] + words[idx]
                        words[idx] = ""
                    # it has one
                    elif len(words[idx]) == 1 and len(words[idx - 1].replace('|', "")) == 3:
                        shp = ['ㅆ', 'ㅍ', 'ㄱ', 'ㅄ', 'ㄶ']
                        ep = ['ㄹ']
                        if words[idx] == 'ㅂ' and len(words[idx - 1].replace('|', "")) == 3:
                            if words[idx - 1][-1] in shp:
                                if words[idx].count("|") > 0:
                                    words[idx] = "|습"
                                else:
                                    words[idx] = "습"
                                continue
                            else:
                                if words[idx].count("|") > 0:
                                    words[idx] = "|입"
                                else:
                                    words[idx] = "입"
                            # words[idx] = ""
                        elif words[idx] == 'ㄴ' and len(words[idx - 1].replace('|', "")) == 3 and words[idx - 1].endswith('ㄹ'):
                            if words[idx - 1].count("|") > 0:
                                words[idx - 1] = "|" + words[idx - 1].replace("|", "")[:2] + words[idx]
                            else:
                                words[idx - 1] = words[idx - 1][:2] + words[idx]
                            # remove it
                            words[idx] = ""
                        elif words[idx] == 'ㄹ':
                            if words[idx].count("|") > 0:
                                words[idx] = "|일"
                            else:
                                words[idx] = "일"
                changed = "ᴥ".join([x for x in words if x != ""]) + "ᴥ"
            # for cases that weren't covered
            changed = self._makePretty(changed)
            changed = hgtk.text.compose(changed).replace("|", " ")
            # exception handling
            try:
                if changed[-1] == '요':
                    changed = re.sub('요', '', changed)
                changed = re.sub('그렇죠', '', changed)
            except:
                pass
            result.append(changed)
        else:
            try:
                result.append(stc)
            except:
                pass
        return result[0]

    def _makePretty(self, line):
        """
        Convert the jaso orderings that weren't properly covered by the jaso
        restructuring in function Mal_Gillge_Haeraing.
        :param line: jaso ordering that wasn't properly covered
        :return: converted jaso ordering
        """
        test = line
        test = test.replace("ᴥㅎㅏᴥㅇㅏᴥ", "ᴥㅎㅐᴥ")
        test = test.replace("ㅎㅏᴥㅇㅏᴥㅇㅛᴥ", "ᴥㅎㅐᴥ")
        test = test.replace("ㅎㅏᴥㄴㅣᴥㄷㅏᴥ", "ㅎㅏㅂᴥㄴㅣᴥㄷㅏᴥ")
        test = test.replace("ㅎㅏᴥㅇㅏㅆᴥ", "ᴥㅎㅐㅆᴥ")
        test = test.replace("ㄴㅏᴥㅇㅏㅆᴥ", "ᴥㅎㅐㅆᴥ")
        test = test.replace("ㄱㅏᴥㅇㅏㅆᴥ", "ᴥㄱㅏㅆᴥ")
        test = test.replace("ㅇㅣᴥㄴㅣᴥ", "ᴥㄴㅣᴥ")
        test = test.replace("ㄴㅓㄹㄴᴥ", "ㄴㅓㄴᴥ")
        test = test.replace("ㄱㅡᴥㄹㅓㅎᴥㅇㅓᴥ", "ㄱㅡᴥㄹㅐᴥ")
        test = test.replace("ㅡᴥㅇㅏᴥ", "ㅏᴥ")
        test = test.replace("ㄱㅓㄹᴥㄴㅏᴥㅇㅛᴥ", "ㄱㅓㄴᴥㄱㅏᴥㅇㅛᴥ")
        return test

    def changer(self, text):
        """
        Change informal speech to formal speech.
        Args : text (str)
        """
        tokens = self.kiwi.analyze(text.replace(" ", "|"))
        key = formaldic().keys()
        key2 = abnormaldic().keys()
        lk = list(key)
        ak = list(key2)
        num = len(tokens[0][0])
        result = []
        for idx, token in enumerate(tokens[0][0]):
            if idx > int(num * 0.8):
                if token[:2] in lk:  # look up the value by key
                    token = formaldic().get(token[:2])
                    result.append(token)
                else:
                    if token[:2] in ak:
                        token = abnormaldic().get(token[:2])
                        result.append(token)
                    else:
                        result.append(token[:2])
            else:
                if token[:2] in ak:
                    token = abnormaldic().get(token[:2])
                    result.append(token)
                else:
                    result.append(token[:2])
        # change tuples to text
        changed = ''
        for t in result:
            if isinstance(t[0], tuple):
                for i in range(len(t[0])):
                    changed += hgtk.text.decompose(t[i][0])
            else:
                changed += hgtk.text.decompose(t[0])
        # restructure the sentence from the jaso ordering
        one_char = re.compile('ᴥ[ㅂㄴㄹ]ᴥ')
        if one_char.search(changed):
            words = changed.split('ᴥ')
            for idx in range(1, len(words)):
                # the preceding syllable has no final consonant (jongseong)
                if len(words[idx]) == 1 and len(words[idx - 1].replace('|', "")) == 2:
                    # merge into the preceding syllable
                    words[idx - 1] = words[idx - 1] + words[idx]
                    words[idx] = ""
                # it has one
                elif len(words[idx]) == 1 and len(words[idx - 1].replace('|', "")) == 3:
                    shp = ['ㅆ', 'ㅍ', 'ㄱ', 'ㅄ', 'ㄶ']
                    ep = ['ㄹ']
                    if words[idx] == 'ㅂ' and len(words[idx - 1].replace('|', "")) == 3:
                        if words[idx - 1][-1] in shp:
                            if words[idx].count("|") > 0:
                                words[idx] = "|습"
                            else:
                                words[idx] = "습"
                            continue
                        else:
                            if words[idx].count("|") > 0:
                                words[idx] = "|입"
                            else:
                                words[idx] = "입"
                        # words[idx] = ""
                    elif words[idx] == 'ㄴ' and len(words[idx - 1].replace('|', "")) == 3 and words[idx - 1].endswith('ㄹ'):
                        if words[idx - 1].count("|") > 0:
                            words[idx - 1] = "|" + words[idx - 1].replace("|", "")[:2] + words[idx]
                        else:
                            words[idx - 1] = words[idx - 1][:2] + words[idx]
                        # remove it
                        words[idx] = ""
                    elif words[idx] == 'ㄹ':
                        if words[idx].count("|") > 0:
                            words[idx] = "|일"
                        else:
                            words[idx] = "일"
            changed = "ᴥ".join([x for x in words if x != ""]) + "ᴥ"
        # for cases that weren't covered
        changed = self._makePretty(changed)
        changed = hgtk.text.compose(changed).replace("|", " ")
        return changed

    def addData(self, key, val):
        """
        Add new data to the dictionary and update the changer dictionary.
        :param key: key to be added to the dictionary self.replace
        :param val: value to be added to the dictionary self.replace
        :return: None
        """
        with open('dictionary.py', 'r', encoding='utf-8') as f:
            data = f.read()
        lines = data.split("\n")
        lines[-2] += ','
        lines[-1] = " " + str(key) + ": " + str(val)
        with open('dictionary.py', 'w', encoding='utf-8') as f:
            for i in range(len(lines)):
                f.write(lines[i] + "\n")
            f.write(" }")

    def checker(self, result):
        """
        Check for abnormal sentences and remove them.
        Args : result : list
        Returns : updated, idxes : list
        """
        updated = []
        idxes = []
        normal = ['요', '까', '다', '죠', '가']
        for idx, stc in enumerate(result):
            try:
                if stc[-1] not in normal:
                    print(f"[INFO] Abnormal Sentence, remove {idx}....")
                    idxes.append(idx)
                else:
                    updated.append(stc)
            except:
                idxes.append(idx)
        return updated, idxes
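A usage sketch for Changer, assuming the project-local dictionaries and Utils are importable; the sample sentences are placeholders.

# Hypothetical round trip between informal and formal speech
c = Changer()
formal = c.changer('오늘 뭐 해')          # informal -> formal
informal = c.dechanger('오늘 뭐 하세요')  # formal -> informal
print(formal, informal)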
data['token'].isnull().sum()

# Where overview has null values, drop them by filling with an empty string
data['token'] = data['token'].fillna('')

#%% Build training vectors from the questions after morphological analysis
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['token'])  # run tf-idf over the overview column
print(tfidf_matrix.shape)

# %%
from kiwipiepy import Kiwi

kiwi = Kiwi()
kiwi.load_user_dictionary(r'./userDict.txt')
kiwi.prepare()

def generate_morp_word(sentence, analyze_num):
    try:
        result = kiwi.analyze(sentence, analyze_num)
        print(result)
        morp_word_list = []
        morp_nn_list = []
        morp_vv_list = []
        for i in range(0, analyze_num):
            morp_word = ''
            morp_nn = ''
            morp_vv = ''
            for word in result[i][0]:
                morp_word += word[0]
                morp_word += ' '
                if word[1] in ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL']:
                    morp_nn += word[0] + ' '
                elif word[1] in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                    morp_vv += word[0] + ' '
            morp_word_list.append(morp_word)
            morp_nn_list.append(morp_nn)
            morp_vv_list.append(morp_vv)
        return morp_word_list, morp_nn_list, morp_vv_list
    except Exception as e:
        print(e)
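A sketch of how generate_morp_word could populate the 'token' column consumed by the TfidfVectorizer above; the 'question' source column and the choice to use the noun string as document text are assumptions.

# Hypothetical: build the token column from raw questions
data['token'] = data['question'].apply(
    lambda s: generate_morp_word(s, 1)[1][0]  # space-joined nouns of the best analysis
)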
from kiwipiepy import Kiwi, Option
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import codecs

tagger = Kiwi()
tagger.prepare()

# def flat(content):
#     return ["{}/{}".format(word, tag) for word, tag in tagger.pos(content)]

rsc = r'E:\Programming\python\창회선배스터디\Morpheme_Cloud\자료\토지2.txt'
target_corpus = codecs.open(rsc, 'r', encoding='utf-8')

# Tag the text
tagged_temp = []
# with open(rsc, 'r', encoding="utf8") as kr_f:
#     for line in kr_f:
#         line = line.strip()
#         tagged_temp += flat(line)
for i in target_corpus:
    i = i.strip()
    temp_tagging = [x[0] for x in tagger.analyze(i, top_n=1)]
    # the last two token fields are position info, not scores
    inner_temp = ["{}/{}".format(word, tag) for word, tag, _, _ in temp_tagging[0]]
    tagged_temp.append(tuple(inner_temp))

print(tagged_temp[:3])
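The Counter and WordCloud imports at the top of this snippet suggest the tagged tokens feed a word cloud; a hedged sketch of that continuation, in which the noun-only tag filter and the font path are assumptions.

# Hypothetical continuation: count noun tokens and render a word cloud
freq = Counter()
for sent in tagged_temp:
    for tok in sent:
        word, tag = tok.rsplit('/', 1)  # split "word/tag" from the right
        if tag in ('NNG', 'NNP'):       # assumed noun-only filter
            freq[word] += 1

wc = WordCloud(font_path='NanumGothic.ttf',  # placeholder Korean font path
               background_color='white', width=800, height=800)
plt.imshow(wc.generate_from_frequencies(freq), interpolation='bilinear')
plt.axis('off')
plt.show()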