from khaiii import KhaiiiApi

api = KhaiiiApi()
# Read 10 sentences and print the morph-analysis part of each analyzed word.
# str(word) is "<surface>\t<morph analysis>", so field [1] is the analysis.
for _ in range(10):
    # input() already returns str — the original wrapped it in a redundant str().
    sentence = input()
    tags = [str(word).split('\t')[1] for word in api.analyze(sentence)]
    print(tags)
    def __init__(self, df):
        """Hold a DataFrame and prepare Khaiii analysis accumulators.

        :param df: input data (presumably rows of text to analyze — TODO confirm)
        """
        self.df = df
        # Parallel accumulators: analyzed lexemes and their POS tags.
        self.lex = []
        self.tag = []
        # One Khaiii analyzer instance per object.
        self.api = KhaiiiApi()
class Tag_parser:
    """Walk a BeautifulSoup tree, pairing headline-ish tags with their text.

    Builds `self.titles` (current headline context) and `self.contents`
    (mapping from a tuple-ized snapshot of titles to the text found under it).
    """

    def __init__(self, soup, url):
        # Stack of tag names seen on the way down the tree.
        self.tags = []
        # Current headline context: tag name -> headline text.
        self.titles = {}
        # {tuple-ized titles snapshot: [content text]}
        self.contents = {}
        # Children containing any of these substrings are skipped outright.
        self.stopwords = ['<!', 'script', 'function', '#']
        # Khaiii with explicit shared-library and resource paths.
        self.api = KhaiiiApi(
            '/home/hwang/khaiii/khaiii/build/lib/libkhaiii.so.0.4',
            '/home/hwang/khaiii/khaiii/build/share/khaiii')
        #self.tables=table_reader.get_all_tables(soup)
        #print(url,self.tables)
        self.table_count = 0
        '''
        self.url = 'http://hosp.ajoumc.or.kr/MedicalInfo/HospitalRoomGuide.aspx'
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome('./chrome/chromedriver_linux64/chromedriver', chrome_options=chrome_options)
        driver.implicitly_wait(3)
        driver.set_page_load_timeout(100)
        driver.get(self.url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        '''
        # Side effect: the whole parse happens during construction.
        self.recursiveChildren(soup)

    def isstopWord(self, args):
        """True when `args` should be skipped: stopword substring, bare
        newline, or a bs4 Comment node."""
        for word in self.stopwords:
            if word in args or '\n' == args or str(
                    type(args)
            ) == "<class 'bs4.element.Comment'>":  # ignore comment
                return True
        return False

    def imgTagparse(self, args):
        """Return an <img> tag's alt text, or '' when absent."""
        if 'alt' in args.attrs.keys():
            return args.attrs['alt']
        else:
            return ""

    def dictvalue_to_list(self, dicts):
        """Freeze a dict's values into a hashable tuple-of-tuples (string
        values are whitespace-split first; None becomes an empty tuple)."""
        res_list = []
        for v in dicts.values():
            if v is None:
                v = []
            elif str(
                    type(v)) == "<class 'bs4.element.NavigableString'>" or str(
                        type(v)) == "<class 'str'>":
                v = list(v.strip().split())
            res_list.append(tuple(v))
        return tuple(res_list)

    def extract_words(self, text):
        """Return noun lexemes (tags containing 'NN') found in `text`.

        NOTE(review): returns None (falls off the end) when no noun is found —
        callers receive None rather than []; confirm that is intended.
        """
        temp = []
        if text == '':
            return temp
        for word in self.api.analyze(text):
            for morph in word.morphs:
                if 'NN' in morph.tag:
                    temp.append(morph.lex)
        if len(temp):
            return temp

    def parents_name(self, link, tag):
        """True when any ancestor of `link` has a name listed in `tag`."""
        name = []
        if (type(link) == type('')):
            return False
        for p in link.parents:
            name.append(p.name)
        if (set(tag) & set(name)):
            return True
        else:
            return False

    def recursiveChildren(self, x):
        """Iterate every descendant of `x`, filling titles/contents.

        NOTE(review): reconstructed from whitespace-mangled source — the
        placement of the trailing `self.tags.pop(-1)` is a best guess.
        """
        try:
            for child in x.recursiveChildGenerator():
                if self.isstopWord(child):
                    continue
                name = getattr(child, "name", None)
                if name == 'img':
                    # Replace the img node by its alt text and treat it as text.
                    child = self.imgTagparse(child)
                    self.tags.append(name)
                    name = None
                if name is not None:
                    if 'li' == name:
                        ##### insertion to contents dict code
                        self.titles['word_from_contents'] = self.extract_words(
                            child.get_text().strip())
                        self.contents[self.dictvalue_to_list(
                            self.titles)] = [child.get_text().strip()]
                        ######
                    # elif 'table' == name:
                    #     ############ write code here . ########################
                    #     temp_table=self.tables[self.table_count]
                    #     self.table_count+=1
                    #     if not temp_table:
                    #         continue
                    #     for line in temp_table:
                    #         try:
                    #             self.titles['word_from_contents'] = self.extract_words(line)
                    #             self.contents[self.dictvalue_to_list(self.titles)] = [line]
                    #             #print(self.titles,' : ', self.contents[self.dictvalue_to_list(self.titles)])
                    #         except:
                    #             pass
                    #     pass
                    else:
                        self.tags.append(name)
                else:
                    if child.isspace() or len(
                            self.tags
                    ) == 0 or child == '':  # leaf node, don't print spaces or non-tag
                        continue
                    else:
                        # li/table subtrees are handled at their parent tag.
                        if self.parents_name(child, ['li', 'table']):
                            continue
                        if 'h' in self.tags[-1] or 'img' in self.tags[
                                -1]:  # or 'span' in self.tags[-1]:
                            # append headline
                            if 'img' in self.tags[-1] and 'h' in self.tags[
                                    -2]:  # img tag in headline
                                self.titles[self.tags[-2]] = child
                            elif 'h' in self.tags[-1]:  # just headline
                                self.titles[self.tags[-1]] = child
                        else:
                            self.titles[
                                'word_from_contents'] = self.extract_words(
                                    child)
                            self.contents[self.dictvalue_to_list(
                                self.titles)] = [
                                    child.strip()
                                ]  # set contents {title : contents}
                            print(
                                self.titles, ' : ',
                                self.contents[self.dictvalue_to_list(
                                    self.titles)])
                        if len(self.tags):
                            self.tags.pop(-1)
        except Exception as ex:
            print("error ", ex)
            return
class Tokenizer:
    """Khaiii-based keyword tokenizer for music-domain Korean/English text.

    Lower-cases input, applies domain replacements (e.g. '락' -> 'rock'),
    strips everything but digits/lowercase ASCII/Hangul, analyzes with
    Khaiii, merges compound nouns, and filters stopwords.
    """

    # (current tag, previous tag) pairs whose previously-APPENDED keyword is
    # popped and concatenated with the current lexeme (compound nouns/roots).
    # Replaces the original's eight identical elif branches.
    _MERGE_POP = frozenset({
        ('NNG', 'NNG'), ('NNG', 'NNP'), ('NNP', 'NNG'), ('NNP', 'NNP'),
        ('NNG', 'XR'), ('NNP', 'XR'), ('XR', 'NNG'), ('XR', 'NNP'),
    })
    # Tags kept as stand-alone keywords regardless of length.
    _KEEP = frozenset({'NNG', 'NNP', 'SL', 'XR'})
    # Tags kept only when the lexeme is at least two characters long.
    _KEEP_LONG = frozenset({'VV', 'VA', 'SN'})

    def __init__(self):
        self._api = KhaiiiApi()
        # 불용어 정의 (stopwords removed from the final keyword set)
        self._stopwords = [
            '말', '곡', '때', '음악', '노래', 'a', 'an', 'the', 'in', 'on', 'at',
            'by', 'of'
        ]
        # 대체어 (regex pattern, replacement) pairs applied before analysis
        self._alternative = [
            ('k-pop', 'kpop'),
            ('k팝', 'kpop'),
            ('j-pop', 'jpop'),
            ('r&b', 'rnb'),
            ('알앤비', 'rnb'),
            ('락', 'rock'),
            ('재즈', 'jazz'),
            ('째즈', 'jazz'),
            ('힙합', 'hiphop'),
            ('hip-hop', 'hiphop'),
            ('hip-hap', 'hiphop'),
            ('클래식', 'classic'),
            ('발라드', ' 발라드 '),
            ('라붐', 'laboum'),
            ('뉴에이지', 'newage'),
        ]

    def tokenize(self, sentence):
        """Return de-duplicated keyword lexemes extracted from `sentence`."""
        clean_sentence = sentence.lower()
        # Apply domain replacements.
        for words in self._alternative:
            clean_sentence = re.sub(words[0], words[1], clean_sentence)
        # Keep only digits / lowercase ASCII / Hangul (drops ㅋ etc.).
        clean_sentence = re.sub('[^0-9a-z가-힣]', ' ', clean_sentence)
        morphs = []
        try:
            for word in self._api.analyze(clean_sentence):
                morphs.extend(self._word_tokenize(word))
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            morphs.clear()
            #print('[WARNING] Khaiii can not tokenize...({})'.format(sentence))
        # Drop stopwords; the set comprehension de-duplicates.
        keyword = {lex for lex, _ in morphs if not lex in self._stopwords}
        return list(keyword)

    def _word_tokenize(self, word):
        """Extract keyword (lex, tag) pairs from one Khaiii-analyzed word.

        Adjacent noun/root morphemes are merged into compound keywords;
        verbs/adjectives/numbers are kept only when longer than one char.
        """
        morphs = []
        prev_lex = ''
        prev_tag = ''
        for morph in word.morphs:
            tag, lex = morph.tag, morph.lex
            if (tag, prev_tag) in self._MERGE_POP:
                # Compound noun/root: merge with the previous keyword
                # (safe to pop — every merge-eligible prev tag was appended).
                morphs.append((morphs.pop()[0] + lex, tag))
            elif tag in ('NNG', 'NNP') and prev_tag == 'IC':
                # Interjection + noun: the interjection was never appended,
                # so concatenate its raw lexeme instead of popping.
                morphs.append((prev_lex + lex, tag))
            elif tag in self._KEEP:
                morphs.append((lex, tag))
            elif tag in self._KEEP_LONG and len(lex) > 1:
                morphs.append((lex, tag))
            elif tag == 'NNB' and prev_tag == 'SN':
                # Number + bound noun, e.g. '2000' + '년대'.
                morphs.append((prev_lex + lex, tag))
            prev_lex = lex
            prev_tag = tag
        return morphs
class SoraModule(Module):
    """Magic-8-ball style cat bot: answers questions addressed to '여름아'."""

    # Generic fallback answers.
    contents = [
        "그걸 말이라고 하냥!?",
        "당연하다냥!",
        "안 된다냥..",
        "언젠가는 될 거다냥!",
        "다시 한번 물어봐냥!",
        "된다냥!",
    ]
    # Context-specific answer pools keyed by matched context name.
    predefined_content = {
        "당첨": ["그럴 수도 있을 것 같다냥!", "그게 된다고 생각하냥?"],
        "먹다": ["그래도 되긴 하지만... 살이 찌지 않을까냥?", "맛있겠다냥!!!ㅜ"],
        "맛있": ["맛있겠다냥!!!ㅜ", "난 싫다냥!", "그건 좋다냥!"],
        "하다": ["괜찮다냥!"],
        "사다": ["돈이 없다냥!"],
        "호불호": ["좋다냥!", "싫다냥!", "완전 좋다냥!!", "완전 싫다냥!!!"],
    }
    # POS-class initials, in priority order; a matched tag must contain one.
    required_morph_types = "NVMJEXS"
    # (lexeme, Khaiii POS tag, context name) triples to match against morphs.
    tags = [
        ("가", "VV", "가다"),
        ("싶", "VX", "싶다"),
        ("먹", "VV", "먹다"),
        ("당첨", "NNG", "당첨"),
        ("쫒", "NNG", "당첨"),
        ("하", "VX", "하다"),
        ("사", "NNG", "사다"),
        ("맛있", "VA", "맛있다"),
        ("좋아", "IC", "호불호"),
        ("좋아하", "VV", "호불호"),
        ("싫어", "IC", "호불호"),
        ("싫어하", "VV", "호불호"),
    ]
    api = KhaiiiApi()

    @classmethod
    def if_match_tag(cls, morph):
        """Return the (lex, tag, context) triple matching `morph`, or None."""
        for tag in cls.tags:
            if morph.lex == tag[0] and morph.tag == tag[1]:
                return tag
        return None

    async def on_message(self, message: discord.Message) -> bool:
        if message.content.startswith("여름아") and message.content.endswith("?"):
            # BUGFIX: str.lstrip("여름아") strips any leading run of the
            # characters 여/름/아 (a character SET, not a prefix), so e.g.
            # "여름아아 ...?" lost extra characters. Slice the exact prefix
            # off instead — startswith() above guarantees it is present.
            question = message.content[len("여름아"):]
            ss = self.api.analyze(question)
            contexts = [
                # (required_morph_type, '가다'),
            ]
            for required_morph_type in self.required_morph_types:
                for s in ss:
                    for morph in s.morphs:
                        match = self.if_match_tag(morph)
                        if match is None:
                            continue
                        lex, tag, context = match
                        if context and required_morph_type in tag:
                            contexts.append((
                                required_morph_type,
                                context,
                            ))
                            break
            print((message.content, " ".join([str(s) for s in ss]), contexts))
            # Random context key weighted by how often its morphs matched.
            choices = [c[1] for c in contexts]
            key = random.choice(choices) if len(choices) > 0 else ""
            content = random.choice(
                self.predefined_content.get(key, self.contents))
            await message.channel.send("<@{}> {}".format(
                message.author.id, content))
        return False
from khaiii import KhaiiiApi


def Analyze(self, text, SEP=' + '):
    """Analyze `text` with Khaiii and join each word's morph string with SEP.

    Intended to be attached to KhaiiiApi as a method (see __main__ below).
    """
    analyzed = self.analyze(text)
    return SEP.join(str(word).split('\t')[1] for word in analyzed)


if __name__ == '__main__':
    khai3 = KhaiiiApi()
    # Monkey-patch the helper onto the KhaiiiApi class itself.
    setattr(khai3.__class__, 'Analyze', Analyze)
    print(khai3.Analyze('아버지가방에들어가신다.'))
    # 아버지/NNG + 가/JKS + 방/NNG + 에/JKB + 들어가/VV + 시/EP + ㄴ다/EF + ./SF
import pandas as pd # CSV 파일을 읽고 다루기 위한 Pandas API import unittest # 공백 문자열을 '+" 문자로 바꾸기 위해 사용된 Unittest API from khaiii import KhaiiiApi # 형태소 분석기 Khaiii API from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackQueryHandler # 갱신된 정보를 다루는 Updater 객체, 정해진 Command를 다루는 CommandHandler 객체, 메세지를 다루는 MessageHandler, Text를 필터링해주는 Filters 객체, 어떤 행위에 대한 사용자의 대답을 다루는 CallbackQueryHandler from telegram import InlineKeyboardButton, InlineKeyboardMarkup # Telegram Button Interface를 구현가능한 InlineKeyboardButton, InlineKeyboardMarkup 객체 my_token = '846490622:AAHzkPwpgOlnpJJKCn_5oWn3hcV4EIXl3-U' # Telegram Music Kim Bot의 token 값 bot = telegram.Bot(token=my_token) # 해당 token으로 Bot 생성 logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO) logger = logging.getLogger(__name__) # 기본 Logging 형식을 설정 api = KhaiiiApi() # 형태소 분석기 KhaiiiApi() sentiment_data = pd.read_csv( "KoreanSample.csv", encoding='CP949') # 감성어 기분석 사전인 KoreanSample.csv 파일을 CP949 인코딩 형식으로 읽어옴 music_data = pd.read_csv( "SpotifyFeatures.csv" ) # Spotify Music Big Data인 SpotifyFeatures.csv 파일을 읽어옴 final_mdf = pd.read_csv( "Ranged_SpotifyFeatures.csv" ) # Clustering된 Spotify Music Big Data인 Ranged_SpotifyFeatures.csv 파일을 읽어옴 def start(bot, update): # Bot에 /start Command 입력 혹은 Bot에 처음 시작하기를 클릭할 때 # Button Interface를 형성하기 위해 각 Button에 Callback Data를 연결시킨 후 Markup 해줌 show_list = []
from khaiii import KhaiiiApi

tokenizer = KhaiiiApi()
# Collect surface forms: each morph stringifies as "lex/tag"; keep the lex.
tokens = []
for word in tokenizer.analyze("아버지가방에들어가신다"):
    for morph in word.morphs:
        tokens.append(str(morph).split("/")[0])
#!/usr/bin/python import util from khaiii import KhaiiiApi import re path = "../data/191017/1630/D_K_03" filelist = util.get_filelist(path) file = filelist[0] with open(file, 'r', encoding='utf-8') as fp: khaiii = KhaiiiApi() strList = [] while True: line = fp.readline() if not line: break if line == "\n": continue # 전처리 잘못 된 부분이 있어서 임시로 넣음. 추후 빼도 무관. line = re.sub("\xa0", " ", line).strip() if line == "": continue # 나중에 고도화 및 모듈화 해야할 곳 ############################################################## #형태소 분석 전 전처리 작업 (형태소 분석에 악영향을 주는 기호 삭제)
def khaiii_tokenizer(sentence, tokenizer=None):
    """Tokenize `sentence` with Khaiii (stub — body not yet implemented).

    Args:
        sentence: raw text to tokenize.
        tokenizer: optional KhaiiiApi instance; created lazily when omitted.

    BUGFIX: the original default ``tokenizer=KhaiiiApi()`` was evaluated once
    at import time — an import side effect and a shared-default pitfall.
    A None sentinel defers construction to call time.
    """
    if tokenizer is None:
        tokenizer = KhaiiiApi()
from khaiii import KhaiiiApi
import re

# Greedy patterns for parenthesized / bracketed spans to strip.
# NOTE(review): `.+` is greedy, so "(a) b (c)" matches as one span — confirm
# that is intended (a lazy `.+?` would match each span separately).
p = re.compile('\(.+\)')
p2 = re.compile('\[.+\]')
api = KhaiiiApi()
pre_data = open('pre.txt', 'r', encoding='utf-8')
post_data = open('post.txt', 'r', encoding='utf-8')
output_data = open('preprocessed_data.txt', 'w', encoding='utf-8')
pre_data_lines = pre_data.readlines()
post_data_lines = post_data.readlines()
cnt = 0
for pre_data_line in pre_data_lines:
    # First two whitespace-separated fields form the alignment key.
    pre_key = ' '.join(pre_data_line.split(' ')[0:2])
    # Resume scanning post lines from where the previous match left off.
    for post_data_line in post_data_lines[cnt:]:
        post_key = ' '.join(post_data_line.split(' ')[0:2])
        if pre_key == post_key:
            cnt += 1
            pre_data_line = pre_data_line.replace('\n', '')
            post_data_line = post_data_line.replace('\n', '')
            # Remainder of the line (after the 2-field key) is the sentence.
            pre_sent = ' '.join(pre_data_line.split(' ')[2:])
            post_sent = ' '.join(post_data_line.split(' ')[2:])
            # Remove parenthesized spans from the "pre" sentence.
            maches = re.findall(p, pre_sent)
            for mach in maches:
                pre_sent = pre_sent.replace(mach, '')
            # Remove bracketed spans.
            maches = re.findall(p2, pre_sent)
            for mach in maches:
                # (chunk truncated here — loop body continues beyond this view)
class MorphAnalyzer():
    """Khaiii wrapper: analyze text and heuristically extract long keywords."""

    # Shared Khaiii analyzer (class attribute, created at import time).
    api = KhaiiiApi()

    def morphAnalyze(self, content):
        """Run Khaiii on `content`; return a list of [lexeme, tag] pairs."""
        result = list()
        #print(content,'\n')
        for word in self.api.analyze(content):
            for morph in word.morphs:
                result.append([morph.lex, morph.tag])
        return result

    def morphKeywording(self, content):
        """Extract keyword strings from morphAnalyze() output.

        Mutates `content` in place (noun tags collapsed to 'NN'), groups
        consecutive same-tag morphs, then applies pattern rules and filters.
        """
        keyword = list()
        for word in content:
            # Collapse common/proper/bound noun tags into a single 'NN'.
            if (word[1] in ['NNG', 'NNP', 'NNB']):
                word[1] = 'NN'
        for word in content:
            # Single noun of 5+ characters. (translated comment)
            if (word[1] == 'NN' and len(word[0]) >= 5):
                keyword.append(word[0])
        group = list()
        for k, g in groupby(content, lambda x: x[1]):
            # Groupby -> [(tag, [words...]), (tag, [words...]), ...]
            listg = [x[0] for x in list(g)]
            group.append((k, listg))
        #print("Iter Group :",group)
        for word in group:
            # Compound-noun run: an 'NN' group with 5+ members. (translated)
            if (word[0] == 'NN' and len(word[1]) >= 5):
                keyword.append(word[1])
        for index in range(len(group) - 2):
            # noun + 의/와/과 + noun; noun + adnominal suffix + noun.
            # NOTE(review): group[i][1] is a LIST of lexemes, so the `in`
            # membership test against single characters can only succeed in
            # unusual cases — confirm whether group[index + 1][1][0] was meant.
            if (group[index][0] == 'NN' and group[index + 2][0] == 'NN'):
                if (group[index + 1][1] in ['적', '화', '의', '와', '과']):
                    keyword.append(group[index][1] + group[index + 1][1] +
                                   group[index + 2][1])
        for index in range(len(group) - 3):
            # noun + sentiment adjective + noun; noun + verbal suffix + noun.
            if (group[index][0] == 'NN' and group[index + 3][0] == 'NN'):
                if (group[index + 1][0] in ['VA', 'XSV', 'XSA']):
                    keyword.append(group[index][1] + group[index + 1][1] +
                                   group[index + 2][1] + group[index + 3][1])
        for index in range(len(group) - 2):
            # sentiment adjective + adnominal ending + noun. (translated)
            if (group[index][0] == 'VA' and group[index + 1][0] == 'ETM'
                    and group[index + 2][0] == 'NN'):
                keyword.append(group[index][1] + group[index + 1][1] +
                               group[index + 2][1])
        for index, word in enumerate(keyword):
            # Merge list-valued keywords into single strings. (translated)
            if type(word) == list:
                merge = ''
                for i in word:
                    merge += i
                keyword[index] = merge
        del_list = list()
        append_list = list()
        for word in keyword:
            # Keyword filtering. (translated)
            if word[-1] in ['것', '수']:
                del_list.append(word)
            elif word[3:5] == '의원':
                # Drop "...의원..." keywords but keep their tail.
                del_list.append(word)
                append_list.append(word[5:])
            elif '.' in word or '․' in word or '․' in word:
                # NOTE(review): the last two characters look identical
                # (likely the same codepoint twice) — possibly a duplicate
                # condition; left untouched.
                del_list.append(word)
        for word in del_list:
            keyword.remove(word)
        for word in append_list:
            keyword.append(word)
        del_list = list()
        for word in keyword:
            # Final length filter: keep only keywords of 5+ characters.
            if len(word) < 5:
                del_list.append(word)
        for word in del_list:
            keyword.remove(word)
        return keyword
class NewsProcessor:
    """Load news (.xlsx) and price (.csv) files, clean/tokenize article text
    with Khaiii, and build market-relative binary labels."""

    def __init__(self,
                 news_path='/home/ir1067/FOR_TITLE/Title_33_2014_all',
                 fin_path='/home/ir1067/price_w_indicator',
                 kospi_data_path="/home/ir1067/data/kospi.csv"):
        # path
        self.fin_path = fin_path

        # .xlsx file -> news text data ('#' excludes editor temp files)
        self.xlsx_list = [
            name for name in os.listdir(news_path)
            if ('.xlsx' in name) & ('#' not in name)
        ]
        self.xlsx_list.sort()

        # company names (filename minus the '.xlsx' suffix)
        self.company_list = set([file[:-5] for file in self.xlsx_list])

        # .csv file -> financial data [open, close, price]
        self.csv_list = [
            name for name in os.listdir(fin_path)
            if ('.csv' in name) & ('#' not in name)
        ]
        self.csv_list.sort()

        # kospi market data [open, close]
        self.kospi = pd.read_csv(kospi_data_path).set_index('date')
        self.kospi.index = pd.to_datetime(self.kospi.index)
        self.kospi = self.kospi[['open', 'close']]
        # Strip thousands separators before numeric conversion.
        self.kospi['open'] = [
            re.sub(',', '', text) for text in self.kospi['open']
        ]
        self.kospi['close'] = [
            re.sub(',', '', text) for text in self.kospi['close']
        ]
        self.kospi.open = self.kospi.open.astype(float)
        # NOTE(review): `close` is assigned from `open` here — looks like a
        # copy-paste bug (probably meant self.kospi.close.astype(float)).
        # TODO confirm before relying on close prices.
        self.kospi.close = self.kospi.open.astype(float)

        # Khaiii API
        self.khaiii = KhaiiiApi()

        # print info
        print("Data file infomation")
        print("- News data (xlsx):\t{}".format(len(self.xlsx_list)))
        print("- Price data (csv):\t{}".format(len(self.csv_list)))
        print("- Company count:\t{}".format(len(self.company_list)))
        print("NewsProcessor init complete.")

    def get_xlsx(self, company_name):
        """Concatenate all xlsx news files whose name contains
        `company_name`; return a date-indexed DataFrame."""
        output = pd.DataFrame()
        # xlsx files containing company name
        data_list = [
            filename for filename in self.xlsx_list
            if company_name in filename
        ]
        for filename in data_list:
            # NOTE(review): read_excel() gets the bare filename — works only
            # when the CWD is news_path. TODO confirm.
            news = pd.read_excel(filename, index_col=0)
            output = pd.concat([output, news])
        output.reset_index(inplace=True, drop=True)
        print("Data NaN infomation")
        for col in output.columns:
            # Empty strings are treated as missing values.
            output[col] = [
                text if text != "" else np.nan for text in output[col]
            ]
        print(output.isna().sum())
        output['date'] = pd.to_datetime(output['date'], format="%Y.%m.%d")
        output.set_index('date', drop=True, inplace=True)
        return output

    def get_csv(self, company_name):
        """Concatenate all csv price files whose name contains
        `company_name`; return a date-indexed DataFrame."""
        output = pd.DataFrame()
        # csv files containing company name
        data_list = [
            filename for filename in self.csv_list if company_name in filename
        ]
        for filename in data_list:
            # NOTE(review): plain string concatenation — assumes fin_path
            # ends with '/' (or filename carries it). TODO confirm.
            price = pd.read_csv(self.fin_path + filename, index_col=0)
            output = pd.concat([output, price])
        output.index = pd.to_datetime(output.index, format="%Y.%m.%d")
        return output

    def clean_text(self, data):
        """Strip boilerplate/markup from 'title' and 'contents' in place.

        Consider also removing ⓒ~, copyright lines, reporter bylines.
        """
        data['title'] = [re.sub('\[.+?\]', '', text, 0, re.I | re.S).strip() \
            for text in data['title']]
        data['title'] = [
            text if text != '' else np.nan for text in data['title']
        ]
        # Remove the flash-workaround JS snippet embedded in articles.
        data['contents'] = [text.replace("// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback() {}", "") \
            for text in data['contents']]
        data['contents'] = [re.sub('\(.+?\)', '', text, 0, re.I | re.S).strip() \
            for text in data['contents']]
        data['contents'] = [re.sub('{.+?}', '', text, 0, re.I | re.S).strip() \
            for text in data['contents']]
        data['contents'] = [re.sub('\[.+?\]', '', text, 0, re.I | re.S).strip() \
            for text in data['contents']]
        data['contents'] = [re.sub('<.+?>', '', text, 0, re.I | re.S).strip() \
            for text in data['contents']]
        data['contents'] = [re.sub('<.+?>', '', text, 0, re.I | re.S).strip() \
            for text in data['contents']]
        # ▶ one article starts with this marker (kept disabled):
        #data['contents'] = [re.sub('▶.*', '', text, 0, re.I | re.S).strip().replace(",", "") \
        #    for text in data['contents']]
        print("Check NaN")
        print(data.isna().sum())

    def drop_empty(self, data):
        """Drop rows with any NaN or empty 'contents' (mutates `data`)."""
        print("Data length before drop: ", len(data))
        data.dropna(inplace=True, how='any')
        data.drop(data[data['contents'] == ''].index, inplace=True)
        # NOTE(review): reset_index with inplace=False rebinds only the local
        # name — the caller's frame keeps the old index. TODO confirm intent.
        data = data.reset_index(drop=True)
        #index = [news.index for news in data['contents'] if news == '']
        #for i in index:
        #    data.drop([data.index[i]], inplace= True)
        print("Data length after drop: ", len(data))

    def tokenizing(self, data, tag):
        """Khaiii-analyze 'contents'; keep lexemes whose POS is in `tag`."""
        if type(tag) == list:
            try:
                print("Start Khaiii analyze")
                after_analyze = [
                    self.khaiii.analyze(news) for news in data['contents']
                    if news != ''
                ]
                print("Done")
                tokenized = [[morph.lex for chunk in news for morph in chunk.morphs \
                    if morph.tag in tag] for news in after_analyze]
                # Empty token lists become NaN markers.
                tokenized = [
                    text if text != [] else np.nan for text in tokenized
                ]
                # NOTE(review): `text == np.nan` is always False (NaN never
                # equals NaN), so this count is always 0 — likely meant
                # `text is np.nan`. TODO confirm.
                is_empty = [1 if text == np.nan else 0 for text in tokenized]
                print("Empty list after tokenizing: {}".format(sum(is_empty)))
                return tokenized
            except KeyError as e:
                print(e, "DataFrame does not have 'contents' column")
        else:
            print("Error: parameter 'tag' must be list")

    def tokenizing_title(self, data, tag):
        """Same as tokenizing(), but over the 'title' column."""
        if type(tag) == list:
            try:
                print("Start Khaiii analyze")
                after_analyze = [
                    self.khaiii.analyze(news) for news in data['title']
                    if news != ''
                ]
                print("Done")
                tokenized_title = [[morph.lex for chunk in news for morph in chunk.morphs \
                    if morph.tag in tag] for news in after_analyze]
                tokenized_title = [
                    text if text != [] else np.nan
                    for text in tokenized_title
                ]
                # NOTE(review): same always-False `== np.nan` check as above.
                is_empty = [
                    1 if text == np.nan else 0 for text in tokenized_title
                ]
                print("Empty list after tokenizing: {}".format(sum(is_empty)))
                return tokenized_title
            except KeyError as e:
                print(e, "DataFrame does not have 'contents' column")
        else:
            print("Error: parameter 'tag' must be list")

    def labeling(self, data, kospi, days):
        """Label 1 when the stock's forward return beats the KOSPI's.

        For day == 1 uses same-day open->close returns shifted -1; otherwise
        uses `day`-period pct_change on adj_close, shifted -(day + 1).
        """
        data.index = pd.to_datetime(data.index)
        kospi.columns = ['k_open', 'k_close']
        data = pd.merge(data,
                        kospi,
                        how='right',
                        left_index=True,
                        right_index=True)
        data = data.dropna(how='any')
        for day in days:
            if day == 1:
                open = data['open']
                close = data['close']
                rtn = close / open - 1
                mkt_rtn = data['k_close'] / data['k_open'] - 1
                data['label'] = (rtn > mkt_rtn).astype(int).shift(-1)
            else:
                price = data['adj_close']
                rtn = price.pct_change(day).shift(-day - 1)
                mkt_rtn = data['k_close'].pct_change(day).shift(-day - 1)
                data['label%d' % day] = (rtn > mkt_rtn).astype(int)
        data.drop(['k_open', 'k_close'], inplace=True, axis=1)
        indicators = data[data.columns[:]]
        return indicators

    def to_datetime(self, unified_file):
        """Normalize Korean AM/PM date strings and set them as the index.

        Pads single-digit hours, adds 12 to 오후 (PM) hours other than 12,
        then drops the AM/PM marker segment before parsing.
        """
        for i in range(len(unified_file.date)):
            if i % 100 == 0:
                print('processing', i)
            # 19-char dates have a single-digit hour: zero-pad position 15.
            if len(unified_file.date[i]) == 19:
                unified_file.date[i] = unified_file.date[
                    i][:15] + '0' + unified_file.date[i][15:]
            # PM ('오후') and hour != 12 -> add 12 to the hour field.
            if bool(re.search('오후', unified_file.date[i])) & bool(
                    unified_file.date[i][15:17] != '12') == True:
                unified_file.date[i] = unified_file.date[i][:15] + '{}'.format(int(unified_file.date[i][15:17]) + 12) + \
                    unified_file.date[i][17:]
            # Splice out the AM/PM marker (chars 12..15).
            unified_file.date[
                i] = unified_file.date[i][:12] + unified_file.date[i][15:]
        unified_file.date = pd.to_datetime(unified_file.date)
        unified_file.set_index(['date'], inplace=True)
sentence = u'내년도 최저임금을 기존 방식대로 전체 업종에 동일하게 적용하기로 결정했다.\
최저임금의 업종별 차등 적용을 요구해온 사용자위원들은 이에 반발해 전원회의에서 퇴장했다.\
최저임금위원회 사용자위원들은 이날 오후 정부세종청사에서 열린 최저임금위원회 제5차 전원회의 도중 퇴장해 기자들과 만나 \
"금일 최저임금위원회는 최저임금 고시에 월 환산액을 병기하고 2020년 최저임금을 모든 업종에 동일하게 적용하기로 결정했다"고 밝혔다.'
sentences = [sentence] * 10000

import time

from konlpy.tag import Hannanum, Kkma, Komoran, Okt, Mecab
from khaiii import KhaiiiApi

api = KhaiiiApi()

# Benchmark every KoNLPy tagger on the same 10k-sentence workload.
taggers = [('Hannanum', Hannanum()), ('Kkma', Kkma()), ('Komoran', Komoran()),
           ('Okt', Okt()), ('mecab', Mecab())]
for name, tagger in taggers:
    start_time = time.time()
    morphs = [tagger.pos(sentence) for sentence in sentences]
    elapsed_time = time.time() - start_time
    print('morphs_processor name = %20s, %.5f secs' % (name, elapsed_time))

# Khaiii uses analyze() rather than the KoNLPy pos() interface, so it is
# timed separately with the same reporting format.
start_time = time.time()
morphs = [api.analyze(sentence) for sentence in sentences]
elapsed_time = time.time() - start_time
print('morphs_processor name = %20s, %.5f secs' % ('khaiii', elapsed_time))
print(len(data['FReview'][1]), len(extract_corpus_mecab)) # Colab에 Khaiii 설치 !git clone https://github.com/kakao/khaiii.git !pip install cmake !mkdir build !cd build && cmake /content/khaiii !cd /content/build/ && make all !cd /content/build/ && make resource !cd /content/build && make install !cd /content/build && make package_python !pip install /content/build/package_python from khaiii import KhaiiiApi api = KhaiiiApi() n_tags = ['NNG', 'NNP', 'NNB', 'VV', "VA" ] ex = data['Review'][2] bad_ex = data['Review'][4] def extract_corpus_khaiii(texts): extract_corpus = [] for line in texts: if str(line) != 'nan': nouns = [] for word in api.analyze(str(line)): for morphs in word.morphs: if morphs.tag in n_tags: nouns.append(morphs.lex)
from typing import List
from khaiii import KhaiiiApi

_tokenizer = KhaiiiApi()


def tokenize(본문, tagged=False) -> List:
    """Tokenize Korean text with Khaiii.

    Returns a list of lexemes, or (lexeme, tag) tuples when `tagged` is true.
    Blank or whitespace-only input yields an empty list.
    """
    if not 본문.strip():
        return []
    tokens = []
    for word in _tokenizer.analyze(본문):
        for morph in word.morphs:
            tokens.append((morph.lex, morph.tag) if tagged else morph.lex)
    return tokens
        # (fragment: enclosing definition starts before this chunk)
        chosung = CHOSUNGS[chosung_index]
        joongsung = JOONGSUNGS[joongsung_index]
        jongsung = JONGSUNGS[jongsung_index]
        # Syllables without a final consonant are padded with end_char.
        # (translated from the original Korean note)
        if jongsung_index == 0:
            jongsung = end_char
        result.append(chosung)
        result.append(joongsung)
        result.append(jongsung)
    return "".join(result)


# khaiii
khaiii = KhaiiiApi()


def khaiii_tokenize(text):
    """Return surface forms only: each morph stringifies as "lex/tag"."""
    tokens = []
    for word in khaiii.analyze(text):
        tokens.extend([str(m).split('/')[0] for m in word.morphs])
    return tokens


# konlpy tokenizers
mecab = Mecab().morphs
okt = Okt().morphs
komoran = Komoran().morphs
hannanum = Hannanum().morphs  # raises an error (translated original note)
kkma = Kkma().morphs


def space_tokenizer(text):
    """Naive baseline: split on single spaces."""
    return text.split(' ')
def augment_data(args):
    """Augment a TSV dataset (text<TAB>label) using POS-guided sampling.

    Pipeline: load TSV -> POS-tag (spacy / khaiii / npc, or none) ->
    optionally write tagged data unchanged -> build a POS dictionary ->
    generate augmented samples (parallel or sequential) -> write output.
    """
    # Option checking
    if args.no_analyzer:
        args.p_pos = 0.  # disable replacement using POS tags.

    # Load original tsv file
    input_tsv = load_tsv(args.input, skip_header=False)

    if args.no_analyzer:
        # No analyzer: every token gets the dummy tag 'word'.
        sentences = []
        for text, label in tqdm(input_tsv, desc='No POS tagging'):
            sentence = []
            for token in text.split():
                tag = 'word'
                word = Word(token, tag)
                sentence.append(word)
            sentences.append((sentence, label))
    else:
        # POS tagging
        if args.analyzer == 'spacy':
            import spacy
            from spacy.symbols import ORTH
            spacy_en = spacy.load('en_core_web_sm')
            # Keep the mask token as a single token.
            spacy_en.tokenizer.add_special_case(args.mask_token,
                                                [{ORTH: args.mask_token}])
            sentences = [(spacy_en(text), label)
                         for text, label in tqdm(input_tsv,
                                                 desc='POS tagging')]
        if args.analyzer == 'khaiii':
            from khaiii import KhaiiiApi
            khaiii_api = KhaiiiApi()
            sentences = []
            for text, label in tqdm(input_tsv, desc='POS tagging'):
                sentence = []
                khaiii_sentence = khaiii_api.analyze(text)
                for khaiii_word in khaiii_sentence:
                    for khaiii_morph in khaiii_word.morphs:
                        morph = khaiii_morph.lex
                        tag = khaiii_morph.tag
                        # we might need to modify 'morph' for matching the vocab of GloVe.
                        # ex)
                        if tag in ['VV', 'VA', 'VX', 'XSV', 'XSA', 'VCP']:
                            morph += u'다'
                        word = Word(morph, tag)
                        sentence.append(word)
                sentences.append((sentence, label))
        if args.analyzer == 'npc':
            sys.path.append('data/clova_sentiments_morph/npc-install/lib')
            import libpnpc as pnpc
            res_path = 'data/clova_sentiments_morph/npc-install/res'
            npc = pnpc.Index()
            npc.init(res_path)
            sentences = []
            for text, label in tqdm(input_tsv, desc='POS tagging'):
                sentence = []
                npc_sentence = npc.analyze(text)
                for item in npc_sentence:
                    meta = item['meta']
                    # Keep only normal morphemes.
                    if meta != '[NOR]':
                        continue
                    morph = item['morph']
                    tag = item['mtag']
                    word = Word(morph, tag)
                    sentence.append(word)
                sentences.append((sentence, label))

    if args.no_augment:
        # Write to file (tagged but not augmented), then exit the process.
        with open(args.output, 'w') as f:
            for sentence, label in tqdm(sentences, desc='Writing'):
                s = []
                for word in sentence:
                    s.append(word.text)
                if args.preserve_label:
                    out_label = label
                else:
                    out_label = args.dummy_label
                f.write("{}\t{}\n".format(' '.join(s), out_label))
        sys.exit(0)

    # Build lists of words indexes by POS
    pos_dict = {} if args.no_analyzer else build_pos_dict(sentences,
                                                          lower=args.lower)

    # Generate augmented samples
    if args.parallel:
        pool = mp.Pool(mp.cpu_count())
        # processs in parallel
        entries = []
        for sentence, label in tqdm(sentences,
                                    desc='Preparation data for multiprocessing'):
            entry = {'sentence': sentence, 'label': label,
                     'pos_dict': pos_dict, 'args': args}
            entries.append(entry)
        print('Data ready! go parallel!')
        sentences = pool.map(make_samples, entries, chunksize=100)
        # Flatten the list of per-entry sample lists.
        sentences = reduce(lambda x,y: x+y, sentences)
        pool.close()
        pool.join()
        print('Done!')
    else:
        # process sequentially
        augmented = []
        for sentence, label in tqdm(sentences, desc='Sampling'):
            entry = {'sentence': sentence, 'label': label,
                     'pos_dict': pos_dict, 'args': args}
            samples = make_samples(entry)
            augmented.extend(samples)
        sentences = augmented

    # Write to file
    with open(args.output, 'w') as f:
        for sentence, label in tqdm(sentences, desc='Writing'):
            if args.preserve_label:
                out_label = label
            else:
                out_label = args.dummy_label
            f.write("{}\t{}\n".format(' '.join(sentence), out_label))
class TitleMethod(Method):
    """ Title KNN Method class for playlist continuation task cold start problem.

    Title KNN Method.

    Attributes:
        name (str) : name of method
        playlist2idx (dict) : playlist to index dictionary.
        title2playlist (dict) : title to list of playlists dictionary.
        token2idx (dict) : NLP processed token to index dictionary.
        token2title (dict) : NLP processed token to list of titles dictionary.
        doc2vec_model (doc2vec) : Doc2Vec Model in gensim.
        tt_matrix (sparse matrix) : NLP processed token to tag matrix
        ts_matrix (sparse matrix) : NLP processed token to song matrix
        api (KhaiiiApi) : Korean Tokenizer
    Return:
    """

    def __init__(self, name):
        super().__init__(name)
        self.playlist2idx = dict()
        self.title2playlist = dict()
        self.token2idx = dict()
        self.token2title = dict()
        self.unique_token = set()
        self.doc2vec_model = None
        self.tt_matrix = None
        self.ts_matrix = None
        self.api = KhaiiiApi()

    def _tokenize_title(self, title):
        """ Tokenize playlist title.

        Tokenize playlist title using khaiii.

        Attributes:
            title (str) : playlist title
        Return:
            token (list) : list of "lexicon/tag" token
        """
        token = list()
        try:
            words = self.api.analyze(title)
        except KhaiiiExcept:
            # Khaiii can fail on e.g. empty/emoji-only titles.
            words = list()
        for word in words:
            for morph in word.morphs:
                # Keep nouns, verbs, adjectives, copulas, determiners,
                # roots, and general adverbs.
                if morph.tag[:2] in ['NN', 'VV', 'VA', 'VC', 'MM', 'XR'
                                     ] or morph.tag == 'MAG':
                    token.append('/'.join([morph.lex, morph.tag]))
        return token

    def _prepare_data(self):
        """ Prepare necessary data structures for Title KNN Method.

        Prepare necessary data structures for Title KNN Method.

        NOTE(review): this body is duplicated in initialize() below —
        candidates for extraction into one shared helper.
        """
        ### tokenize using khaiii
        ### make csr matrix (token - tag | song)
        row = {'tag': list(), 'song': list()}
        col = {'tag': list(), 'song': list()}
        data = {'tag': list(), 'song': list()}
        token_id = 0
        for title, playlist in self.title2playlist.items():
            # check wheter this title is in train dataset (not validation or test dataset)
            has_train_playlist = False
            for p in playlist:
                playlist_id = self.playlist2idx[p]
                if playlist_id < self.n_train:
                    has_train_playlist = True
                    break
            if not has_train_playlist:
                continue
            token = self._tokenize_title(title)
            for t in token:
                if t in self.token2idx:
                    token_id = self.token2idx[t]
                else:
                    self.token2idx[t] = token_id
                for p in playlist:
                    playlist_id = self.playlist2idx[p]
                    if playlist_id < self.n_train:
                        # Co-occurrence counts: token row x item column.
                        for item_id in self.pt_train[playlist_id].nonzero()[1]:
                            row['tag'].append(token_id)
                            col['tag'].append(item_id)
                            data['tag'].append(1)
                        for item_id in self.ps_train[playlist_id].nonzero()[1]:
                            row['song'].append(token_id)
                            col['song'].append(item_id)
                            data['song'].append(1)
                # Next unseen token gets the next index.
                token_id = len(self.token2idx)
        self.tt_matrix = csr_matrix((data['tag'], (row['tag'], col['tag'])),
                                    dtype=float)
        self.ts_matrix = csr_matrix((data['song'], (row['song'], col['song'])),
                                    dtype=float)
        # IDF-weight both matrices.
        _, self.tt_matrix = transform_idf(self.tt_matrix)
        _, self.ts_matrix = transform_idf(self.ts_matrix)

    def _rate(self, pid, mode):
        """ Make ratings.

        Rate on items(tag/song) based on test data, which index is pid.

        Args:
            pid (int) : playlist id in test data
            mode (str) : determine which item. tags or songs
        Return:
            rating(numpy array): playlist and [tags or songs] rating
        """
        assert mode in ['tags', 'songs']
        title_matrix = self.tt_matrix if mode == 'tags' else self.ts_matrix
        n = self.n_tag if mode == 'tags' else self.n_song
        idx2playlist = {
            idx: playlist
            for playlist, idx in self.playlist2idx.items()
        }
        playlist2title = dict()
        for title, playlists in self.title2playlist.items():
            for playlist in playlists:
                playlist2title[playlist] = title
        rating = np.zeros(n)
        # Test playlists are indexed after the training ones.
        playlist = idx2playlist[pid + self.n_train]
        title = playlist2title[playlist]
        token = self._tokenize_title(title)
        # Keep only tokens seen during training.
        token = [t for t in token if t in self.token2idx.keys()]
        token_ids = [self.token2idx[t] for t in token]
        if len(token_ids) == 0:
            return rating
        # Sum the matched token rows into one item-score vector.
        rating = np.sum(title_matrix[token_ids, :].toarray(),
                        axis=0).reshape(-1)
        return rating

    def initialize(self, n_train, n_test, pt_train, ps_train, pt_test,
                   ps_test, transformer_tag, transformer_song,
                   checkpoint_dir='./checkpoints'):
        """ initialize necessary variables for Method.

        initialize necessary data structure.

        Args:
            n_train (int) : number of playlist in train dataset.
            n_test (int) : number of playlist in test dataset.
            pt_train (csr_matrix) : playlist to tag sparse matrix made from train dataset.
            ps_train (csr_matrix) : playlist to tag sparse matrix made from train dataset.
            pt_test (csr_matrix) : playlist to tag sparse matrix made from test dataset.
            ps_test (csr_matrix) : playlist to song sparse matrix made from test dataset.
            transformer_tag (TfidfTransformer) : scikit-learn TfidfTransformer model fitting pt_train.
            transformer_song (TfidfTransformer) : scikit-learn TfidfTransformer model fitting ps_train.
            checkpoint_dir (str) : where to save similarity matrix.
        Return:
        """
        super().initialize(n_train, n_test, pt_train, ps_train, pt_test,
                           ps_test, transformer_tag, transformer_song)
        ### tokenize using khaiii
        ### make csr matrix (token - tag | song)
        # NOTE(review): duplicates _prepare_data() above.
        row = {'tag': list(), 'song': list()}
        col = {'tag': list(), 'song': list()}
        data = {'tag': list(), 'song': list()}
        token_id = 0
        for title, playlist in self.title2playlist.items():
            # check wheter this title is in train dataset (not validation or test dataset)
            has_train_playlist = False
            for p in playlist:
                playlist_id = self.playlist2idx[p]
                if playlist_id < self.n_train:
                    has_train_playlist = True
                    break
            if not has_train_playlist:
                continue
            token = self._tokenize_title(title)
            for t in token:
                if t in self.token2idx:
                    token_id = self.token2idx[t]
                else:
                    self.token2idx[t] = token_id
                for p in playlist:
                    playlist_id = self.playlist2idx[p]
                    if playlist_id < self.n_train:
                        for item_id in self.pt_train[playlist_id].nonzero()[1]:
                            row['tag'].append(token_id)
                            col['tag'].append(item_id)
                            data['tag'].append(1)
                        for item_id in self.ps_train[playlist_id].nonzero()[1]:
                            row['song'].append(token_id)
                            col['song'].append(item_id)
                            data['song'].append(1)
                token_id = len(self.token2idx)
        self.tt_matrix = csr_matrix((data['tag'], (row['tag'], col['tag'])),
                                    dtype=float)
        self.ts_matrix = csr_matrix((data['song'], (row['song'], col['song'])),
                                    dtype=float)
        _, self.tt_matrix = transform_idf(self.tt_matrix)
        _, self.ts_matrix = transform_idf(self.ts_matrix)

    def predict(self, pid, mode):
        """ Make ratings based on mode.

        rate the playlist, which index in test sparse matrix is pid based on mode.

        Args:
            pid (int) : playlist id in test sparse matrix
            mode (str) : tags or songs
        Return:
            rating (ndarray) : playlist id and rating
        """
        rating = self._rate(pid, mode=mode)
        return rating
# 공백제거 for i in range(len(playlist)): playlist[i] = playlist[i].strip() # # In[7]: # # words = ['발라드','캐럴','케롤','스타워즈','뉴에이지','게임','프로필', # # '마쉬멜로','유산슬','감성발라드','일렉트로닉','섹시','록메탈','힐링'] # # MD.fn_add_khaiidic(words) # # In[4]: from khaiii import KhaiiiApi from pprint import pprint api = KhaiiiApi(rsc_dir='khaiii/build/share/khaiii') # 내 설치 경로 from multiprocessing import Pool import time def fn_analyze_khaiii(i): tmp_list = [] if len(playlist[i]) == playlist[i].count(' '): # 빈칸으로만 되어있는 행 pass elif playlist[i].find(" ") == -1: # 띄어쓰기 없는 행 tmp_list.append(playlist[i]) else:
# Drop rows whose second field is empty.
# BUGFIX: the original called test_data.remove(row) while iterating
# test_data, which skips the element following each removal. Filter into a
# new list and rebind via slice assignment so the list object is still
# mutated in place (other notebook cells may hold references to it).
test_data[:] = [row for row in test_data if row[1]]

# In[8]:

from khaiii import KhaiiiApi

# In[9]:

api = KhaiiiApi()

# In[10]:

# POS-tag each training sentence. str(word) is "<surface>\t<analysis>";
# field [1] keeps only the morphological analysis part.
train_pos = []
for row in train_data:
    sent_pos = []
    sentence = row[1]
    for word in api.analyze(sentence):
        pos = str(word).split('\t')[1]
        sent_pos.append(pos)
    train_pos.append(sent_pos)
train_pos[:5]
# Web-crawler bootstrap: Selenium headless Chrome fetches pages,
# BeautifulSoup parses them, and Khaiii performs Korean morphological
# analysis; results go through the project-local tf_idf/parsing/client modules.
from selenium import webdriver
from bs4 import BeautifulSoup
from khaiii import KhaiiiApi
from tf_idf import Tf_idf
import parsing
import contents_print
import client
from urllib.parse import urljoin
import re

# Khaiii initialized with explicit shared-library and resource-directory
# paths (macOS .dylib build under ./khaiii).
api = KhaiiiApi('./khaiii/khaiii/build/lib/libkhaiii.0.4.dylib', './khaiii/khaiii/build/share/khaiii')
db = client.ClientDb()

max_depth = 3  # maximum crawl recursion depth
url = 'https://www.mma.go.kr/'  # crawl entry point
# NOTE(review): filter domain uses http:// while the start URL is https:// —
# confirm links are matched as intended.
filter_domain = 'http://www.mma.go.kr/'

# Headless Chrome; --no-sandbox / --disable-dev-shm-usage are the usual
# flags for running Chrome inside a container.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('./chrome/chromedriver_linux64/chromedriver', chrome_options=chrome_options)
driver.implicitly_wait(3)  # seconds to wait for elements to appear
driver.set_page_load_timeout(100)

visited_pages = set()  # URLs already crawled
keywords = []
df_dict = {}  # total word df
homepages = []  # total pages visited
final.append(temp) import pandas as pd final = pd.DataFrame(final) final.columns = ['#'] final.to_csv('/content/khaiii/rsc/src/preanal.manual', encoding='utf-8', index=False, header=False) # Khaiii 사용자 사전 추가 !cd /content/khaiii/rsc !mkdir -p /content/build/share/khaiii !PYTHONPATH=/content/khaiii/src/main/python /content/khaiii/rsc/bin/compile_preanal.py --rsc-src=/content/khaiii/rsc/src --rsc-dir=/content/build/share/khaiii from khaiii import KhaiiiApi api = KhaiiiApi(rsc_dir="/content/build/share/khaiii") for word in api.analyze('얼그레이가 맛있습니다.'): for morphs in word.morphs: print(morphs) data = pd.read_csv('/content/final_preprocessed_data.csv', encoding='utf-8', index_col=0) data.head() data.info() """## Experiment 1) Khaiii에서 네이버 플레이스 리뷰 명사만 추출해 Topic Modeling""" from khaiii import KhaiiiApi api = KhaiiiApi(rsc_dir="/content/build/share/khaiii")
def khaiii():
    """Smoke-test endpoint: analyze a fixed Korean sentence with Khaiii.

    Returns:
        str: the module name followed by ", "-separated per-word analysis
        strings for the sample sentence.
    """
    api = KhaiiiApi()
    words = __name__  # seed with the module name so the output identifies its source
    for word in api.analyze('안녕, 세상.'):
        # BUG FIX: `word` is a khaiii Word object, and str + Word raises
        # TypeError — convert explicitly before concatenating.
        words += ", " + str(word)
    return words
# Interactive similar-sentence search setup: read a Korean sentence from
# stdin, morph-analyze it with Khaiii, then (in code past this chunk)
# compare its morphemes against the KCC150 corpus file.
from khaiii import KhaiiiApi
api = KhaiiiApi()
import time

print( '한글 문장 입력: ')  # prompt: "enter a Korean sentence"
input_origin = str(input())

# number of sentences to print in the final output
n = 10

# timer start
start = time.time()

f = open('KCC150_K01_utf8.txt')  # KCC150_K01_utf8 corpus — presumably one sentence per line; confirm

word = []  # list of distinct morphemes seen so far
sentence_morph_list = []  # per-sentence morph-analysis results / occurrence counts
sentence_cnt_of_morph_appear = []  # number of documents each morpheme appears in

# progress (%) reporting state
total_lines = 1000000
line_cnt = 0
percent = 0

# morphemes of the input sentence
input_morph = dict()
for morph_list in api.analyze(input_origin):
    for m in morph_list.morphs:
        # linear membership test is fine here: only one input sentence
        if(m.lex not in word):
            word.append(m.lex)
# Scan pre.txt line by line, run Khaiii on each line, and accumulate the
# surface forms of words whose analysis contains an interesting POS label,
# appending "." to clause-final words.
from khaiii import KhaiiiApi
import io

api = KhaiiiApi()
t = io.open('pre.txt', mode='r', encoding='utf-8')
#x = io.open('2016-10-20.index_new',mode='r', encoding='utf-8')
#nt = io.open('top5_txt',mode='w')

# markers treated as clause-final: final endings (EF/EC), sentence-final
# punctuation (SF), and the 'ㅋ' character
fin = ["EF", "SF", "EC", "ㅋ"]
# POS substrings worth keeping — TODO(review): confirm each ("W" in
# particular) against the Khaiii tag set
label = ["NNG", "W", "MAG", "VA", "NNP"]
cnt = 0
while True:
    text = t.readline()
    total = ""
    if text == "\n":
        print()
        continue
    if "FINISH" in text:  # sentinel line terminates the scan
        break
    print(cnt)
    for word in api.analyze(text):
        tmp = str(word)  # "<surface>\t<morph/TAG + ...>"
        print(tmp)
        # substring match against the whole analysis string — NOTE:
        # `format` shadows the builtin of the same name
        if any(format in tmp for format in label):
            chk = 0
            if any(format in tmp for format in fin):
                chk = 1
            ttmp = tmp.split()
            if chk == 1:
                ttmp[0] = ttmp[0] + "."  # mark clause-final word with a period
            total = total + " " + ttmp[0]
# NOTE(review): this chunk appears truncated — `total` and `cnt` are built
# but never consumed within the visible code.
제시는 요즘 힙합트랩 비트와 알앤비가 마구 섞이는 트렌드에 딱 맞는 재목인데 예능에서만 소비되는게 아까움 톤이면 톤, 스킬이면 스킬 외국에서도 보기드문 보컬인데.. 제시제이랑 좀 비슷한거같으면서 더 찐득한 실제로 본인말로는 외국 유명 프로듀서들이 러브콜해서 진출직전이었는데 코로나사태 이후 중지되었다고..""" parser = argparse.ArgumentParser(description='토크나이저를 테스트할 문장을 입력하세요.') parser.add_argument('--text', required=False, default=default_text) args = parser.parse_args() tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased") khaiii = KhaiiiApi() mecab = Mecab() okt = Okt() komoran = Komoran() hannanum = Hannanum() kkma = Kkma() text = args.text print("-"*5,"원본 텍스트", "-"*5) print(text) print("-"*5, "Mecab", "-"*5) print(mecab.morphs(text)) print("-"*5, "Okt", "-"*5)
"""## Gensim Topic Modeling""" import pandas as pd data = pd.read_csv('/content/final_input.csv', encoding='utf-8').drop(['Unnamed: 0'], axis=1) data.info() """# Gensim LDA를 위한 데이터 전처리 ## Experiment 1) Khaiii에서 명사, 어근만 추출해 Tokenizing """ from khaiii import KhaiiiApi api = KhaiiiApi(rsc_dir="/content/build/share/khaiii") n_tags = ['NNG', 'NNP', 'NNB', 'XR']#, 'VV', "VA"] # 동사도 넣고 싶으면 추가 ''' input : 추출할 Review의 list ; output : n_tags의 tag와 일치하는 text list ; ''' def extract_corpus_khaiii(texts): extract_corpus = [] for line in texts: if str(line) != 'nan': nouns = [] for word in api.analyze(str(line)):
# Tag-frequency utilities for a playlist dataset (Word2Vec / gensim pipeline).
import pickle
import json
import os
import re
import random
import numpy as np
from numpy.linalg import norm
import gensim
import multiprocessing
import pandas as pd
from gensim.models import Word2Vec
from tqdm import tqdm
from khaiii import KhaiiiApi

api = KhaiiiApi()

# flatten one level of nesting: [[a, b], [c]] -> [a, b, c]
flatten = lambda l: [item for sublist in l for item in sublist]

def make_unique_dict(tag_list):
    # Count tag frequencies across all playlists and keep the most frequent
    # tags minus a small stop list.
    # NOTE(review): `Counter` is not imported in this chunk — presumably
    # `from collections import Counter` appears elsewhere in the file.
    flatten_tags = flatten(tag_list)
    tag_counter = Counter(flatten_tags).most_common()
    df_tag_counter = pd.DataFrame(tag_counter, columns=['tags', 'freq'])
    # overly generic tags to drop ("song", "music", "playlist")
    delete_list = ['노래', '음악', '플레이리스트']
    retreval_tags = []
    merge_list = []
    # top-673 cutoff — magic number; confirm where it comes from
    for i in df_tag_counter['tags'][:673]:
        if i in delete_list:
            pass
        else:
            retreval_tags.append(i)
    # NOTE(review): chunk appears truncated here — `merge_list` is unused
    # and no value is returned within the visible code.
def extract_tag_from_song_title():
    """Predict extra tags for each playlist from its title morphemes.

    Reads the train/dev/test playlist JSON files, tokenizes every playlist
    title with Khaiii, keeps only tokens that (a) carry a useful POS and
    (b) appear verbatim as known answer tags, removes tags the playlist
    already has, and writes the per-playlist predictions to
    arena_data/model/pred_tag.json.

    Side effects: prints token statistics, mutates the three DataFrames,
    creates ./arena_data/model and writes pred_tag.json there.
    """
    MODE = "Test"
    if MODE == "Valid":
        train = pd.concat([
            pd.read_json("arena_data/orig/train.json"),
            pd.read_json("arena_data/questions/val.json")
        ], ignore_index=True)
    else:
        train = pd.read_json("res/train.json")
    dev = pd.read_json("res/val.json")
    test = pd.read_json("res/test.json")

    def re_sub(series: pd.Series) -> pd.Series:
        # normalize titles: drop consonant-only jamo (ㅋㅋ...), punctuation,
        # repeated spaces, and ideographic space U+3000
        series = series.str.replace(pat=r'[ㄱ-ㅎ]', repl=r'', regex=True)
        series = series.str.replace(pat=r'[^\w\s]', repl=r'', regex=True)
        series = series.str.replace(pat=r'[ ]{2,}', repl=r' ', regex=True)
        series = series.str.replace(pat=r'[\u3000]+', repl=r'', regex=True)
        return series

    def flatten(list_of_list: List) -> List:
        # flatten one level of nesting
        return [j for i in list_of_list for j in i]

    def get_token(title: str, tokenizer) -> List[Tuple]:
        # empty titles make the tokenizer raise, so short-circuit them
        if len(title) == 0 or title == ' ':
            return []
        result = tokenizer.analyze(title)
        # list of (lexeme, POS-tag) tuples
        return [(morph.lex, morph.tag) for split in result for morph in split.morphs]

    def get_all_tags(df) -> List:
        return flatten(df['tags'].values.tolist())

    tokenizer = KhaiiiApi()

    all_tag = get_all_tags(pd.concat([train, dev, test]))
    token_tag = [get_token(x, tokenizer) for x in all_tag]  # morph-analyze every tag
    # tags that are themselves a single morpheme (not split by the analyzer)
    token_itself = flatten([t for t in token_tag if len(t) == 1])
    flatten_token = flatten(token_tag)
    print('%-23s' % '# of original tag is', f'{len(all_tag):8,}')
    print('%-23s' % '# of morpheme itself is', f'{len(token_itself):8,}')
    print('%-23s' % '# of total token is', f'{len(flatten_token):8,}')

    # tag table: https://github.com/kakao/khaiii/wiki/%EC%BD%94%ED%8D%BC%EC%8A%A4
    using_pos = ['NNG', 'SL', 'NNP', 'MAG', 'SN']  # common noun, foreign word, proper noun, adverb, number
    # PERF: set instead of list — membership is tested once per token below
    unique_word = {x[0] for x in set(token_itself)}

    def predict_tags_inplace(df):
        # Shared pipeline (was copy-pasted three times for train/dev/test):
        # normalize title -> tokenize -> keep useful POS -> keep known tags
        # -> drop tags the playlist already has.
        df['plylst_title'] = re_sub(df['plylst_title'])
        df.loc[:, 'ply_token'] = df['plylst_title'].map(
            lambda x: get_token(x, tokenizer))
        df['ply_token'] = df['ply_token'].map(
            lambda toks: [t for t in toks if t[1] in using_pos])
        # the goal is to hit the answer tags, so keep only known-tag morphemes
        df['ply_token'] = df['ply_token'].map(
            lambda toks: [t for t in toks if t[0] in unique_word])
        df['predict_tag'] = df['ply_token'].map(
            lambda toks: [t[0] for t in toks])
        df['predict_tag'] = df.apply(
            lambda r: [tag for tag in r.predict_tag if tag not in r.tags],
            axis=1)  # already-present tags are excluded

    predict_tags_inplace(train)
    predict_tags_inplace(dev)
    predict_tags_inplace(test)

    # original output order: dev, test, train
    final = []
    for df in (dev, test, train):
        final += list(df[['id', 'predict_tag']].to_dict('index').values())

    distutils.dir_util.mkpath("./arena_data/model")
    with open('arena_data/model/pred_tag.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(final, ensure_ascii=False))