def infl_modify_REG(word, pos, inf):
    """Classify a regular (-REG) verb/adjective stem by its final jamo.

    For 'VERB-REG'/'ADJ-REG' entries the inflection tag gains a numeric
    suffix ('1'..'3') depending on which vowel group the stem ends in; a
    final jamo in no group is the '-REG0' case and the tag is left as-is.
    Any other tag passes straight through.

    Returns (word, pos, inf): word recomposed via join_jamos(split_syllables(word)),
    pos untouched, inf possibly suffixed.

    The original repeated the join/recompose triplet in all four branches;
    it is hoisted here once, which is behaviorally identical.
    """
    reg1_list = ['ㅑ','ㅒ','ㅖ','ㅘ','ㅛ','ㅙ','ㅚ','ㅝ','ㅠ','ㅡ','ㅢ','ㅣ']
    reg2_list = ['ㅗ','ㅜ']
    reg3_list = ['ㅏ','ㅓ','ㅕ','ㅐ','ㅔ','ㅞ']

    if inf not in ('VERB-REG', 'ADJ-REG'):
        # not a -REG entry: nothing to modify
        return word, pos, inf

    jamos = split_syllables(word)
    # round-trip through join_jamos exactly as every original branch did
    word = join_jamos(jamos)
    last = jamos[-1]
    if last in reg1_list:
        inf += '1'
    elif last in reg2_list:
        inf += '2'
    elif last in reg3_list:
        inf += '3'
    # else: -REG0 case, tag unchanged
    return word, pos, inf
Exemple #2
0
def split_raw(raw_data, text_modeling):
    """Tokenize *raw_data* according to *text_modeling*.

    'chr' keeps '.', ',' and Hangul, decomposing full syllables into jamos
    (bare jamos pass through); 'syl' keeps '.', ',' and full syllables only.
    Words are separated by a u' ' marker; empty/unknown input yields [u' '].
    """
    # initialized up front so an invalid modeling cannot NameError below
    data = []
    if text_modeling == 'chr':
        for spl_data in raw_data.split():
            data.append(u' ')
            for c in spl_data:
                # 0x3131-0x3163: single jamos, 0xac00-0xd7a3: full syllables
                # BUG FIX: the original tested ``ord(c) in ['.', ',']`` --
                # an int compared against strings, which never matched, so
                # punctuation was silently dropped. Compare the char itself.
                if (c in ['.', ',']) or (0xac00 <= ord(c) <= 0xd7a3):
                    data.extend(split_syllables(c))
                elif 0x3131 <= ord(c) <= 0x3163:
                    data.append(c)
    elif text_modeling == 'syl':
        for spl_data in raw_data.split():
            data.append(u' ')
            for c in spl_data:
                # only '.', ',' and full Hangul syllables survive
                if (c in ['.', ',']) or (0xac00 <= ord(c) <= 0xd7a3):
                    data.append(c)
    else:
        print('Invalid text modeling')
    if len(data) == 0:
        data = [u' ']
    return data
 def search(self, trie_list, word):
     """Walk *trie_list* along the jamos of *word* and return the info
     stored at the final jamo (the left cell of its slot)."""
     word = split_syllables(word)
     if len(word) == 1:
         # leaf: slot [0] carries the stored info (may be None).
         return trie_list[self.index(word)][0]
     head, rest = word[0], word[1:]
     # slot [1] links to the child node list for the following jamos.
     return self.search(trie_list[self.index(head)][1], rest)
def KoreanToEnglish(data):
    """Transliterate *data* jamo-by-jamo via the KoreanToEnglishchange map.

    The input is first decomposed into jamos; characters absent from the
    map pass through unchanged (same effect as the original per-char
    try/except KeyError). str.join replaces the quadratic ``+=`` loop.
    """
    jamos = split_syllables(data)
    return "".join(KoreanToEnglishchange.get(ch, ch) for ch in jamos)
    def parse_transcript(self, transcript_path):
        """Read a transcript file and map its characters to label indices.

        Newlines are stripped; if self.use_jamo is set, the text is first
        decomposed into jamos. Characters missing from self.labels_map (and
        any falsy label, e.g. 0) are dropped, exactly as filter(None, ...)
        did in the original.
        """
        with open(transcript_path, 'r', encoding='utf8') as transcript_file:
            text = transcript_file.read().replace('\n', '')

        if self.use_jamo:
            text = split_syllables(text)

        mapped = [self.labels_map.get(ch) for ch in list(text)]
        return list(filter(None, mapped))
def handleKeyRelease(key):
    """Key-release hook: when the trigger chord is complete, grab the text
    left of the cursor, swap mistyped Korean<->English layouts, and paste
    the corrected text back.

    Relies on module globals: store/store2 (sets of currently held keys),
    check1/check2 (the trigger chords), plus pyautogui/pyperclip helpers
    and the conversion functions. Returning False stops the pynput
    listener.
    """
    if store == check1:
        print("program")

        # select from the cursor back to the start of the line
        pyautogui.keyDown('shift')
        pyautogui.keyDown('home')
        time.sleep(0.1)
        pyautogui.keyUp('shift')
        pyautogui.keyUp('home')

        # copy the selection to the clipboard
        pyautogui.keyDown('ctrl')
        pyautogui.keyDown('c')
        time.sleep(0.1)
        pyautogui.keyUp('ctrl')
        pyautogui.keyUp('c')
        time.sleep(0.2)

        # delete the selected (mistyped) text
        pyautogui.hotkey('backspace')

        data = pyperclip.paste()
        data = split_syllables(data)
        finaldata = ""
        # convert each jamo/character to the opposite keyboard layout
        for i in data:
            print("test", isEnglishOrKorean(i))
            if isEnglishOrKorean(i) == 'k':
                finaldata += KoreanToEnglish(i)
                print("check", KoreanToEnglish(i))
            elif isEnglishOrKorean(i) == 'e':
                finaldata += EnglishToKorean(i)
            print(KoreanToEnglish(i), " | ", finaldata, " | ", i)
        # recompose jamos into full syllables before pasting
        finaldata = join_jamos(overlap1(finaldata))
        print(finaldata)

        # NOTE(review): the last character is deliberately dropped here --
        # presumably a trailing artifact of the conversion; confirm.
        pyperclip.copy(finaldata[:len(finaldata) - 1])
        pyautogui.hotkey("ctrl", "v")

        store.remove(Key.shift_r)
        store.remove(Key.home)

    if store2 == check2:
        # exit chord: stop the listener
        store2.remove(Key.shift)
        store2.remove(Key.esc)
        return False

    if key in store:
        store.remove(key)

    if key in store2:
        store2.remove(key)
 def insert(self, word, info_list):
     """Insert *word* (decomposed into jamos) into this trie with *info_list*.

     The first jamo selects a slot via self.index(); the remainder recurses
     into the child Trie stored in that slot's second cell, which is created
     on demand. The two original branches were identical apart from that
     creation, so they are collapsed into create-then-insert; ``is None``
     replaces the ``== None`` comparison.
     """
     word = split_syllables(word)
     if len(word) == 1:
         self.set_info(word, info_list)
     else:
         f_word = word[0]
         word = word[1:]
         char_idx = self.index(f_word)

         # slot [1] holds the child Trie linking to the next jamo
         if self[char_idx][1] is None:
             self[char_idx][1] = Trie()
         self[char_idx][1].insert(word, info_list)
def insert_dic_to_trie(ent_or_fun, trie_class):
	"""Populate *trie_class* from 'assets/voca_set.xlsx'.

	ent_or_fun -- 'fun' loads the function-word sheet, inserting each word
	reversed (so lookups scan from the word's tail); any other value loads
	the entry sheet, normalizing each row through infl_modify_REG first.
	Returns the populated trie.
	"""

	workbook = xlrd.open_workbook('assets/voca_set.xlsx')

	if ent_or_fun == 'fun': # function-word dictionary
		
		worksheet2 = workbook.sheet_by_name('기능어사전')
		num_rows = worksheet2.nrows # number of rows
		num_cols = worksheet2.ncols # number of columns (unused; kept for reference)

		# row_val = worksheet_name.row_values(row_index) # fetch a whole row as a list
		# cell_val = worksheet_name.cell_value(row_index,cell_index)

		for i in range(1, num_rows):
			word = worksheet2.cell_value(i, 0)
			pos = worksheet2.cell_value(i, 1)
			inf = worksheet2.cell_value(i, 2)
			anly = worksheet2.cell_value(i, 3)
			_info = pos+' '+inf+' '+anly
			decomposed_w = split_syllables(word)
			reversed_w = decomposed_w[::-1] # decompose first so the jamo sequence can be reversed
			trie_class.insert(reversed_w, _info) # already decomposed, but insert() tolerates that
		
	
	else: # entry dictionary
	
		worksheet1 = workbook.sheet_by_name('엔트리사전')
		num_rows = worksheet1.nrows # number of rows
		num_cols = worksheet1.ncols # number of columns (unused; kept for reference)

		# row_val = worksheet_name.row_values(row_index) # fetch a whole row as a list
		# cell_val = worksheet_name.cell_value(row_index,cell_index)

		for i in range(1, num_rows):
			word = worksheet1.cell_value(i, 0)
			pos = worksheet1.cell_value(i, 1)
			inf = worksheet1.cell_value(i, 2)
			word, pos, inf = infl_modify_REG(word, pos, inf)
			
			anly = word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie_class.insert(word, _info) # insert() decomposes the word into jamos itself

	return trie_class
Exemple #9
0
def text_to_phoneme(text, save_dir):
    """Convert raw review texts into phoneme-index arrays and save them.

    text -- iterable of strings; save_dir -- output root for npz/dictionary.
    Side effects: writes x/y .npz arrays and a phoneme dictionary pickle
    under save_dir. Always returns None.
    """
    logger = set_logger('phoneme-process')
    sp_text = []
    # keep only Hangul jamos (U+3131-U+3163) and syllables (U+AC00-U+D7A3);
    # everything else collapses to a single space after decomposition
    hangul = re.compile('[^\u3131-\u3163\uac00-\ud7a3]+')
    for split in text:
        review = hangul.sub(' ', split_syllables(split))
        if len(review) != 0:
            sp_text.append(review)

    unq_phoneme = []
    logger.info('Set Dictionary.')

    # collect unique phonemes in first-seen order
    for line in tqdm(sp_text):
        for phoneme in line:
            if phoneme not in unq_phoneme:
                unq_phoneme.append(phoneme)

    logger.info('# of unique Phoneme : {}\nexample : {}'.format(len(unq_phoneme), unq_phoneme[:50]))

    # labels start at 1; index 0 is implicitly reserved (e.g. for padding)
    phoneme_label = {ch: i + 1 for i, ch in enumerate(unq_phoneme)}
    label_phoneme = {i + 1: ch for i, ch in enumerate(unq_phoneme)}

    x = np.asarray([[phoneme_label[w] for w in sent if w in phoneme_label.keys()] for sent in sp_text])
    # NOTE(review): labels assume exactly 45000 negative then 45000 positive
    # samples in order (an NSMC-sized corpus, presumably) -- confirm that
    # the caller always passes data in that layout.
    y_neg = [[1, 0] for _ in range(45000)]
    y_pos = [[0, 1] for _ in range(45000)]
    y = np.asarray(y_neg + y_pos)

    # shuffle x and y identically by re-seeding before each shuffle
    np.random.seed(618);
    np.random.shuffle(x)
    np.random.seed(618);
    np.random.shuffle(y)

    # Check Folder
    folder_check(dir_path=save_dir, dir_name='npz')
    folder_check(dir_path=save_dir, dir_name='dictionary')

    # Save Array & Dictionary
    save_npz(npz_path=save_dir + '/npz', npz_name='x_phoneme.npz', arr=x)
    save_npz(npz_path=save_dir + '/npz', npz_name='y_phoneme.npz', arr=y)
    save_pkl(pkl_path=save_dir + '/dictionary', pkl_name='dictionary_phoneme.pkl', save_object=label_phoneme)

    return None
	def extract(self, target_word):
		""" 1 ~ 6 : Segmenting using 최장일치 """
		""" 7 ~ : POS Ambiguity 해결 using 접속정보 """

		########################################
		""" 1. 모든 경우의 entry-analysis 저장 """
		########################################
		# 모음,자음단위로 나눠서(decomposition하여) 모든 경우의 조합들을 사전 lookup을 통해 뽑아낸다.
		### decomposition
		target_word = split_syllables(target_word)
		len_target_word = len(target_word)
		### lookup all entries
		# 원래 엔트리가 초성, 중성, 종성이 모두 있는데 초성, 중성까지만 있는 entry는 삭제 
		# 즉, 해당 엔트리 바로 오른쪽에 자음이 두 번 등장하는 경우 삭제 
		# (e.g. '신고전주의성진을'를 형태소 분석할 때, '시', '고저' 엔트리들은 삭제, 고려해볼 필요도 없다.)
		collect_entry_anly = [] # 모든 경우의 entry 정보가 들어감.
		#print(len_target_word)

		""" 순방향 스캔 for 엔트리-사전 """
		for i, _ in enumerate(target_word):
			for j, _ in enumerate(target_word[i:]):
				#print(target_word[i:][:j+1], i, j+i)
				if not search(self.trie_ent, target_word[i:][:j+1]) == 'No such word in Trie':
					entry = join_jamos(target_word[i:][:j+1])
					anly = search(self.trie_ent, target_word[i:][:j+1])
					collect_entry_anly.append([(entry)]+[i]+[j+i]+['ENT']+anly)     

		""" 역방향 스캔 for 기능어-사전 """
		toggle_fun = False
		#one_ch_fun = ['ㄴ','ㄹ','ㄻ','ㅁ','ㅆ']
		target_word = target_word[::-1]
		for i, _ in enumerate(target_word):
			for j, _ in enumerate(target_word[i:]):
				#print(target_word[i:][:j+1], len_target_word-1-i, len_target_word-1-j-i)
				if not search(self.trie_fun, target_word[i:][:j+1]) == 'No such word in Trie':

					#if i==0: # 기능어는 한 개만 (뒤에서부터 검색) 찾아서 넣어준다. 하지만, 기존 기능어를 포함하는 더 큰 범위의 기능어가 있으면 또 추가한다. (즉, i=0이면..) 단, 포함하지 않으면 추가하지 않는다.
					entry = join_jamos(target_word[i:][:j+1][::-1])
					anly = search(self.trie_fun, target_word[i:][:j+1])
					collect_entry_anly.append([(entry)]+[len_target_word-1-j-i]+[len_target_word-1-i]+['FUN']+anly) 

		#             if toggle_fun == False
		#                 if entry not in one_ch_fun: # 하나의 자음으로 이뤄진 기능어는 count하지 않는다. '을' 기능어에 'ㄹ'기능어가 또 있다. 이것은 나중에 다시 처리한다.
		#                     toggle_fun = True

		# 2개의 trie를 사용했지만, 하나의 dict에 저장한다.

		#################################################################
		""" 2. 전처리 (entry와 index모두 똑같은 entry-analysis 서로 병합) """
		#################################################################
		# 예를 들어, 엔트리-사전의 '신'과 기능어-사전의 '신'이 index range마저 똑같다면, 하나의 '신'으로 합쳐준다.
		# 그런데, 맨 뒷부분에서 기능어는 1개만 가지기 때문에 엔트리-사전과 겹칠일이 많치는 않다.
		# 예를 들어, 을 같은 경우는 엔트리-사전과 기능어-사전에 동시에 있고 index도 같을 수 있다. (하지만 여기서 엔트리-사전에 '을'이 없다.)
		# 윗 단에서 최대한 경우의 수를 줄여줘야 한다. 그렇지 않으면, 밑으로 갈 수록 경우의 수는 기하급수적으로 늘어날 수 있다.
		# 단, 병합할 때 같은 종류의 엔트리만 고려한다. 즉, FUN, ENT 사이의 병합은 하지 않는다. EX. '을' 같은 경우는 2가지 모두 존재한다.
		for i, _ in enumerate(collect_entry_anly):
			for j in range(0, len(collect_entry_anly)):
				if not (j == i or collect_entry_anly[j][3] != collect_entry_anly[i][3]): 
					# 자기 자신과 똑같은 친구는 빼고. 똑같은 ENT or FUN만 고려한다.
					if collect_entry_anly[i][0] == collect_entry_anly[j][0] and collect_entry_anly[i][1] == collect_entry_anly[j][1]:
						collect_entry_anly[i] += collect_entry_anly[j][4:]
						collect_entry_anly[j].insert(0, 'to-be-deleted')   

		# remove elements, which has 'to-be-deleted' in the first position
		collect_entry_anly = [elem for elem in collect_entry_anly if elem[0] != 'to-be-deleted']

		# ###############################################
		# """ 3. Longest 기능어만 살리기 (경우의 수 줄이기)"""
		# ###############################################
		# # 만약 기능어들이 같은 index range를 공유한다면, 가장 긴 기능어를 살리고 나머진 삭제한다. (우리는 기능어가 1개만 있다고 가정을 하였다.)
		# # 단, 엔트리들은 이런식으로 삭제하면 안된다. 모두 살려두어 나중에 분석결과에 모두 표시해준다.
		# # 예를 들어 'ㄹ' 기능어 (20, 20), '을' 기능어 (18, 20)가 있는 경우 'ㄹ' 기능어를 삭제한다.
		# filtered_fun = [i for i in collect_entry_anly if i[3] == 'FUN'] # filtering only fun
		# filt_sorted_fun = sorted(filtered_fun, key = lambda x: len(split_syllables(x[0])))
		# for i, _ in enumerate(filt_sorted_fun):
		#     target = filt_sorted_fun[i]
		#     for j in range(i+1, len(filt_sorted_fun)):
		#         ref = filt_sorted_fun[j]
		#         if int(ref[1]) <= int(target[1]):# and int(ref[2]) >= int(target[2]):
		#             filt_sorted_fun[i].insert(len(filt_sorted_fun[i]), 'to-be-deleted') # add on last position

		# # remove elements, which has 'to-be-deleted' in the last position
		# filt_sorted_fun = [elem for elem in filt_sorted_fun if elem[-1] != 'to-be-deleted']            

		# # merge
		# filtered_ent = [i for i in collect_entry_anly if i[3] == 'ENT'] # filtering only fun
		# final_collect_entry_anly = filtered_ent + filt_sorted_fun

		final_collect_entry_anly = collect_entry_anly[:]
		###################################################################################
		""" 4. 엔트리 중 내 바로 오른쪽 index가 비어있는 경우 해당 엔트리 지우기 (경우의 수 줄이기)"""
		###################################################################################
		# 예를 들어, 신고전주의정신을 -> 시, 저, 중 은 삭제된다. 다음 index가 비어있기 때문에
		for i, _ in enumerate(final_collect_entry_anly):
			tnk = False
			for j in range(0, len(final_collect_entry_anly)):
				if final_collect_entry_anly[i][2]+1 ==  final_collect_entry_anly[j][1]:
					tnk = True
				elif final_collect_entry_anly[i][2]+1 == len_target_word:
					tnk = True
			if tnk == False:
				final_collect_entry_anly[i].insert(0, 'to-be-deleted')

		# remove elements, which has 'to-be-deleted' in the first position        
		final_collect_entry_anly = [elem for elem in final_collect_entry_anly if elem[0] != 'to-be-deleted']

		#########################################################################
		""" 5. 조합하기 (엔트리 묶음으로 즉, 엔트리 종류는 1개라고 가정한다 여기서는..) """
		#########################################################################
		# 조합하는 과정에서, 띄어쓰기 오류 또는 신조어 오류를 확인할 수 있다.
		# 오류확인 1. sorting 후에 마지막 element의 start index가 target_word의 길이와 맞지 않을 경우.

		# sorting based on start index
		final_collect_entry_anly = sorted(final_collect_entry_anly, key = lambda x: int(x[1]))

		# 오류확인 1. 처음과 끝의 경계는 아래와 같이 체크하면 된다. (중간에 있는 오류는 다음단계에서 확인한다.)
		assert(final_collect_entry_anly[0][1] == 0) # 처음이 index 0으로 시작하지 않을 경우.
		assert(final_collect_entry_anly[-1][2] == len_target_word-1) # 마지막이 word길이만큼의 index가 아닐 경우.

		### merging / combining all cases using recursive function
		list_global_cases, path = [], []
		# filtering only index0 (Starting point to recursive function)
		index0_entry_anly_set = [ele for ele in final_collect_entry_anly if int(ele[1]) == 0] # first position
		# recursive merging
		recursive_merging(final_collect_entry_anly, index0_entry_anly_set, path, list_global_cases)

		# 오류확인 2. 처음부터 끝까지 이어지는 하나의 case도 없을 경우.
		fullpath_check = False
		for i in range(0, len(list_global_cases)):
			if list_global_cases[i][-1][2]+1 == len_target_word:
				fullpath_check = True
		assert(fullpath_check == True) # 오류 난다면 사전 lookup 문제이다
		
		
		### Sequence가 0에서부터 len_target_word-1까지 이어지지 않으면 삭제한다.
		for i, seq in enumerate(list_global_cases):
			if seq[0][1] != 0 or seq[-1][2] != len_target_word-1:
				list_global_cases[i] = 'to-be-deleted'
		# remove elements, which has 'to-be-deleted' in the first position        
		list_global_cases = [elem for elem in list_global_cases if elem != 'to-be-deleted']
		
		#######################################################
		""" 6. 형태소 조합에서 기능어가 있을 때, 마지막에 없는 경우는 삭제하자. """
		#######################################################
		# 즉, FUN이 2번 연속 등장하는 경우는 삭제된다
		for i, case in enumerate(list_global_cases):
			case_len = len(case)
			for j, entry in enumerate(case):
				if case[j][3] == 'FUN':
					if j!=case_len-1: # FUN이 마지막에 있지 않은 경우...
						#print(entry)
						list_global_cases[i] = 'to-be-deleted'
Exemple #11
0
def jamo(x):
    """Return the jamo decompositions of every Hangul syllable in *x*,
    each prefixed with a bullet, shuffled, and joined with '<br>'.
    """
    from hangul_utils import split_syllables
    # FIX: the original ``ur'...'`` prefix is a SyntaxError on Python 3.
    # A plain raw string works because the re engine itself expands \uXXXX
    # escapes inside patterns (Python 3.3+).
    expr = re.findall(r'[\uac00-\ud7a3]', x, re.UNICODE)
    ls = [u'• ' + split_syllables(i) for i in expr]
    random.shuffle(ls)  # in-place; order is intentionally randomized
    return u'<br>'.join(ls)
Exemple #12
0
        if col == 4:
            r['author'] = sheet.cell(row=row + 1, column=col + 1).value
        elif col == 2:
            r['book'] = sheet.cell(row=row + 1, column=col + 1).value
    init.append(r)

print(init)

cc = 1

for i in init:
    i['author'] = i['author'].replace(" ", "")
    author = i['author']
    book = i['book']

    jamo = split_syllables(author[1])
    jamo_list = list(jamo)

    if jamo_list[0] == 'ㅊ':
        num1 = '8'
        num2 = ch_num2(jamo_list[1])

    elif (jamo_list[0] == 'ㄱ') or (jamo_list[0] == 'ㄲ'):
        num1 = '1'
        num2 = all_num2(jamo_list[1])

    elif jamo_list[0] == 'ㄴ':
        num1 = '19'
        num2 = all_num2(jamo_list[1])

    elif (jamo_list[0] == 'ㄷ') or (jamo_list[0] == 'ㄸ'):
Exemple #13
0
def get_similar_words(model, word, top_n=10):
    """Return up to *top_n* (word, score) neighbours of *word*.

    The query is decomposed into jamos for the jamo-level embedding model
    and each neighbour is recomposed with join_jamos. The original reused
    the name ``jamo`` for both the query and the loop variable; distinct
    names and a comprehension make the data flow explicit.
    """
    query = split_syllables(word)
    return [(join_jamos(neighbour), score)
            for neighbour, score in model.wv.most_similar(query, topn=top_n)]
Exemple #14
0
def get_word_vector(model, word):
    """Return the embedding vector for *word*, looked up at the jamo level."""
    decomposed = split_syllables(word)
    return model.wv[decomposed]
Exemple #15
0
model.wv.most_similar('computer', topn=5)

# 어휘 사전 확인
model.wv.vocab['eps']

# hash table (bucket) 확인. subword들의 워드 벡터가 저장된 공간.
model.wv.vectors_ngrams

# TODO: 자소 단위 fasttext
from hangul_utils import split_syllables, join_jamos
from gensim.models.fasttext import FastText
import numpy as np
import pickle

# 한글 자모 분리/합침 연습
jamo = split_syllables('안녕하세요')
word = join_jamos(jamo)
print(jamo)
print(word)

# Commented out IPython magic to ensure Python compatibility.
# 전처리가 완료된 한글 코퍼스를 읽어온다.
# %cd '/content/drive/MyDrive/Colab Notebooks'
with open('data/konovel_preprocessed.pkl', 'rb') as f:
    sentence_list = pickle.load(f)

sentence_list[0]

# sentence_list를 한글 자모로 분리한다.
sentence_jamo = []
for sentence in sentence_list:
Exemple #16
0
from hangul_utils import split_syllables, join_jamos

# Quick demo: decompose a string that mixes full syllables and bare jamos
# (ㄹ, ㅇ) into its jamo sequence and print it.
test_text = "가뷁바라갇ㄹ갇ㅇ"
jamo = split_syllables(test_text)
print(jamo)
Exemple #17
0
def get_jaso(word):
    """Flatten *word* into the list of jamos produced for each character."""
    return [jaso for char in word for jaso in split_syllables(char)]
def orgin_morph_idx(morph_result, idx):
    """Attach jamo start/end indices to every recovered original morpheme.

    morph_result -- list of analysis cases; each case is a list of entries
    shaped [surface, start, end, kind, info, ...] where each info string
    ends in 'morph/TAG' (possibly a composite 'a/TAG1+b/TAG2').
    idx -- jamo offset of this word within the full input.
    Returns a parallel list of cases, each a list of
    [original_morph, start_idx, end_idx] triples.
    """
    # NOTE(review): [[]] * n aliases one list n times; harmless here because
    # every slot is reassigned below, but it would bite on in-place mutation.
    new_morph_result = [[]] * len(morph_result)
    
    for i, case in enumerate(morph_result):
        str_idx = idx
        temp = []
        for entry in case:
            origin_morph_info_list = entry[4:] # e.g. ['XSN NounV 이/XSN', 'VV VERB-L1 일/VV', 'VV VERB-S1 잇/VV']
            #print(entry)
            if len(origin_morph_info_list) == 1: # exactly one analysis info attached
                
                if len(entry[-1].split()[-1].split('+'))==1: # ...and it is not a '+' composite
                    #print('*** 1 ***')
                    #print(entry[0])
                    len_syllables_fix = len(split_syllables(entry[0]))
                    #print(origin_morph_info_list)
                    #print(entry[0])
                    org_morph = origin_morph_info_list[0].split()[-1].split('/')[0]
                    temp.append([org_morph, str_idx, str_idx+len_syllables_fix-1])
                    str_idx += len_syllables_fix
                    #print('1, temp: ',temp)
                    
                    
                else:
                    #print('*** 2 ***')
                    # usually splits in two, e.g. ['에선', 8, 12, 'FUN', 'J N-1-0 에서/JKB+는/JX']
                    len_syllables_fix = len(split_syllables(entry[0])) # freeze the total jamo length
                    str_idx_fix = str_idx
                    
                    plus_entry_list = entry[-1].split()[-1].split('+')
                    #print(entry)
                    #print(plus_entry_list)
                    
                    for m, pair in enumerate(plus_entry_list):
                        temp_length = 1 # advance by +1 per piece; the final piece absorbs the rest up to len_syllables_fix
                        org_morph = pair.split('/')[0]

                        if m == len(plus_entry_list)-1:
                            temp.append([org_morph, str_idx, str_idx_fix+len_syllables_fix-1])
                            str_idx = str_idx_fix + len_syllables_fix
                        else:
                            temp.append([org_morph, str_idx, str_idx+temp_length])
                            str_idx += temp_length+1                            
                            
                    #print('2, temp: ',temp)
                            
            else: # two or more info strings, e.g. ['XSN NounV 이/XSN', 'VV VERB-L1 일/VV']
                #print('*** 3 ***')
                # all alternatives share the surface span of entry[0]
                for k, each_morph_info in enumerate(origin_morph_info_list):
                    org_morph = each_morph_info.split()[-1].split('/')[0]
                    if k==0:
                        len_syllables_fix = len(split_syllables(entry[0]))
                    temp.append([org_morph, str_idx, str_idx+len_syllables_fix-1])
                    if k==len(origin_morph_info_list)-1:
                        str_idx += len_syllables_fix
                        
                    #print('3, temp: ',temp)
            
        new_morph_result[i] = temp
        #print('case', i, ':', temp)
        #print('\n')
        #print('--------------------> ',new_morph_result)
    return new_morph_result
Exemple #19
0
def get_jaso_index(word, jaso_to_index):
    """Map every jamo of *word* to its integer index via *jaso_to_index*.

    Raises KeyError if a jamo is missing from the map, exactly as the
    original nested loops did.
    """
    return [jaso_to_index[jaso]
            for char in word
            for jaso in split_syllables(char)]
def insert_extdic_to_entTrie(trie):
	"""Derive surface forms for irregular (불규칙) entries and insert them.

	Reads the entry sheet of 'assets/voca_set.xlsx' and, for each stem whose
	inflection code marks an irregular class (-P, -T, -L, -YE, -S, -LU, -U,
	-LE, -WU, -H) or a vowel-contracting regular class (-REG), rewrites the
	final jamo(s) into the conjugated surface form, appends a numeric suffix
	to the inflection code, and inserts the result into *trie*. The stored
	_info generally keeps the original word so analyses map back to the
	lemma. Returns the populated trie.
	"""

	workbook = xlrd.open_workbook('assets/voca_set.xlsx') 
	worksheet1 = workbook.sheet_by_name('엔트리사전')
	num_rows = worksheet1.nrows # number of rows
	num_cols = worksheet1.ncols # number of columns (unused; kept for reference)

	neg_vowels = ['ㅣ','ㅡ', 'ㅜ','ㅓ', 'ㅠ', 'ㅕ', 'ㅐ'] # negative (dark) vowels
	pos_vowels = ['ㅗ', 'ㅏ','ㅛ', 'ㅑ'] # positive (bright) vowels

	for i in range(1, num_rows):  
		org_word = worksheet1.cell_value(i, 0)
		pos = worksheet1.cell_value(i, 1) # pos is used unchanged
		org_inf = worksheet1.cell_value(i, 2)
		org_anly = org_word+'/'+pos
		org__info = pos+' '+org_inf+' '+org_anly    
		
		
		# -P irregular (ㅂ-irregular)
		if org_inf == 'VERB-P' or org_inf == 'ADJ-P': # e.g. 눕
			LOG(org__info) # debug trace
			### VERB-P1 # e.g. 누우
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			list_word[-1] = '우' # replace the final 'ㅂ' with '우'
			word = "".join(list_word)
			word = join_jamos(word)
			# pos -> used unchanged.
			# inf
			inf = org_inf + '1'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) # debug trace
			### VERB-P2 # e.g. 누워
			if org_word == '돕' or org_word == '곱': # exception: 돕 ---VERB-P2---> 도와
				# word
				word = split_syllables(org_word)
				list_word = list(word)
				list_word[-1] = '와'
				word = "".join(list_word)
				word = join_jamos(word)
				# pos -> used unchanged.
				# inf
				inf = org_inf + '2'
				# anly
				# NOTE(review): built from the modified word here, unlike the
				# sibling branches which use org_word -- confirm intent.
				anly = word+'/'+pos
				_info = pos+' '+inf+' '+anly
				trie.insert(word, _info)            
			else:
				# word
				word = split_syllables(org_word)
				list_word = list(word)
				list_word[-1] = '워'
				word = "".join(list_word)
				word = join_jamos(word)
				# pos -> used unchanged.
				# inf
				inf = org_inf + '2'
				# anly
				anly = org_word+'/'+pos
				_info = pos+' '+inf+' '+anly
				trie.insert(word, _info)        
			LOG(word +' ---> '+ _info) # debug trace
			LOG('\n')
	 

		if org_inf == 'VERB-T': # e.g. 일컫
			LOG(org__info) # debug trace
			### VERB-T1 # e.g. 일컬
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			list_word[-1] = 'ㄹ'
			word = "".join(list_word)
			word = join_jamos(word)
			# pos -> used unchanged.
			# inf
			inf = org_inf + '1'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) # debug trace
			LOG('\n')


		# -L irregular
		if org_inf == 'VERB-L' or org_inf == 'ADJ-L': # e.g. 이끌
			LOG(org__info)
			### VERB-L1 # e.g. 이끄
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			del list_word[-1]
			word = "".join(list_word)
			word = join_jamos(word)
			# pos -> used unchanged.
			# inf
			inf = org_inf + '1'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) # debug trace
			LOG('\n')

			
		# -YE irregular
		if org_inf == 'VERB-YE' or org_inf == 'ADJ-YE': # e.g. 난파하
			LOG(org__info)
			### VERB-YE1 # e.g. 난파해
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			list_word[-1] = 'ㅐ'
			word = "".join(list_word)
			word = join_jamos(word)
			# pos -> used unchanged.
			# inf
			inf = org_inf + '1'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) # debug trace
			### VERB-YE2 # e.g. 난파하여
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			list_word[-1] = 'ㅏ여'
			word = "".join(list_word)
			word = join_jamos(word)
			# pos -> used unchanged.
			# inf
			inf = org_inf + '2'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) # debug trace
			LOG('\n')


		# -S irregular
		if org_inf == 'VERB-S' or org_inf == 'ADJ-S': # e.g. 규정짓
			LOG(org__info)
			### VERB-S1 # e.g. 규정지
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			del list_word[-1]
			word = "".join(list_word)
			word = join_jamos(word)
			# pos -> used unchanged.
			# inf
			inf = org_inf + '1'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) # debug trace
			LOG('\n')

		
		# -LU irregular
		if org_inf == 'VERB-LU' or org_inf == 'ADJ-LU': # (르-irregular) e.g. 흐르 (dark vowel), 가르 (bright vowel) 
			LOG(org__info)
			### VERB-LU1 # e.g. 흘러 (dark vowel), 갈라 (bright vowel)
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			# first drop the trailing '르' (two jamos)
			del list_word[-1] 
			del list_word[-1]
			if list_word[-1] in neg_vowels: # dark vowel
				list_word[-1] = list_word[-1] + 'ㄹ러'
			elif list_word[-1] in pos_vowels: # bright vowel
				list_word[-1] = list_word[-1] + 'ㄹ라'
			word = "".join(list_word)
			word = join_jamos(word)
			if org_word == '들르': # special case for 들르
				word = '들러'        
			# pos -> used unchanged.
			# inf
			inf = org_inf + '1'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) # debug trace
			LOG('\n')


		# -U irregular
		if org_inf == 'VERB-U' or org_inf == 'ADJ-U': # e.g. 갈겨쓰 (bright vowel), 가냘프 (dark vowel)
			LOG(org__info)
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			if len(list_word) == 2:
				list_word[-1] = 'ㅓ' # one-syllable stems (끄,뜨,크,트) are all treated as dark-vowel
			else:
				del list_word[-1] # drop the 'ㅡ' of the final syllable 
				posneg_vowel = list_word[-3] + list_word[-2] # inspect the previous two jamos; [-2] alone may not be the vowel 
				neg_vowel = list(set(posneg_vowel).intersection(set(neg_vowels)))
				pos_vowel = list(set(posneg_vowel).intersection(set(pos_vowels)))
				if len(neg_vowel) != 0: 
					list_word[-1] = list_word[-1] + 'ㅓ' # dark vowel
				else: 
					list_word[-1] = list_word[-1] + 'ㅏ' # bright vowel
			word = "".join(list_word)
			word = join_jamos(word)    
			# pos -> used unchanged.
			# inf
			inf = org_inf + '1'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) # debug trace
			LOG('\n')


		# -LE irregular
		if org_inf == 'VERB-LE' or org_inf == 'ADJ-LE': # (러-irregular) e.g. 이르
			LOG(org__info)
			### VERB-LE1 # e.g. 이르러 (original header said VERB-S1 -- copy/paste slip)
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			list_word += '러'
			word = "".join(list_word)
			word = join_jamos(word)
			# pos -> used unchanged.
			# inf
			inf = org_inf + '1'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) 
			LOG('\n')


		if org_inf == 'VERB-WU': # e.g. 푸
			LOG(org__info)
			### VERB-WU1 # e.g. 퍼
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			list_word[-1] = 'ㅓ'
			word = "".join(list_word)
			word = join_jamos(word)
			# pos -> used unchanged.
			# inf
			inf = org_inf + '1'
			# anly
			anly = org_word+'/'+pos
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info)
			LOG(word +' ---> '+ _info) 
			LOG('\n')


		if org_inf == 'ADJ-H': # e.g. 이렇 / 하얗 / 빨갛
			LOG(org__info)
			### ADJ-H1 # e.g. 이러 / 빨가
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			del list_word[-1] # drop the final 'ㅎ'
			if not (list_word[-1] == 'ㅑ' or list_word[-1] == 'ㅕ'):
				word = "".join(list_word)
				word = join_jamos(word)
				# pos -> used unchanged.
				# inf
				inf = org_inf + '1'
				# anly
				anly = org_word+'/'+pos
				_info = pos+' '+inf+' '+anly
				trie.insert(word, _info)
				LOG(word +' ---> '+ _info)  
			### ADJ-H2 # e.g. 이래 / 빨개
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			del list_word[-1] # drop the final 'ㅎ'
			if not (list_word[-1] == 'ㅑ' or list_word[-1] == 'ㅕ'):
				list_word[-1] = 'ㅐ'
			else:
				if list_word[-1] == 'ㅑ':
					list_word[-1] = 'ㅒ'
				else: # final vowel is 'ㅕ'
					list_word[-1] = 'ㅖ'
			word = "".join(list_word)
			word = join_jamos(word)
			# pos -> used unchanged.
			# inf
			inf = org_inf + '2'
			# anly
			anly = org_word+'/'+pos # _info records the original (base) word
			_info = pos+' '+inf+' '+anly
			trie.insert(word, _info) # the trie key is the modified surface form
			LOG(word +' ---> '+ _info) 
			LOG('\n')


		if org_inf == 'VERB-REG' or org_inf == 'ADJ-REG': # e.g. ㅗ, ㅜ, ㅣ, ㅚ
			### VERB-REG4 # e.g. 나오 -> 나와 
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			if list_word[-1] == 'ㅗ':
				LOG(org__info)
				list_word[-1] = 'ㅘ'
				word = "".join(list_word)
				word = join_jamos(word)
				# pos -> used unchanged.
				# inf
				inf = org_inf + '4'
				# anly
				anly = org_word+'/'+pos
				_info = pos+' '+inf+' '+anly
				trie.insert(word, _info)
				LOG(word +' ---> '+ _info) # debug trace
				LOG('\n')
			### VERB-REG4 # e.g. 세우 -> 세워 
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			if list_word[-1] == 'ㅜ':
				LOG(org__info)
				list_word[-1] = 'ㅝ'
				word = "".join(list_word)
				word = join_jamos(word)
				# pos -> used unchanged.
				# inf
				inf = org_inf + '4'
				# anly
				anly = org_word+'/'+pos
				_info = pos+' '+inf+' '+anly
				trie.insert(word, _info)
				LOG(word +' ---> '+ _info) # debug trace
				LOG('\n')
			### VERB-REG4 # e.g. 옮기 -> 옮겨 
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			if list_word[-1] == 'ㅣ':
				LOG(org__info)
				list_word[-1] = 'ㅕ'
				word = "".join(list_word)
				word = join_jamos(word)
				# pos -> used unchanged.
				# inf
				inf = org_inf + '4'
				# anly
				anly = org_word+'/'+pos
				_info = pos+' '+inf+' '+anly
				trie.insert(word, _info)
				LOG(word +' ---> '+ _info) # debug trace
				LOG('\n')
			### VERB-REG4 # e.g. 사람되 -> 사람돼 
			# word
			word = split_syllables(org_word)
			list_word = list(word)
			if list_word[-1] == 'ㅚ':
				LOG(org__info)
				list_word[-1] = 'ㅙ'
				word = "".join(list_word)
				word = join_jamos(word)
				# pos -> used unchanged.
				# inf
				inf = org_inf + '4'
				# anly
				anly = org_word+'/'+pos
				_info = pos+' '+inf+' '+anly
				trie.insert(word, _info)
				LOG(word +' ---> '+ _info) # debug trace
				LOG('\n')
				

	return trie
from gensim.models.fasttext import FastText
from hangul_utils import split_syllables, join_jamos
import numpy as np
import pickle

# Load the preprocessed novel corpus (a list of tokenized sentences).
with open('./konovel_preprocessed.pkl', 'rb') as f:
    texts = pickle.load(f)

# Jamo decomposition round-trip example
jamo = split_syllables('안녕하세요')
word = join_jamos(jamo)
print(jamo)
print(word)
# Decompose every word of every sentence into jamos for jamo-level FastText.
result = []
tmp = []

for sentence in texts:
    for word in sentence:
        texts_jamo = split_syllables(word)
        tmp.append(texts_jamo)
    result.append(tmp)
    tmp = []
print(result[:1])
# NOTE(review): FastText(size=..., iter=...) is the gensim 3.x API, but
# model.wv.key_to_index below exists only in gensim 4.x (which renamed
# these to vector_size/epochs) -- one of the two needs updating; confirm
# the installed gensim version.
model = FastText(size=100, window=5, min_count=10, sentences=result, 
                 iter=200, bucket=2000000, min_n=3, max_n=3, sg=1, negative=2, 
                 sample=1e-5, max_vocab_size=10000)

dic = model.wv.key_to_index
dic['ㅂㅏㄷㅏ']
search = split_syllables('바다')
search
Exemple #22
0
    def parse_dataset(self, filepath):
        """Parse a whitespace-separated intent/slot dataset file.

        Each non-blank line is ``<token> ... <label>``: a line whose first
        field is 'ACT' carries the sentence intent, any other line carries
        the slot label for that token. Blank lines (or '-DOCSTART-') end a
        sentence.

        Returns (intent_labels, slot_labels, tokens, token_count,
        intent_label_count, slot_label_count, character_count, jaso_count,
        max_len, max_word_len, max_char_len).

        Fixes vs. the original: the codecs file handle is now closed via
        ``with`` (it leaked before), and the duplicated end-of-sentence
        bookkeeping is a single helper.
        """
        token_count = collections.defaultdict(int)
        character_count = collections.defaultdict(int)
        intent_label_count = collections.defaultdict(int)
        slot_label_count = collections.defaultdict(int)
        jaso_count = collections.defaultdict(int)
        max_len = 0
        max_word_len = 0
        max_char_len = 0
        tokens = []
        intent_labels = []
        slot_labels = []
        new_token_sequence = []
        new_intent_label_sequence = []
        new_slot_label_sequence = []

        def flush_sequence():
            # Commit the sentence accumulated so far and update the maxima.
            nonlocal max_len, max_word_len, max_char_len
            nonlocal new_token_sequence, new_intent_label_sequence, new_slot_label_sequence
            if not new_token_sequence:
                return
            max_len = max(max_len, len(new_token_sequence))
            for token in new_token_sequence:
                max_word_len = max(max_word_len, len(token))
                for char in token:
                    # jamo count bounds the character-level input width
                    max_char_len = max(max_char_len, len(split_syllables(char)))
            tokens.append(new_token_sequence)
            intent_labels.append(new_intent_label_sequence)
            slot_labels.append(new_slot_label_sequence)
            new_token_sequence = []
            new_intent_label_sequence = []
            new_slot_label_sequence = []

        with codecs.open(filepath, 'r', 'UTF-8') as f:
            for line in f:
                line = line.strip().split(' ')
                if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
                    flush_sequence()
                    continue
                if 'ACT' == line[0]:
                    intent_label = str(line[-1])
                    intent_label_count[intent_label] += 1
                    new_intent_label_sequence.append(intent_label)
                else:
                    slot_label = str(line[-1])
                    slot_label_count[slot_label] += 1
                    new_slot_label_sequence.append(slot_label)
                token = str(line[0])
                token_count[token] += 1
                new_token_sequence.append(token)

                for character in token:
                    character_count[character] += 1
                    for jaso in list(split_syllables(character)):
                        jaso_count[jaso] += 1

        # flush the final sentence if the file did not end with a separator
        flush_sequence()
        return intent_labels, slot_labels, tokens, token_count, intent_label_count, slot_label_count, character_count, jaso_count, max_len, max_word_len, max_char_len