Ejemplo n.º 1
0
    def jamo(self, sentences):
        """Split each sentence into individual jamo characters.

        Non-Korean symbols are stripped via the self.korean regex before the
        character list is built.

        Args:
            sentences: iterable of sentence strings.

        Returns:
            list of per-sentence character lists.
        """
        return [
            list(self.korean.sub('', jamotools.split_syllables(sentence)))
            for sentence in sentences
        ]
Ejemplo n.º 2
0
def load_volume(PATH, vocabulary):
    """Read a UTF-8 text file, split it into jamos, and index each jamo.

    Jamos unknown to the vocabulary (char2index returns -1) are skipped.

    Args:
        PATH: path of the text file to read.
        vocabulary: object exposing char2index(char) -> int, -1 for unknown.

    Returns:
        np.ndarray of integer indexes for every known jamo, in order.
    """
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(PATH, "r", encoding='UTF-8') as f:
        text = f.read()
    split = jamotools.split_syllables(text)
    # Keep only jamos the vocabulary knows about.
    indexes = [c for c in (vocabulary.char2index(ch) for ch in split)
               if c != -1]
    return np.array(indexes)
Ejemplo n.º 3
0
def wordedits(word):
    """Generate all dictionary-validated single edits of a Korean word.

    Splits the word into jamo letters, builds every single-edit candidate
    (inserts, removals, swaps, replaces), rejoins each candidate's jamos into
    composed syllables, and keeps only candidates accepted by
    dictionarycomparer.

    Args:
        word: the Korean word to edit.

    Returns:
        list of dictionary-validated edit candidates.
    """
    splitword = jamotools.split_syllables(word)
    candidates = list(
        inserts(splitword) + removals(splitword) + swaps(splitword) +
        replaces(splitword))
    # Recompose each jamo-level candidate back into Hangul syllables.
    joined = [jamotools.join_jamos(w) for w in candidates]
    # NOTE(review): removed a leftover debug print of the joined candidates.
    return dictionarycomparer(joined)
Ejemplo n.º 4
0
 def _preprocess_korean(self, sent, null='ⅇ'):
     """Turn a Korean sentence into a padded (1, hp.max_N) index matrix.

     Lowercases, strips everything except Hangul syllables, whitespace and
     basic punctuation, splits syllables into jamos (replacing a leading
     'positionless' nieung with the null marker), maps spaces to '▁', and
     looks each token up in self.char2idx.
     """
     sent = re.sub(r'[^가-힣\s\.\,\?\!]', '', sent.lower())
     seq = []
     for ch in sent:
         if not re.match(r'[가-힣]', ch):
             # Non-Hangul characters pass through; space becomes '▁'.
             seq.append('▁' if ch == ' ' else ch)
             continue
         jamos = list(jamotools.split_syllables(ch))
         # The 'positionless' nieung is replaced by the null marker.
         if jamos[0] == 'ㅇ':
             jamos = [null] + jamos[1:]
         seq += jamos
     texts = np.zeros((1, hp.max_N), np.int32)
     texts[0, :len(seq)] = [self.char2idx[c] for c in seq]
     return texts
Ejemplo n.º 5
0
def txtfile2npyfile(input_path, output_path, vocabulary):
    """Convert every text file under input_path into a .npy index array.

    Each file is split into jamos; jamos with a positive vocabulary index
    are collected and saved as wiki_korean_NNNN.npy under output_path.

    Args:
        input_path: root of a directory tree containing UTF-8 text files.
        output_path: directory where the .npy files are written.
        vocabulary: object exposing char2index(char) -> int.
    """
    out_dir = os.path.abspath(output_path)
    file_counter = 1
    for root, dirs, files in os.walk(input_path):
        # BUG FIX: the original joined abspath(input_path) with `root`, but
        # `root` from os.walk already contains input_path, duplicating the
        # path component for any subdirectory.
        rootpath = os.path.abspath(root)
        for name in files:
            int_array = []
            filepath = os.path.join(rootpath, name)
            # 'with' closes the handle (the original leaked one per file).
            with open(filepath, encoding="UTF-8") as f:
                for line in f:
                    for c in jamotools.split_syllables(line):
                        index = vocabulary.char2index(c)
                        if index > 0:
                            int_array.append(index)
            file_name = "wiki_korean_{0:04}".format(file_counter) + ".npy"
            save_path = os.path.join(out_dir, file_name)
            np.save(save_path, np.array(int_array))
            print("txtfile2npyfile convert to : {0}".format(file_name)+" done ")
            file_counter = file_counter + 1
Ejemplo n.º 6
0
def testmodel2(epoch, logs):
    """Training callback: every 5th epoch (and epoch 99) sample the model.

    Seeds generation with the first 48 characters of train_text split into
    jamos, predicts 300 further jamo tokens one at a time, then prints the
    recomposed Hangul text.
    """
    # Only sample on multiples of 5, or on the final epoch 99.
    if epoch % 5 != 0 and epoch != 99:
        return

    seed = jamotools.split_syllables(train_text[:48])

    for _ in range(300):
        window = seed[-seq_length:]
        # Unknown characters fall back to the 'UNK' index.
        encoded = np.array([
            char2idx[c] if c in char2idx else char2idx['UNK']
            for c in window
        ])
        padded = pad_sequences([encoded],
                               maxlen=seq_length,
                               padding='pre',
                               value=char2idx['UNK'])
        prediction = model.predict_classes(padded)
        seed += idx2char[prediction[0]]

    print()
    print(jamotools.join_jamos(seed))
    print()
Ejemplo n.º 7
0
 def test_split_syllables(self, input, output, jamo_type):
     """Assert split_syllables matches the hex-encoded expected output."""
     expected = ''.join(_hex_string_to_str(h) for h in output)
     self.assertEqual(jamotools.split_syllables(input, jamo_type=jamo_type),
                      expected)
Ejemplo n.º 8
0
 def jamochar(self, char):
     """Split a character into jamos, dropping non-Korean symbols."""
     return self.korean.sub('', jamotools.split_syllables(char))
Ejemplo n.º 9
0
def text2encoding(text):
    """Return the encoding of each JAMO-split character of *text*."""
    jamos = jamotools.split_syllables(text, jamo_type="JAMO")
    result = []
    for ch in jamos:
        result.append(encoding_dict[ch])
    return result
Ejemplo n.º 10
0
def ota_translater(word):
    """Spell-correct *word* at the jamo level and recompose its syllables."""
    corrected = correct(jamotools.split_syllables(word))
    return jamotools.join_jamos(corrected)
Ejemplo n.º 11
0
# Jamo split/join sanity check, then build a jamo-level vocabulary.
import jamotools
import tensorflow as tf
import numpy as np

path_to_file = tf.keras.utils.get_file(
    'toji.txt',
    'https://raw.githubusercontent.com/pykwon/etc/master/rnn_test_toji.txt')
#path_to_file = 'silrok.txt'
# 'with' guarantees the handle is closed (the original leaked it).
with open(path_to_file, 'rb') as f:
    train_text = f.read().decode(encoding='utf-8')
s = train_text[:100]
print(s)

# Split Korean text into jamo units; non-Hangul (e.g. Hanja) is unaffected.
s_split = jamotools.split_syllables(s)  # 100 characters split into jamos
print(s_split)

# Join test: recomposing the jamos must reproduce the original text.
s2 = jamotools.join_jamos(s_split)
print(s2)  # recomposed result
print(s == s2)  # True: text is identical before and after the round trip

# Jamo tokenization: split the whole corpus (this takes a while).
train_text_X = jamotools.split_syllables(train_text)
vocab = sorted(set(train_text_X))
vocab.append('UNK')  # 'UNK' covers symbols not present in the vocabulary
print('{} unique characters'.format(len(vocab)))  # 179 unique characters

# Map vocab entries to integer ids (the reverse map uses the list itself).
char2idx = {u: i for i, u in enumerate(vocab)}
Ejemplo n.º 12
0
import jamotools

region_words = [
    '일월', '이월', '삼월', '사월', '오월', '육월', '칠월', '팔월', '구월', '십월', '십일월', '십이월'
]
# '이일', '삼일', '사일', '오일', '육일', '칠일', '팔일', '구일', '십일', '십일일', '일일', ]

# Write each month word as one jamo-split line. 'with' closes the file even
# on error, and pinning encoding='utf-8' keeps the Korean output independent
# of the platform default encoding (the original used neither).
with open("date_words.txt", 'w', encoding='utf-8') as f:
    for v in region_words:
        f.write(jamotools.split_syllables(v) + '\n')