Example #1
    def _load_dataset(self):
        file_list = os.listdir(
            '/Data/FoodDetection/data/text_recognition/Korean/public_crop')

        dataset = []
        for file_name in file_list:
            img = os.path.join(
                '/Data/FoodDetection/data/text_recognition/Korean/public_crop/',
                file_name)
            # exclude vertical text (image taller than wide); disabled:
            # h, w, c = np.asarray(img).shape
            # if h > w:
            #     continue

            label = file_name.replace('.jpg', '').replace(' ', '')
            continue_flag = False

            if self.mode == 'jamo':

                label_split = j2hcj(h2j(label))
                # skip labels with special characters (e.g. a stray ㅗ)
                for char in label_split:
                    if char not in jamo_printable:
                        continue_flag = True

                if continue_flag:
                    continue

                top_tmp = []
                middle_tmp = []
                bottom_tmp = []

                for char in label:
                    decomposed = j2hcj(h2j(char))
                    # distribute lead/vowel/tail into the three buckets,
                    # padding with a space when a component is missing
                    for i, bucket in enumerate(
                            [top_tmp, middle_tmp, bottom_tmp]):
                        try:
                            bucket.append(decomposed[i])
                        except IndexError:
                            bucket.append(' ')

                dataset.append([img, top_tmp, middle_tmp, bottom_tmp])

            elif self.mode == 'syllable':
                for syllable in label:
                    if syllable not in syllable_printable:
                        continue_flag = True

                if continue_flag:
                    continue

                dataset.append([img, label])

        return dataset
Example #2
    def combine(self, verb, ending, rule):
        if not rule:
            return []

        stop, postfix, start = rule.split(",")
        stop = None if stop == "" else int(stop)
        start = None if start == "" else int(start)

        # STEP 1. Decompose verb
        verb = h2j(verb) # h: hangul syl. j: jamo

        # STEP 2. Slice 1
        verb = verb[:stop]

        # STEP 3. Merge 2 and postfix
        wordform = verb + postfix

        # STEP 4. Decompose ending
        ending = h2j(ending)
        ending = "".join(hcj_to_jamo(char, "tail") if is_hcj(char) else char for char in ending)

        # STEP 5. Slice 4
        ending = ending[start:]

        # STEP 6. Merge 3 and 5
        wordform += "|" + ending

        # STEP 7. Compose 6
        wordform = self.compose(wordform)

        return wordform
Example #3
def hangul_to_sequence(hangul_text):
    # load conversion dictionaries
    ### clean numbers and dates
    hangul_text_ = date_to_hangul(hangul_text)
    hangul_text_ = number_to_hangul(hangul_text_)
    hangul_text_ = clean_text(hangul_text_)
    ### add end-of-sentence symbol
    hangul_text_ = hangul_text_ + u"␃"  # ␃: EOS
    ### get dictionary of chars
    hangul_to_ids = _symbol_to_id
    ### decompose into jamo (h2j is idempotent, so one pass suffices)
    text = [h2j(char) for char in hangul_text_]
    hangul_text_ = chain.from_iterable(text)
    sequence = []
    try:
        ### convert jamos to ids using dictionary
        for char in hangul_text_:
            if char in symbols:
                sequence.append(hangul_to_ids[char])
            else:
                try:
                    print(char)
                    sequence.append(hangul_to_ids[symbols[hangul_symbol_hcj.index(char)]])
                except Exception as e:
                    sequence.append(hangul_to_ids['.'])
    except KeyError as e:
        raise KeyError('KeyError (at key: {}) when processing: {}'.format(e, hangul_text))
    return sequence
Example #4
async def on_message(message):
    if message.author.id in playing and message.author.id != client.user.id and message.channel.id == user[message.author.id]['channel']:
        async with message.channel.typing():
            await asyncio.sleep(random.randint(0, config['timeover']*300) / 1000)
        jamo_txt = str(jamo.j2hcj(jamo.h2j(user[message.author.id]['this'][-1])))
        if jamo_txt.startswith("ㄹ"):
            jamo_char = [user[message.author.id]['this'][-1], hangulutils.join_jamos("ㄴ"+str(jamo_txt[1:]))]
        else:
            jamo_char = [user[message.author.id]['this'][-1]]
        # the reply must start with the last character of the current
        # word (or its initial-sound-rule variant)
        if message.content[0] in jamo_char:
            if not message.content in user[message.author.id]['used']:
                if message.content in word:
                    temp = []
                    jamo_char = []
                    try:
                        jamo_txt = str(jamo.j2hcj(jamo.h2j(message.content[-1])))
                        if jamo_txt.startswith("ㄹ"):
                            jamo_char = [message.content[-1], hangulutils.join_jamos("ㅇ"+str(jamo_txt[1:]))]
                            for i in range(len(word)):
                                if word[i][0] in jamo_char:
                                    temp.append(word[i])
                        else:
                            for i in range(len(word)):
                                if word[i].startswith(message.content[-1]):
                                    temp.append(word[i])
                        user[message.author.id]['used'].append(message.content)
                        user[message.author.id]['this'] = random.choice(temp)  # randint(0, len(temp)) could index out of range
                        if message.author.id in playing:
                            await message.channel.send("`"+message.author.display_name+"`\n**"+user[message.author.id]['this']+"**")
                            user[message.author.id]['used'].append(user[message.author.id]['this'])
                            user[message.author.id]['count'] = user[message.author.id]['count'] + 1
                            await wait(user[message.author.id]['count'], message.author.id, message)
                    except Exception as ex:
                        if message.author.id in playing:
                            playing.remove(message.author.id)
                        if user[message.author.id]['count']:
                            embed = discord.Embed(title='게임승리', description=f"{message.author.display_name}\n`{str(user[message.author.id]['count'])}`")
                            await message.channel.send(embed=embed)

            else:
                await message.channel.send("이미 사용한 단어자나요 :thinking:")

    if message.content.startswith(config['prefix']+"끝말"):
        if not message.author.id in playing:
            playing.append(message.author.id)
            user[message.author.id] = {}
            user[message.author.id]['used'] = []
            user[message.author.id]['this'] = random.choice(word)  # avoids the off-by-one in randint(0, len(word))
            await message.channel.send("`"+message.author.display_name+"`\n**"+user[message.author.id]['this']+"**")
            user[message.author.id]['used'].append(user[message.author.id]['this'])
            user[message.author.id]['channel'] = message.channel.id
            user[message.author.id]['count'] = 0
            user[message.author.id]['status'] = 0
            await wait(user[message.author.id]['count'], message.author.id, message)

        else:
            await message.channel.send("이미 게임중이잖아요!\n뭐하는거시에오 ㅇ0ㅇㅠㅠㅠ")
Example #5
def MypartFunction(request):
    startword = request.GET['startword']
    user_log = request.session.get('user')
    user_id = request.session.get('user_id')

    user = Tuser.objects.get(user_id=user_id)
    ureview2 = Treview.objects.filter(treviewid=user_id)
    print(ureview2)
    ureview = []
    for i in ureview2:
        if j2hcj(h2j(i.tourid.tourname))[0] == startword:
            print(j2hcj(h2j(i.tourid.tourname))[0])
            ureview.append(i)
        if startword == "*":
            if j2hcj(h2j(i.tourid.tourname))[0] not in [
                    'ㄱ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ',
                    'ㅍ', 'ㅎ', 'ㄲ', 'ㄸ', 'ㅆ', 'ㅉ', 'ㅃ'
            ]:
                print(j2hcj(h2j(i.tourid.tourname)))
                ureview.append(i)
    print(ureview)

    paginator = Paginator(ureview, 20)
    page = request.GET.get('page')

    try:
        data = paginator.page(page)
    except PageNotAnInteger:
        data = paginator.page(1)
    except EmptyPage:
        data = paginator.page(paginator.num_pages)  # fall back to the last page
    print(data)
    ## page numbers for rendering individual page links
    allpage = range(paginator.num_pages + 1)

    urls = []
    for ur in ureview:
        urdic = {
            'tourid': ur.tourid.tourid,
            'tourname': ur.tourid.tourname,
            'area': ur.tourid.city + " " + ur.tourid.town,
            'rating': ur.rating
        }
        urls.append(urdic)

    context = {
        'data': data,
        'allpage': allpage,
        'w': startword,
        'user': user,
        'user_log': user_log,
        'user_id': user_id
    }
    return render(request, 'mypartreview.html', context)
Example #6
def load_data(mode="train"):
    '''Loads data
      Args:
          mode: "train" or "synthesize".
    '''
    # Load vocabulary
    char2idx, idx2char = load_vocab()

    # load conversion dictionaries
    j2hcj, j2sj, j2shcj = load_j2hcj(), load_j2sj(), load_j2shcj()

    # Parse
    fpaths, text_lengths, texts = [], [], []
    transcript = os.path.join(hp.data, 'jss.v1.0.txt')
    lines = codecs.open(transcript, 'r', 'utf-8').readlines()
    if mode == "train":
        lines = lines[:-100]
    else:
        lines = lines[-100:]

    for line in lines:
        fname, text = line.strip().split("|")
        fpath = os.path.join(hp.data, fname)
        fpaths.append(fpath)

        text += "␃"  # ␃: EOS
        if hp.token_type == "char":  # syllable
            text = list(text)
        else:
            text = [h2j(char) for char in text]
            text = chain.from_iterable(text)
            if hp.token_type == "j":  # jamo
                text = [h2j(char) for char in text]
            elif hp.token_type == "sj":  # single jamo
                text = [j2sj.get(j, j) for j in text]
            elif hp.token_type == "hcj":  # hangul compatibility jamo
                text = [j2hcj.get(j, j) for j in text]
            elif hp.token_type == "shcj":  # single hangul compatibility jamo
                text = [j2shcj.get(j, j) for j in text]
        text = chain.from_iterable(text)

        text = [char2idx[char] for char in text if char in char2idx]
        text_lengths.append(len(text))
        if mode == "train":
            texts.append(np.array(text, np.int32).tobytes())  # tostring() is a deprecated alias
        else:
            texts.append(text + [0] * (hp.max_N - len(text)))

    return fpaths, text_lengths, texts
Example #7
def jamo_to_korean(text):
    text = h2j(text)

    idx = 0
    new_text = ""
    candidates = []

    while True:
        if idx >= len(text):
            new_text += _get_text_from_candidates(candidates)
            break

        char = text[idx]
        mode = get_mode(char)

        if mode == 0:
            new_text += _get_text_from_candidates(candidates)
            candidates = [char]
        elif mode == -1:
            new_text += _get_text_from_candidates(candidates)
            new_text += char
            candidates = []
        else:
            candidates.append(char)

        idx += 1
    return new_text
Example #8
def con_menu(post):
    # build nng_list with every common noun (NNG) in the post
    nng_list = []
    # the full noun tag set would be: nn = ['NNG','NNP','NNB','NP']
    for word in api.analyze(post):
        for morph in word.morphs:
            if morph.tag == 'NNG':
                nng_list.append(morph.lex)

    # return 1 once nng_list contains 3+ words from the dessert-menu
    # dictionary; return 0 if the list is exhausted first
    count = 0
    nng_count = 0
    while True:
        if nng_count >= 3:
            return 1
        elif count >= len(nng_list):
            return 0

        nng_name = nng_list[count]
        nng_first = j2hcj(h2j(nng_name))[0]
        if nng_name in menu_dic[nng_first]:
            nng_count += 1
        count += 1
Example #9
def create_phoneme_dictionary(source_path):
	grapheme_dict, phoneme_dict = {}, {}
	for lab_file in tqdm(glob(get_path(source_path, "**", "*.lab"))):
		sentence = read_file(lab_file)
		word_list = sentence.split(" ")
		grapheme_list = h2j(sentence).split(" ")
		phoneme_list = h2j(g2p(sentence)).split(" ")

		for idx, word in enumerate(word_list):
			if word not in grapheme_dict:
				grapheme_dict[word] = " ".join(grapheme_list[idx])

			if word not in phoneme_dict:
				phoneme_dict[word] = " ".join(phoneme_list[idx])

	return grapheme_dict, phoneme_dict
Example #10
def con_verb(post):
    # build verb_list with every verb/adjective (VV, VA, VX) in the post
    verb_list = []
    need_mm = ['VV', 'VA', 'VX']
    for word in api.analyze(post):
        for morph in word.morphs:
            if morph.tag in need_mm:
                verb_list.append(morph.lex)

    # return 1 once verb_list contains 2+ words from the verb dictionary;
    # return 0 if the list is exhausted first
    count = 0
    verb_count = 0
    while True:
        if verb_count >= 2:
            return 1
        elif count >= len(verb_list):
            return 0

        verb = verb_list[count]
        verb_first = j2hcj(h2j(verb))[0]
        if verb in verb_dic[verb_first]:
            verb_count += 1
        count += 1
Example #11
def plot(alignment, info, text):
    char_len, audio_len = alignment.shape # 145, 200

    fig, ax = plt.subplots(figsize=(char_len/5, 5))
    im = ax.imshow(
            alignment.T,
            aspect='auto',
            origin='lower',
            interpolation='none')

    xlabel = 'Encoder timestep'
    ylabel = 'Decoder timestep'

    if info is not None:
        xlabel += '\n{}'.format(info)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    if text:
        jamo_text = j2hcj(h2j(normalize(text)))
        pad = [PAD] * (char_len - len(jamo_text) - 1)

        plt.xticks(range(char_len),
                [tok for tok in jamo_text] + [EOS] + pad)

    if text is not None:
        while True:
            if text[-1] in [EOS, PAD]:
                text = text[:-1]
            else:
                break
        plt.title(text)

    plt.tight_layout()
Example #12
    def page_text_finder(self, report_text):
        page_text = ''
        text = ''
        found = False

        company_name = self.file_nm.split('_')[3]
        company_num = self.file_nm.split('_')[4][1:]

        company_dict = {'LG상사': 'LG 상사'}

        # To resolve hangul encoding issue
        company_name = hangul.join_jamos(j2hcj(h2j(company_name)))

        if company_name in company_dict:
            company_name = company_dict[company_name]

        for line in report_text.split('\n'):
            if "page_id" in line and '||Title||  ' + company_name in text and company_num in text:
                page_text = text
                found = True
                break

            elif "page_id" in line:
                text = ''
            else:
                text += line + '\n'

        return page_text, found, company_name, company_num
Example #13
def get_prefix_list(word, prefix_length):
    prefix_list = list()
    word = word[:prefix_length]
    alphabets = j2hcj(h2j(word))
    for i in range(0, len(alphabets)):
        prefix_list.append(alphabets[:i + 1])
    return prefix_list
Example #14
def find_complement(input_string):  # (for '되다', complement particles are not detected yet)
    temp_string = input_string
    complementArr = []
    N_cnt = 0
    for i in range(len(temp_string)):
        if temp_string[i][1].find('JKC') != -1:  # find a complement particle (JKC) in the morpheme analysis
            N_cnt = 0  # reset once, before the scan (was reset every iteration)
            for j in range(0, i):  # scan from the start of the sentence to the particle
                if (temp_string[j][1] == 'NNG' or temp_string[j][1] == 'NNP'
                        or temp_string[j][1] == 'NNB'
                        or temp_string[j][1] == 'NP'):
                    N_cnt = j  # remember the noun closest to the particle
            for k in range(N_cnt, i + 1):  # from that noun up to the particle
                complementArr.append(temp_string[k])  # save
        if temp_string[i][1].find('JKS') != -1:
            do_jamo = j2hcj(h2j(temp_string[i + 1][0]))
            if (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅚ') or \
                    (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅙ'):
                N_cnt = 0
                for j in range(0, i):  # scan from the start of the sentence to the particle
                    if (temp_string[j][1] == 'NNG'
                            or temp_string[j][1] == 'NNP'
                            or temp_string[j][1] == 'NNB'
                            or temp_string[j][1] == 'NP'):
                        N_cnt = j  # remember the noun closest to the particle
                for k in range(N_cnt, i + 1):  # from that noun up to the particle
                    complementArr.append(temp_string[k])  # save

    return complementArr  # a sentence may contain several complements, so return a list
Example #15
    def pack_samples(batch):
        # Return val
        b_as_char_tensor = []
        b_as_jamo_tensor = []

        for e in batch:
            e_char_seq = [
                torch.LongTensor([c2i[c] for c in tok])
                for tok in e[0].split()
            ]
            e_jamo_seq = [
                torch.LongTensor([j2i[j] for j in jamo.j2hcj(jamo.h2j(tok))])
                for tok in e[0].split()
            ]

            b_as_char_tensor.append(e_char_seq)
            b_as_jamo_tensor.append(e_jamo_seq)

        b_lens = [len(t) for t in b_as_char_tensor]

        b_ch_padded = nn.utils.rnn.pad_sequence(sum(b_as_char_tensor, []),
                                                batch_first=True)
        b_jm_padded = nn.utils.rnn.pad_sequence(sum(b_as_jamo_tensor, []),
                                                batch_first=True)

        b_as_char_tensor = [
            b_ch_padded[x - y:x] for x, y in zip(accumulate(b_lens), b_lens)
        ]
        b_as_jamo_tensor = [
            b_jm_padded[x - y:x] for x, y in zip(accumulate(b_lens), b_lens)
        ]

        b_as_char_tensor = nn.utils.rnn.pad_sequence(b_as_char_tensor,
                                                     batch_first=True)
        b_as_jamo_tensor = nn.utils.rnn.pad_sequence(b_as_jamo_tensor,
                                                     batch_first=True)

        # char and jamo views must agree (the original compared each
        # tensor's shape to itself, which is always true)
        assert b_as_char_tensor.shape[0] == b_as_jamo_tensor.shape[0]  # same batch size
        assert b_as_char_tensor.shape[1] == b_as_jamo_tensor.shape[1]  # same max token count

        if batch[0][1] is not None:
            b_scores = torch.FloatTensor([float(e[1]) for e in batch])
        else:
            b_scores = None

        if len(cuda_device) > 0:
            b_as_char_tensor = b_as_char_tensor.to(f"cuda:{cuda_device[0]}")
            b_as_jamo_tensor = b_as_jamo_tensor.to(f"cuda:{cuda_device[0]}")

            if b_scores is not None:
                b_scores = b_scores.to(f"cuda:{cuda_device[0]}")

        b_lens = torch.LongTensor(b_lens)

        return b_as_char_tensor, b_as_jamo_tensor, b_lens, b_scores
Example #16
def save_to_txt(file_nm, file_text):
    root_dir = '/Users/daniel/Desktop/test_2/after_inspec_txt/'
    path = root_dir + file_nm
    path = hangul.join_jamos(j2hcj(h2j(path)))
    print(file_nm)

    with open(path, 'w') as out_file:
        out_file.write(file_text)
Example #17
def string2jamo(string, letter=False):
    """Convert Korean string into Hangul Jamo sequence
    Args:
      letter : If true, return in Hangul compatibility Jamo.
    """
    jamos = h2j(string)
    if letter:
        return ''.join([conv_hcj(c) for c in jamos])
    return jamos
Example #18
def get_jongsung_TF(sample_word):
  sample_text_list = list(sample_word)
  last_word = sample_text_list[-1]
  last_word_jamo_list = list(j2hcj(h2j(last_word)))
  last_jamo = last_word_jamo_list[-1]
  jongsung_TF = "T"
  if last_jamo in ['ㅏ', 'ㅑ', 'ㅓ', 'ㅕ', 'ㅗ', 'ㅛ', 'ㅜ', 'ㅠ', 'ㅡ', 'ㅣ', 'ㅘ', 'ㅚ', 'ㅙ', 'ㅝ', 'ㅞ', 'ㅢ', 'ㅐ','ㅔ', 'ㅟ', 'ㅖ', 'ㅒ']: 
    jongsung_TF = "F" 
  return jongsung_TF
Example #19
def inflect(verb, ending, rule):
    if not rule:
        return []
    verb = h2j(verb)
    ending = h2j(ending)
    ending = "".join(
        hcj_to_jamo(char, "tail") if is_hcj(char) else char for char in ending)
    rules = rule[1:-1].split("/")
    forms = []
    for rule in rules:
        end, insertion, start = rule.split(",")

        end = int(end) if not end == "" else 100
        start = int(start) if not start == "" else 0
        form = verb[:end] + insertion + ending[start:]
        form = j2syl(form)
        forms.append(form)
    return forms
Example #20
def get_jongsung_TF(sentence):
    sentence = list(sentence)
    last_word = sentence[-1]
    last_word = list(j2hcj(h2j(last_word)))
    jongsung = "T"
    if last_word[-1] in ('ㅏ', 'ㅑ', 'ㅓ', 'ㅕ', 'ㅗ', 'ㅛ', 'ㅜ', 'ㅠ', 'ㅡ', 'ㅣ', 'ㅘ',
                         'ㅚ', 'ㅙ', 'ㅝ', 'ㅞ', 'ㅢ', 'ㅐ', 'ㅔ', 'ㅟ', 'ㅖ', 'ㅒ', '2',
                         '4', '5', '9'):  # 2, 4, 5, 9 are read 이, 사, 오, 구 (vowel-final)
        jongsung = "F"

    return jongsung
Example #21
def count_con_vow_num_spe(sentence):
    sentence = j2hcj(h2j(sentence))
    # print(sentence)

    # leading consonant (choseong) list
    CHOSUNG_LIST = [
        'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ',
        'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    ]
    # medial vowel (jungseong) list
    JUNGSUNG_LIST = [
        'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ',
        'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
    ]
    # trailing consonant (jongseong) list
    JONGSUNG_LIST = [
        'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ',
        'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    ]
    # digit list
    NUMBER_LIST = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    # special character list
    SPECIAL_LIST = [
        '~', '@', '#', '$', '%', '&', '*', '(', ')', '_', '-', '+', '=', '+',
        '-', '`', ';', "'", ':', '>', '<', '/'
    ]

    count_consonant = []
    count_vowel = []
    count_number = []
    count_special = []

    for word in sentence:
        if word in CHOSUNG_LIST or word in JONGSUNG_LIST:
            count_consonant.append(word)
        elif word in JUNGSUNG_LIST:
            count_vowel.append(word)
        elif word in NUMBER_LIST:
            count_number.append(word)
        elif word in SPECIAL_LIST:
            count_special.append(word)

    # check whether the sentence ends with a digit (moved out of the
    # loop so the flag is defined even for empty input)
    end_with_number_flag = 0
    if sentence and sentence[-1] in NUMBER_LIST:
        end_with_number_flag = 1

    count_consonant = len(count_consonant)
    count_vowel = len(count_vowel)
    count_number = end_with_number_flag
    count_special = len(count_special)

    return count_consonant, count_vowel, count_number, count_special
Example #22
def write_multispeaker_emotion_metadata(source_path, savepath, speaker_dict):
	"""
		save-format
			filename | transcript | transcript_jamo | transcript_phoneme | speaker_label | emotion_label
				=> LJ-Speech-styled metadata format
	"""
	contents = ""

	for lab_file in tqdm(glob(get_path(source_path, "**", "*.lab"))):

		filename = lab_file.split("/")[-1].replace("lab", "wav")
		transcript = read_file(lab_file)
		transcript_jamo = h2j(transcript)
		transcript_phoneme = h2j(g2p(transcript))
		speaker_label = speaker_dict[filename[:3]]
		emotion_label = "{:05d}".format(int(lab_file.replace(".lab", "")[-5:]) - 1)[-3]

		contents += "{}|{}|{}|{}|{}|{}\n".format(filename, transcript, transcript_jamo, transcript_phoneme, speaker_label, emotion_label)

	with open(savepath, "w", encoding='utf-8') as f:
		f.write(contents)
Example #23
def plot_alignment(alignment, path, info=None, text=None, isKorean=True):

    if text:
        tmp_alignment = alignment[:len(h2j(text)) + 2]

        plot(tmp_alignment, info, text, isKorean)
        plt.savefig(path, format='png')
    else:
        plot(alignment, info, text, isKorean)
        plt.savefig(path, format='png')

    print(" [*] Plot saved: {}".format(path))
Example #24
def create_batch_inputs_from_texts(texts):
	sequences = [text_to_sequence(text) for text in texts]

	inputs = _prepare_inputs(sequences)
	input_lengths = np.asarray([len(x) for x in inputs], dtype=np.int32)

	for idx, (seq, text) in enumerate(zip(inputs, texts)):
		recovered_text = sequence_to_text(seq, skip_eos_and_pad=True)
		if recovered_text != h2j(text):
			log(" [{}] {}".format(idx, text))
			log(" [{}] {}".format(idx, recovered_text))
			log("="*30)

	return inputs, input_lengths
Example #25
def splitOffFinalJamo(c):
    assert len(c) == 1
    assert isHangul(c)
    # important to check if there even is a tail!
    # otherwise, the following call won't work
    if jamoTail(c) > 0:
        # get the compatibility Jamo
        finalJamo = j2hcj(h2j(c)[-1])
        lead, vowel = jamoLead(c), jamoVowel(c)
        return assembleHangul(lead, vowel, 0) + finalJamo
    else:
        return c  # null final: nothing to split off
Example #26
def decomposition(sentence):
    sentence = j2hcj(h2j(sentence))
    index = []
    for item in sentence:
        if (not isHangul(item) and item.isalpha()):
            index.insert(-1, sentence.find(item))
            break
    if (len(index)):
        part1 = list(sentence[:index[0] - 1])
        part2 = sentence[index[0]:].split()
        return ''.join((part1 + part2))
    else:
        return sentence
Example #27
def plot_alignment(
        alignment, path, info=None, text=None, isKorean=True):

    if text:  # e.g. text = '대체 투입되었던 구급대원이'
        tmp_alignment = alignment[:len(h2j(text)) + 2]  # decompose to jamo to measure length <-- effectively trims padding

        plot(tmp_alignment, info, text, isKorean)
        plt.savefig(path, format='png')
    else:
        plot(alignment, info, text, isKorean)
        plt.savefig(path, format='png')

    print(" [*] Plot saved: {}".format(path))
Example #28
def read_kss_meta(path):
    # Parse
    char2idx, _ = load_vocab_tool('ko')
    meta = pd.read_table(path, sep='|', header=None)
    meta.columns = ['fpath', 'ori', 'expanded', 'decomposed', 'duration', 'en']
    fpaths, texts = [], []
    meta.expanded = 'P' + meta.expanded + 'E'
    for fpath, text in zip(meta.fpath.values, meta.expanded.values):
        t = np.array([char2idx[ch] for ch in jamo.h2j(text)])
        f = os.path.join(os.path.basename(fpath).replace('wav', 'npy'))
        texts.append(t)
        fpaths.append(f)
    return fpaths, texts, texts
Example #29
def find_tense(sentence):
    tense_table = [[
        'past',
    ], [
        'present',
    ], [
        'future',
    ]]  # table storing each sentence together with its tense
    # ____________________________
    # | past (row 0)    | sentence | ...
    # |__________________________
    # | present (row 1) | sentence | ...
    # |__________________________
    # | future (row 2)  | sentence | ...
    # |__________________________

    special_future = 0  # counter for the '것' + '이' future pattern
    is_present_flag = True  # flag for deciding present tense
    for i in range(len(sentence)):
        # future tense pattern 1: '-ㄹ 것이'
        if sentence[i][1].find('NNB') != -1 and sentence[i][0].find('것') != -1:
            do_jamo = j2hcj(h2j(sentence[i - 1][0]))  # decompose with jamo (할 -> ㅎㅏㄹ)
            if len(do_jamo) > 2 and do_jamo[2] == 'ㄹ':  # a final 'ㄹ' before '것이' marks future tense
                special_future = special_future + 1  # the NNB is '것', so increment
        if sentence[i][1].find('VCP') != -1 and sentence[i][0].find('이') != -1:
            special_future = special_future + 1  # the VCP is '이', so increment
        if special_future == 2:  # both '것' and '이' are present: future tense
            tense_table[2].append(sentence)
            is_present_flag = False
            break
        # skip honorific suffixes (시, 십, 세, 심, 실)
        if sentence[i][1].find('EP') != -1 \
                and not sentence[i][0].find('시') != -1 \
                and not sentence[i][0].find('십') != -1 \
                and not sentence[i][0].find('세') != -1 \
                and not sentence[i][0].find('실') != -1 \
                and not sentence[i][0].find('심') != -1:
            # future tense pattern 2: '겠'
            if sentence[i][0].find('겠') != -1:
                tense_table[2].append(sentence)
                is_present_flag = False
            # past tense
            else:
                tense_table[0].append(sentence)
                is_present_flag = False
            break
    # present tense
    if is_present_flag:
        tense_table[1].append(sentence)
    return tense_table
Example #30
def create_batch_inputs_from_texts(texts):
    sequences = [text_to_sequence(text) for text in texts]  # text_to_sequence is defined in text/__init__.py

    inputs = _prepare_inputs(sequences)
    input_lengths = np.asarray([len(x) for x in inputs], dtype=np.int32)  # length of each padded input

    for idx, (seq, text) in enumerate(zip(inputs, texts)):
        recovered_text = sequence_to_text(seq, skip_eos_and_pad=True)
        if recovered_text != h2j(text):
            log(" [{}] {}".format(idx, text))
            log(" [{}] {}".format(idx, recovered_text))
            log("="*30)

    return inputs, input_lengths
Example #31
def dividehangul(string):
    realletter = 0
    realtail = 0
    headcounts = defaultdict(int)
    vowelcounts = defaultdict(int)
    tailcounts = defaultdict(int)
    headfound = set()
    vowelfound = set()
    tailfound = set()

    for letter in string:
        parts = jamo.j2hcj(jamo.h2j(letter))
        if len(parts) > 2:
            head = parts[0]
            vowel = parts[1]
            tail = parts[2]
            realletter += 1  # every syllable has a lead and a vowel
            realtail += 1    # this syllable also has a tail jamo
            headfound.add(head)
            vowelfound.add(vowel)
            tailfound.add(tail)
            headcounts[head] += 1
            vowelcounts[vowel] += 1
            tailcounts[tail] += 1

        elif len(parts) > 1:
            head = parts[0]
            vowel = parts[1]
            realletter += 1
            headfound.add(head)
            vowelfound.add(vowel)
            headcounts[head] += 1
            vowelcounts[vowel] += 1

    headp = {}
    vowelp = {}
    tailp = {}

    with codecs.open('headjamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            headp[x] = headcounts[x] / realletter if realletter != 0 else 0
    with codecs.open('voweljamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            vowelp[x] = vowelcounts[x] / realletter if realletter != 0 else 0
    with codecs.open('tailjamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            tailp[x] = tailcounts[x] / realtail if realtail != 0 else 0
    return (headp, vowelp, tailp)
Example #32
    def test_h2j(self):
        """h2j tests
        Arguments may be iterables or characters.

        h2j should split every Hangul character into U+11xx jamo for any given
        string. Anything else is unchanged.
        """
        tests = ["한굴", "자모=字母"]
        targets = ["\u1112\u1161\u11ab\u1100\u116e\u11af",
                   "\u110c\u1161\u1106\u1169=字母"]  # decomposed jamo; renders like the composed input
        tests_idempotent = ["", "test123~", "ㄱㄲㄴㄷㆆㅿ"]
        targets_idempotent = tests_idempotent

        all_tests = itertools.chain(zip(tests, targets),
                                    zip(tests_idempotent, targets_idempotent))

        for test, target in all_tests:
            trial = jamo.h2j(test)
            assert trial == target,\
                ("Converted {test} to {trial}, but "
                 "expected {target}.").format(test=test,
                                              trial=trial,
                                              target=target)
Example #33
import sys
from jamo import h2j, j2hcj
from collections import Counter

string = sys.stdin.readline().strip()

divided = []
for x in j2hcj(h2j(string)):
    divided.append(x)

counts = Counter()
for letter in divided:
    counts[letter] += 1

print(counts)