Example #1
def tokenize(text, as_id=False, symbol_type=1, debug=False):

    j2hj, j2hcj, j2sj, j2shcj = load_symbols_1(), load_symbols_2(
    ), load_symbols_3(), load_symbols_4()

    text = normalize(text)
    pre_tokens = list(hangul_to_jamo(text))
    pre_tokens = [
        hcj_to_jamo(_, "lead") if is_hcj(_) else _ for _ in pre_tokens
    ]
    tokens = []

    if symbol_type == 1:
        if debug:
            print(char_to_id_1)
        for token in pre_tokens:
            tokens += list(j2hj[token])

        if as_id:
            return [char_to_id_1[token]
                    for token in tokens] + [char_to_id_1[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 2:
        if debug:
            print(char_to_id_2)
        for token in pre_tokens:
            tokens += list(j2hcj[token])

        if as_id:
            return [char_to_id_2[token]
                    for token in tokens] + [char_to_id_2[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 3:
        if debug:
            print(char_to_id_3)
        for token in pre_tokens:
            tokens += list(j2sj[token])

        if as_id:
            return [char_to_id_3[token]
                    for token in tokens] + [char_to_id_3[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 4:
        if debug:
            print(char_to_id_4)
        for token in pre_tokens:
            tokens += list(j2shcj[token])

        if as_id:
            return [char_to_id_4[token]
                    for token in tokens] + [char_to_id_4[EOS]]
        else:
            return [token for token in tokens] + [EOS]
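The four branches above differ only in which symbol table and id map they use; a table-driven dispatch removes the duplication. A hedged refactor sketch (normalize, EOS, the load_symbols_* helpers, and the char_to_id_* maps are the project's own and only assumed here):

from jamo import hangul_to_jamo, hcj_to_jamo, is_hcj

def tokenize(text, as_id=False, symbol_type=1):
    # Map each symbol_type to its (jamo-converter, char-to-id) pair.
    tables = {
        1: (load_symbols_1(), char_to_id_1),
        2: (load_symbols_2(), char_to_id_2),
        3: (load_symbols_3(), char_to_id_3),
        4: (load_symbols_4(), char_to_id_4),
    }
    converter, char_to_id = tables[symbol_type]  # KeyError instead of a silent None
    pre_tokens = [hcj_to_jamo(t, "lead") if is_hcj(t) else t
                  for t in hangul_to_jamo(normalize(text))]
    tokens = [sym for t in pre_tokens for sym in converter[t]]
    if as_id:
        return [char_to_id[t] for t in tokens] + [char_to_id[EOS]]
    return tokens + [EOS]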
Example #2
def tokenize(text, as_id=False):
    text = normalize(text)
    tokens = list(hangul_to_jamo(text))
    # print(tokens)
    if as_id:
        return [char_to_id[token] for token in tokens] + [char_to_id[EOS]]
    else:
        return [token for token in tokens] + [EOS]
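A minimal standalone view of the decomposition step above, using only the third-party jamo package (normalize and char_to_id are project-specific and omitted; the '~' EOS marker is assumed from the other snippets on this page):

from jamo import hangul_to_jamo

EOS = '~'  # assumed end-of-sentence symbol

tokens = list(hangul_to_jamo('안녕'))
print(tokens)          # ['ᄋ', 'ᅡ', 'ᆫ', 'ᄂ', 'ᅧ', 'ᆼ'] -- U+11xx positional jamo
print(tokens + [EOS])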
Example #3
def text_to_sequence(text):
    sequence = []
    if not 0x1100 <= ord(text[0]) <= 0x1112:  # first char not already a lead jamo (U+1100..U+1112)?
        text = ''.join(list(hangul_to_jamo(text)))
    for s in text:
        sequence.append(_symbol_to_id[s])
    sequence.append(_symbol_to_id['~'])
    return sequence
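The range check above asks whether the first character is already a U+11xx lead jamo, i.e. whether the text was pre-decomposed so hangul_to_jamo can be skipped. A small illustration (the helper name is hypothetical):

def starts_with_lead_jamo(text):
    # U+1100 (ᄀ) .. U+1112 (ᄒ) are the 19 modern lead consonants.
    return 0x1100 <= ord(text[0]) <= 0x1112

print(starts_with_lead_jamo('가'))  # True: already jamo, no decomposition needed
print(starts_with_lead_jamo('가'))   # False: precomposed syllable (U+AC00)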
Example #4
def tokenize(text, as_id=False, symbol_type=1, debug=False):

    j2hj, j2hcj, j2sj, j2shcj = load_symbols_1(), load_symbols_2(
    ), load_symbols_3(), load_symbols_4()
    # Use hangul_to_jamo from the jamo package to split the Hangul string into lead/vowel/tail jamo.

    text = normalize(text)
    pre_tokens = list(
        hangul_to_jamo(text)
    )  # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ'] (the '~' EOS is appended later)
    tokens = []
    if symbol_type == 1:
        if debug:
            print(char_to_id_1)
        for token in pre_tokens:
            tokens += list(j2hj[token])

        if as_id:
            return [char_to_id_1[token]
                    for token in tokens] + [char_to_id_1[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 2:
        if debug:
            print(char_to_id_2)
        for token in pre_tokens:
            tokens += list(j2hcj[token])

        if as_id:
            return [char_to_id_2[token]
                    for token in tokens] + [char_to_id_2[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 3:
        if debug:
            print(char_to_id_3)
        for token in pre_tokens:
            tokens += list(j2sj[token])

        if as_id:
            return [char_to_id_3[token]
                    for token in tokens] + [char_to_id_3[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 4:
        if debug:
            print(char_to_id_4)
        for token in pre_tokens:
            tokens += list(j2shcj[token])

        if as_id:
            return [char_to_id_4[token]
                    for token in tokens] + [char_to_id_4[EOS]]
        else:
            return [token for token in tokens] + [EOS]
Example #5
def tokenize(text, as_id=False):
    # Use hangul_to_jamo from the jamo package to split the Hangul string into lead/vowel/tail jamo.
    text = normalize(text)
    tokens = list(hangul_to_jamo(text))  # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ'] (the '~' EOS is appended later)

    if as_id:
        return [char_to_id[token] for token in tokens] + [char_to_id[EOS]]
    else:
        return [token for token in tokens] + [EOS]
Example #6
def tokenize(text, as_id=False):
    # Use hangul_to_jamo from the jamo package to split the Hangul string into lead/vowel/tail jamo.
    text = normalize(text)
    tokens = list(hangul_to_jamo(text))  # '안녕' --> ['ᄋ', 'ᅡ', 'ᆫ', 'ᄂ', 'ᅧ', 'ᆼ']

    if as_id:
        return [char_to_id[token] for token in tokens] + [char_to_id[EOS]]
    else:
        return [token for token in tokens] + [EOS]
Example #7
    def test_hangul_to_jamo(self):
        """hangul_to_jamo tests
        Arguments may be iterables or characters.

        hangul_to_jamo should split every Hangul character into U+11xx jamo
        for any given string. Anything else is unchanged.
        """

        test_cases = ["자",
                      "모",
                      "한",
                      "글",
                      "서",
                      "울",
                      "평",
                      "양",
                      "한굴",
                      "Do you speak 한국어?",
                      "자모=字母"]
        desired_jamo = [(chr(0x110c), chr(0x1161)),
                        (chr(0x1106), chr(0x1169)),
                        (chr(0x1112), chr(0x1161), chr(0x11ab)),
                        (chr(0x1100), chr(0x1173), chr(0x11af)),
                        (chr(0x1109), chr(0x1165)),
                        (chr(0x110b), chr(0x116e), chr(0x11af)),
                        (chr(0x1111), chr(0x1167), chr(0x11bc)),
                        (chr(0x110b), chr(0x1163), chr(0x11bc)),
                        (chr(0x1112), chr(0x1161), chr(0x11ab),
                         chr(0x1100), chr(0x116e), chr(0x11af)),
                        tuple(_ for _ in "Do you speak ") +
                        (chr(0x1112), chr(0x1161), chr(0x11ab),
                         chr(0x1100), chr(0x116e), chr(0x11a8),
                         chr(0x110b), chr(0x1165)) + ('?',),
                        (chr(0x110c), chr(0x1161), chr(0x1106), chr(0x1169),
                         "=", "字", "母")]

        for hangul, target in zip(test_cases, desired_jamo):
            trial = jamo.hangul_to_jamo(hangul)
            assert trial.__name__ == "<genexpr>",\
                ("hangul_to_jamo didn't return "
                 "an instance of a generator.")
            trial = tuple(trial)
            assert target == trial,\
                ("Converted {hangul} to {failure}, but expected "
                 "({lead}, {vowel}, "
                 "{tail}).").format(hangul=hangul,
                                    lead=hex(ord(target[0])),
                                    vowel=hex(ord(target[1])),
                                    tail=hex(ord(target[2]))
                                    if len(target) == 3 else "",
                                    failure=tuple([hex(ord(_)) for _ in
                                                  trial]))\
                if len(hangul) == 1 else\
                ("Incorrectly converted {hangul} to "
                 "{failure}.".format(hangul=hangul,
                                     failure=[hex(ord(_)) for _ in trial]))
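The tests above expect U+11xx positional jamo. The jamo package also exposes U+31xx Hangul Compatibility Jamo via j2hcj; a quick contrast of the two alphabets:

from jamo import h2j, j2hcj

print([hex(ord(c)) for c in h2j('한')])         # ['0x1112', '0x1161', '0x11ab']
print([hex(ord(c)) for c in j2hcj(h2j('한'))])  # ['0x314e', '0x314f', '0x3134']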
Example #8
from collections import Counter

from jamo import hangul_to_jamo
from tqdm import tqdm


def to_jamo(data, use_counter=False):
    if use_counter:
        analysis_result = Counter()
    else:
        analysis_result = set()

    for x in tqdm(data):
        analysis_result.update(hangul_to_jamo(x))

    return analysis_result
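A hypothetical call to to_jamo above: set mode collects the jamo vocabulary of a corpus, Counter mode tallies frequencies (the two-item corpus is made up):

corpus = ['안녕하세요', '한국어 텍스트']
vocab = to_jamo(corpus)                    # set of distinct jamo (plus the space)
freqs = to_jamo(corpus, use_counter=True)  # Counter of jamo frequencies
print(len(vocab), freqs.most_common(3))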
Example #9
def run_eval(args):
    synth = Synthesizer()
    synth.load(args.checkpoint)
    base_path = get_output_base_path(args.checkpoint)
    for i, text in enumerate(sentences):
        jamo = ''.join(list(hangul_to_jamo(text)))
        path = '%s-%d.wav' % (base_path, i)
        print('Synthesizing: %s' % path)
        with open(path, 'wb') as f:
            f.write(synth.synthesize(jamo, base_path, i))
Example #10
def decode(self, encoded_text):
    decoded_sent = ''
    for ch in encoded_text:
        if jamo.is_hangul_char(ch):
            # Decompose the syllable and map each jamo through the decoder table.
            decoded_sent += ''.join(
                self.decoder_dict[j] for j in jamo.hangul_to_jamo(ch))
        else:
            decoded_sent += ch
    # Expand the single-character placeholders used at encode time.
    decoded_sent = (decoded_sent.replace('x', '')
                    .replace('ŋ', 'ng')
                    .replace('ƒ', 'wh'))
    return decoded_sent
Example #11
def text_to_sequence(text):
    text_filter = "[,./!@#$%^&*()?]"
    text = re.sub(re.compile(text_filter), '', text)
    sequence = []
    if not 0x1100 <= ord(text[0]) <= 0x1112:  # first char not already a lead jamo (U+1100..U+1112)?
        text = ''.join(list(hangul_to_jamo(text)))
    for s in text:
        sequence.append(_symbol_to_id[s])
    sequence.append(_symbol_to_id['~'])  # append '~' as the sentence delimiter

    sequence = np.asarray(sequence)
    return sequence
Example #12
    def test_hangul_to_jamo(self):
        """hangul_to_jamo tests
        Arguments may be iterables or characters.

        hangul_to_jamo should split every Hangul character into U+11xx jamo
        for any given string. Anything else is unchanged.
        """

        test_cases = [
            "자", "모", "한", "글", "서", "울", "평", "양", "한굴", "Do you speak 한국어?",
            "자모=字母"
        ]
        desired_jamo = [
            (chr(0x110c), chr(0x1161)), (chr(0x1106), chr(0x1169)),
            (chr(0x1112), chr(0x1161), chr(0x11ab)),
            (chr(0x1100), chr(0x1173), chr(0x11af)), (chr(0x1109),
                                                      chr(0x1165)),
            (chr(0x110b), chr(0x116e), chr(0x11af)),
            (chr(0x1111), chr(0x1167), chr(0x11bc)),
            (chr(0x110b), chr(0x1163), chr(0x11bc)),
            (chr(0x1112), chr(0x1161), chr(0x11ab), chr(0x1100), chr(0x116e),
             chr(0x11af)),
            tuple(_ for _ in "Do you speak ") +
            (chr(0x1112), chr(0x1161), chr(0x11ab), chr(0x1100), chr(0x116e),
             chr(0x11a8), chr(0x110b), chr(0x1165)) + ('?', ),
            (chr(0x110c), chr(0x1161), chr(0x1106), chr(0x1169), "=", "字", "母")
        ]

        for hangul, target in zip(test_cases, desired_jamo):
            trial = jamo.hangul_to_jamo(hangul)
            assert trial.__name__ == "<genexpr>",\
                ("hangul_to_jamo didn't return "
                 "an instance of a generator.")
            trial = tuple(trial)
            assert target == trial,\
                ("Converted {hangul} to {failure}, but expected "
                 "({lead}, {vowel}, "
                 "{tail}).").format(hangul=hangul,
                                    lead=hex(ord(target[0])),
                                    vowel=hex(ord(target[1])),
                                    tail=hex(ord(target[2]))
                                    if len(target) == 3 else "",
                                    failure=tuple([hex(ord(_)) for _ in trial]))\
                if len(hangul) == 1 else\
                ("Incorrectly converted {hangul} to "
                 "{failure}.".format(hangul=hangul,
                                     failure=[hex(ord(_)) for _ in trial]))
Example #13
def encode(input_path, label_path):
    # Collect signals
    input = np.load(
        os.path.join(input_dir,
                     input_path.numpy().decode('utf8'))).astype('float32')
    # lower frame rate
    input = build_lfr(input)
    # instance normalization
    input = (input - input.mean()) / input.std()
    with open(os.path.join(label_dir,
                           label_path.numpy().decode('utf8')),
              'r',
              encoding='utf-8') as f_in:
        label = f_in.readline()
    if args.token_style == 'jamo':
        label = hangul_to_jamo(label)
        label = np.array([_symbol_to_id[SOS]] +
                         [_symbol_to_id[x] for x in label] +
                         [_symbol_to_id[EOS]]).astype('int32')
    else:
        label = np.array([token_index[SOS]] +
                         [token_index[x] for x in label] +
                         [token_index[EOS]]).astype('int32')
    return input, label
Example #14
def korean_to_jamo(text):
    return "".join(hangul_to_jamo(text))
Example #15
_JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)])
_JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)])
_JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)])
_VALID_JAMO = [jamo for jamo in _JAMO_LEADS + _JAMO_VOWELS + _JAMO_TAILS]

korean_symbol = [_pad] + [_special] + list(_punctuation) + [_space] + _VALID_JAMO

if __name__ == '__main__':
    print(korean_symbol)
    print(len(korean_symbol))

    symbol_to_id = {s: i for i, s in enumerate(korean_symbol)}

    text = '안녕하세요 3 분반'

    from jamo import hangul_to_jamo
    h2j = "".join(hangul_to_jamo(text))

    print([symbol_to_id[jamo] for jamo in h2j])
    print([jamo for jamo in h2j])
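An inverse-lookup sketch continuing the __main__ block above. Note that any character outside korean_symbol would raise KeyError (the digit '3' in the sample text survives only if _punctuation happens to cover it), so a jamo-only string is used here:

    id_to_symbol = {i: s for s, i in symbol_to_id.items()}
    h2j_clean = "".join(hangul_to_jamo('안녕하세요'))
    ids = [symbol_to_id[j] for j in h2j_clean]
    print("".join(id_to_symbol[i] for i in ids) == h2j_clean)  # True: ids round-trip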
Example #16
                    self.model.enc_input: [np.asarray(seq, dtype=np.int32)],
                    self.model.sequence_length:
                    np.asarray([len(seq)], dtype=np.int32),
                    self.model.dec_input:
                    dec_input
                })
            if i < 200:
                dec_input[:, i, :] = mel_out[5 * i - 1, :]
            pred.extend(mel_out[5 * (i - 1):5 * i, :])

        np.save(os.path.join(args.save_dir, 'mel-{}'.format(idx)),
                pred,
                allow_pickle=False)

        input_seq = sequence_to_text(seq)
        alignment_dir = os.path.join(args.save_dir, 'align-{}.png'.format(idx))
        plot_alignment(alignment, alignment_dir, input_seq)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--step', required=True)
    parser.add_argument('--save_dir', default='./output')
    args = parser.parse_args()
    os.makedirs(args.save_dir, exist_ok=True)
    synth = Synthesizer()
    synth.load(args.step)
    for i, text in enumerate(sentences):
        jamo = ''.join(list(hangul_to_jamo(text)))
        synth.synthesize(args, jamo, i)
Example #17
def tokenize(text, as_id=True, symbol_type=0, debug=False):

    j2hj, j2hcj, j2sj, j2shcj = load_symbols_1(), load_symbols_2(
    ), load_symbols_3(), load_symbols_4()

    text = normalize(text)
    tokens = []

    if symbol_type:
        # symbol_type 1-4: decompose the Hangul into U+11xx jamo
        pre_tokens = list(hangul_to_jamo(text))
        pre_tokens = [
            hcj_to_jamo(_, "lead") if is_hcj(_) else _ for _ in pre_tokens
        ]
    else:
        # symbol_type 0: KoG2P grapheme-to-phoneme output instead of jamo
        pre_tokens = runKoG2PTest(text, './text/rulebook.txt').split(' ')
        if as_id:
            return [char_to_id_1[pre]
                    for pre in pre_tokens] + [char_to_id_1[EOS]]
        else:
            return [pre for pre in pre_tokens] + [EOS]

    if symbol_type == 1:
        if debug:
            print(char_to_id_1)
        for token in pre_tokens:
            tokens += list(j2hj[token])

        if as_id:
            return [char_to_id_1[token]
                    for token in tokens] + [char_to_id_1[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 2:
        if debug:
            print(char_to_id_2)
        for token in pre_tokens:
            tokens += list(j2hcj[token])

        if as_id:
            return [char_to_id_2[token]
                    for token in tokens] + [char_to_id_2[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 3:
        if debug:
            print(char_to_id_3)
        for token in pre_tokens:
            tokens += list(j2sj[token])

        if as_id:
            return [char_to_id_3[token]
                    for token in tokens] + [char_to_id_3[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 4:
        if debug:
            print(char_to_id_4)
        for token in pre_tokens:
            tokens += list(j2shcj[token])

        if as_id:
            return [char_to_id_4[token]
                    for token in tokens] + [char_to_id_4[EOS]]
        else:
            return [token for token in tokens] + [EOS]
Example #18
def tokenize(text):
    text = normalize(text)
    tokens = list(hangul_to_jamo(text))

    return tokens + ['~']
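Assuming normalize leaves plain Hangul unchanged, a hedged call:

print(tokenize('안녕'))  # ['ᄋ', 'ᅡ', 'ᆫ', 'ᄂ', 'ᅧ', 'ᆼ', '~']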
Example #19
        print(text)
        print(normalize(text))
        print("=" * 30)

    # test_normalize("제 전화번호는 01012345678이에요.")
    # test_normalize("60 대 30으로")
    # test_normalize("2020년 월드컵에서는 한국74이 4강")
    # test_normalize("3개월 전에 골프를 치다가")
    # test_normalize("1025호실 환자")
    # test_normalize("2013년에는 작은 아파트에 대한")
    # test_normalize("국어 시험에서 80점을 받았어요.")
    # test_normalize('근처에 24시간 여는 슈퍼마켓 있나요?')
    # test_normalize('지금은 23시10분 입니다')
    test_normalize('아버지는 20살 때부터 버스를 모셨다.')
    # test_normalize("이 상자는 가로 30, 세로 50, 높이 20센티다.")
    # test_normalize("3, 6, 9 게임 아세요?")
    # test_normalize("남은 시간이 6개월이래요")
    # test_normalize("36 개월 할부")
    # test_normalize("114에 전화를 해서 번호를 알아보시지 그러세요?")
    # test_normalize("축구에서 한 팀은 11명으로 이루어진다.")
    # test_normalize("그 연극은 5월 1일부터 10월 31일까지 월요일을 제외하고 매일 공연됩니다.")
    # test_normalize("우리의 목표는 에너지 소비를 10% 줄이는 것입니다.")
    # test_normalize('5 시 36분 32초')
    # test_normalize('2 명 입니다')
    # test_normalize('3명 입니다')
    # test_normalize("mp3 파일을 홈페이지에서 다운로드 받으시기 바랍니다.")
    # test_normalize("오늘(13일) 3,600마리 강아지가")
    # test_normalize("33001명의 사람이 모였습니다")
    # test_normalize("60.3%")
    print(list(hangul_to_jamo(list(hangul_to_jamo('남은 시간이 "6개월이래요”')))))
Example #20
def text_to_idx(self, text):
    return [
        self.symbol_dic[token] for token in hangul_to_jamo(text)
        if token in self.symbol_dic
    ]
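A standalone illustration of the filtering above: tokens absent from the symbol table are silently dropped rather than raising KeyError (symbol_dic here is a toy table, not the project's real one):

from jamo import hangul_to_jamo

symbol_dic = {j: i for i, j in enumerate(hangul_to_jamo('안녕'))}
ids = [symbol_dic[t] for t in hangul_to_jamo('안녕!') if t in symbol_dic]
print(ids)  # [0, 1, 2, 3, 4, 5] -- the '!' was skipped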
Example #21
    if not is_count and kor.startswith("일") and len(kor) > 1:
        kor = kor[1:]

    if float_str is not None:
        kor += "쩜 "
        kor += re.sub(r'\d', lambda x: num_to_kor[x.group()], float_str)

    if num_str.startswith("+"):
        kor = "플러스 " + kor
    elif num_str.startswith("-"):
        kor = "마이너스 " + kor

    return kor + unit_str


if __name__ == "__main__":

    def test_normalize(text):
        print(text)
        print(normalize(text))
        print("=" * 30)

    test_normalize("어제 미술관 옆 동물원에 갔어요.")
    test_normalize("오늘(13일) 3,600마리 강아지가")
    test_normalize("60.3%")
    test_normalize('"저돌"(猪突) 입니다.')
    test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”')
    test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다")
    test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다")
    print(list(hangul_to_jamo(list(hangul_to_jamo('어제 미술관 옆 동물원에 갔어요.')))))
Example #22
import json
import csv
from jamo import hangul_to_jamo
import argparse
from utils import load_json
from text.korean import normalize

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--alignment_path', required=True)
    parser.add_argument('--remove_prefix', required=True)
    config = parser.parse_args()

    data = load_json(config.alignment_path, encoding="utf8")
    out_txt = config.alignment_path.replace('alignment.json','transcript.txt')
    f = csv.writer(open(out_txt, "w"), delimiter='|')

    for file in data:
        filename = file.replace(config.remove_prefix, '')
        text = data[file]
        norm = normalize(text)
        decomp = list(hangul_to_jamo(norm))
        
        f.writerow([filename, text, norm, ''.join(decomp), "0.0"])



Example #23
def _text_to_jaso(self, line: str) -> List[str]:
    jasos = list(jamo.hangul_to_jamo(line))
    return jasos
Example #24
        kor += "쩜 "
        kor += re.sub(r'\d', lambda x: num_to_kor[x.group()], float_str)

    if num_str.startswith("+"):
        kor = "플러스 " + kor
    elif num_str.startswith("-"):
        kor = "마이너스 " + kor

    return kor + unit_str


if __name__ == "__main__":

    def test_normalize(text):
        print(text)
        print(normalize(text))
        print("=" * 30)

    test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute")
    test_normalize("오늘(13일) 3,600마리 강아지가")
    test_normalize("60.3%")
    test_normalize('"저돌"(猪突) 입니다.')
    test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”')
    test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다")
    test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다")
    print(
        list(
            hangul_to_jamo(
                list(
                    hangul_to_jamo(
                        '비대위원장이 지난 1월 이런 말을 했습니다? “난 그냥 산돼지처럼 돌파하는 스타일이다”')))))
Example #25
def tokenize(text, as_id=False, symbol_type=1, debug=False):

    j2hj, j2hcj, j2sj, j2shcj = load_symbols_1(), load_symbols_2(), load_symbols_3(), load_symbols_4()

    text = normalize(text)
    pre_tokens = list(hangul_to_jamo(text))
    pre_tokens = [hcj_to_jamo(_, "lead") if is_hcj(_) else _ for _ in pre_tokens]
    tokens = []
    if symbol_type == 1:
        if debug:
            print(char_to_id_1)
        for token in pre_tokens:
            # Replace typographic punctuation and stray bytes that are
            # missing from the symbol table with plain spaces.
            token = token.replace('\u201d', ' ')  # right double quotation mark
            token = token.replace('\u2026', ' ')  # horizontal ellipsis
            token = token.replace('\u2018', ' ')  # left single quotation mark
            token = token.replace('\u201c', ' ')  # left double quotation mark
            token = token.replace('\u2019', ' ')  # right single quotation mark
            token = token.replace('\xe1\x84\x8b', ' ')  # UTF-8 bytes of 'ᄋ' (U+110B) mis-decoded as Latin-1
            token = token.replace('\xb7', ' ')  # middle dot
            token = token.replace('\xa0', ' ')  # non-breaking space
            tokens += list(j2hj[token])

        if as_id:
            return [char_to_id_1[token] for token in tokens] + [char_to_id_1[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 2:
        if debug:
            print(char_to_id_2)
        for token in pre_tokens:
            tokens += list(j2hcj[token])

        if as_id:
            return [char_to_id_2[token] for token in tokens] + [char_to_id_2[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 3:
        if debug:
            print(char_to_id_3)
        for token in pre_tokens:
            tokens += list(j2sj[token])

        if as_id:
            return [char_to_id_3[token] for token in tokens] + [char_to_id_3[EOS]]
        else:
            return [token for token in tokens] + [EOS]

    elif symbol_type == 4:
        if debug:
            print(char_to_id_4)
        for token in pre_tokens:
            tokens += list(j2shcj[token])

        if as_id:
            return [char_to_id_4[token] for token in tokens] + [char_to_id_4[EOS]]
        else:
            return [token for token in tokens] + [EOS]
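The per-character replace chain in branch 1 above can be collapsed with str.translate; a hedged sketch whose character set mirrors those replaces (multi-character artifacts such as the mis-decoded '\xe1\x84\x8b' still need str.replace):

CLEANUP = str.maketrans({c: ' ' for c in '\u201d\u2026\u2018\u201c\u2019\xb7\xa0'})
print('“난 그냥 산돼지처럼 돌파하는 스타일이다…”'.translate(CLEANUP))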