Example #1
    def _load_dataset(self):
        file_list = os.listdir(
            '/Data/FoodDetection/data/text_recognition/Korean/public_crop')

        dataset = []
        for file_name in file_list:

            img = os.path.join(
                '/Data/FoodDetection/data/text_recognition/Korean/public_crop/',
                file_name)
            # Exclude vertical-text images
            #             h, w, c = np.asarray(img).shape
            #             if h > w :
            #                 continue

            label = file_name.replace('.jpg', '').replace(' ', '')
            continue_flag = False

            if self.mode == 'jamo':

                label_split = j2hcj(h2j(label))
                # Skip labels containing characters outside the printable jamo set
                for char in label_split:
                    if char not in jamo_printable:
                        continue_flag = True

                if continue_flag:
                    continue

                top_tmp = []
                middle_tmp = []
                bottom_tmp = []

                for char in label:
                    decomposed = j2hcj(h2j(char))
                    # Distribute choseong/jungseong/jongseong into the three
                    # lists; pad with ' ' when a syllable has no jongseong.
                    for i, part in enumerate(
                            [top_tmp, middle_tmp, bottom_tmp]):
                        try:
                            part.append(decomposed[i])
                        except IndexError:
                            part.append(' ')


                dataset.append([img, top_tmp, middle_tmp, bottom_tmp])

            elif self.mode == 'syllable':
                for syllable in label:
                    if syllable not in syllable_printable:
                        continue_flag = True

                if continue_flag:
                    continue

                dataset.append([img, label])

        return dataset
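For context: the top/middle/bottom split above relies on j2hcj(h2j(...)) returning two compatibility jamo for an open syllable and three when a final consonant (jongseong) is present. A minimal check, assuming only that the jamo package is installed:

from jamo import h2j, j2hcj

print(j2hcj(h2j('소')))  # 'ㅅㅗ'   -> no jongseong, so the loader pads bottom with ' '
print(j2hcj(h2j('솥')))  # 'ㅅㅗㅌ' -> jongseong present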
Example #2
async def on_message(message):
    if message.author.id in playing and message.author.id != client.user.id and message.channel.id == user[message.author.id]['channel']:
        async with message.channel.typing():
            await asyncio.sleep(random.randint(0, config['timeover']*300) / 1000)
        jamo_txt = str(jamo.j2hcj(jamo.h2j(user[message.author.id]['this'][-1])))
        if jamo_txt.startswith("ㄹ"):
            # Initial-sound rule (두음법칙): a last syllable beginning with ㄹ may also be answered with its ㄴ variant
            jamo_char = [user[message.author.id]['this'][-1], hangulutils.join_jamos("ㄴ"+str(jamo_txt[1:]))]
        else:
            jamo_char = [user[message.author.id]['this'][-1]]
        if message.content[0] in jamo_char:
            if message.content not in user[message.author.id]['used']:
                if message.content in word:
                    temp = []
                    jamo_char = []
                    try:
                        jamo_txt = str(jamo.j2hcj(jamo.h2j(message.content[-1])))
                        if jamo_txt.startswith("ㄹ"):
                            jamo_char = [message.content[-1], hangulutils.join_jamos("ㅇ"+str(jamo_txt[1:]))]
                            for i in range(len(word)):
                                if word[i][0] in jamo_char:
                                    temp.append(word[i])
                        else:
                            for i in range(len(word)):
                                if word[i].startswith(message.content[-1]):
                                    temp.append(word[i])
                        user[message.author.id]['used'].append(message.content)
                        user[message.author.id]['this'] = random.choice(temp)
                        if message.author.id in playing:
                            await message.channel.send("`"+message.author.display_name+"`\n**"+user[message.author.id]['this']+"**")
                            user[message.author.id]['used'].append(user[message.author.id]['this'])
                            user[message.author.id]['count'] = user[message.author.id]['count'] + 1
                            await wait(user[message.author.id]['count'], message.author.id, message)
                    except Exception as ex:
                        if message.author.id in playing:
                            playing.remove(message.author.id)
                        if user[message.author.id]['count']:
                            embed = discord.Embed(title='게임승리', description=f"{message.author.display_name}\n`{str(user[message.author.id]['count'])}`")
                            await message.channel.send(embed=embed)

            else:
                await message.channel.send("이미 사용한 단어자나요 :thinking:")

    if message.content.startswith(config['prefix']+"끝말"):
        if message.author.id not in playing:
            playing.append(message.author.id)
            user[message.author.id] = {}
            user[message.author.id]['used'] = []
            user[message.author.id]['this'] = random.choice(word)
            await message.channel.send("`"+message.author.display_name+"`\n**"+user[message.author.id]['this']+"**")
            user[message.author.id]['used'].append(user[message.author.id]['this'])
            user[message.author.id]['channel'] = message.channel.id
            user[message.author.id]['count'] = 0
            user[message.author.id]['status'] = 0
            await wait(user[message.author.id]['count'], message.author.id, message)

        else:
            await message.channel.send("이미 게임중이잖아요!\n뭐하는거시에오 ㅇ0ㅇㅠㅠㅠ")
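Example #2 leans on globals the snippet never defines: a discord.py client, the playing list, the per-user state dict user, the word list word, config, and the jamo/hangulutils helpers. A minimal sketch of that scaffolding (the names are the example's; the values are placeholders):

import asyncio
import random

import discord
import jamo

client = discord.Client(intents=discord.Intents.default())
playing = []                  # ids of users currently in a game
user = {}                     # per-user game state
word = ['사과', '과자']       # word dictionary (placeholder)
config = {'prefix': '!', 'timeover': 10}

@client.event
async def on_message(message):
    ...  # handler body as in the example above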
Example #3
def MypartFunction(request):
    startword = request.GET['startword']
    user_log = request.session.get('user')
    user_id = request.session.get('user_id')

    user = Tuser.objects.get(user_id=user_id)
    ureview2 = Treview.objects.filter(treviewid=user_id)
    print(ureview2)
    ureview = []
    for i in ureview2:
        #         print(j2hcj(h2j(i.tourid.tourname))[0], startword)
        if j2hcj(h2j(i.tourid.tourname))[0] == startword:
            print(j2hcj(h2j(i.tourid.tourname))[0])
            ureview.append(i)
        if startword == "*":
            if j2hcj(h2j(i.tourid.tourname))[0] not in [
                    'ㄱ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ',
                    'ㅍ', 'ㅎ', 'ㄲ', 'ㄸ', 'ㅆ', 'ㅉ', 'ㅃ'
            ]:
                print(j2hcj(h2j(i.tourid.tourname)))
                ureview.append(i)
    print(ureview)

    paginator = Paginator(ureview, 20)
    page = request.GET.get('page')

    try:
        data = paginator.page(page)
    except PageNotAnInteger:
        data = paginator.page(1)
    except EmptyPage:
        data = paginator.page(paginator.num_pages)  # num_pages = 0
    print(data)
    ## For rendering the per-page links
    allpage = range(paginator.num_pages + 1)

    #     return render(request, 'board.html', {'data':data, 'allpage':allpage})
    urls = []
    for ur in ureview:
        urdic = {
            'tourid': ur.tourid.tourid,
            'tourname': ur.tourid.tourname,
            'area': ur.tourid.city + " " + ur.tourid.town,
            'rating': ur.rating
        }
        urls.append(urdic)

    context = {
        'data': data,
        'allpage': allpage,
        'w': startword,
        'user': user,
        'user_log': user_log,
        'user_id': user_id
    }
    return render(request, 'mypartreview.html', context)
Example #4
def get_prefix_list(word, prefix_length):
    prefix_list = list()
    word = word[:prefix_length]
    alphabets = j2hcj(h2j(word))
    for i in range(0, len(alphabets)):
        prefix_list.append(alphabets[:i + 1])
    return prefix_list
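A quick sanity check of get_prefix_list (output verified against the jamo decomposition 한글 -> ㅎㅏㄴㄱㅡㄹ):

>>> get_prefix_list('한글', 2)
['ㅎ', 'ㅎㅏ', 'ㅎㅏㄴ', 'ㅎㅏㄴㄱ', 'ㅎㅏㄴㄱㅡ', 'ㅎㅏㄴㄱㅡㄹ']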
Example #5
def con_verb(post):
    # Build verb_list with every verb (VV/VA/VX) in the post
    verb_list = []
    need_mm = ['VV', 'VA', 'VX']
    for word in api.analyze(post):
        for morph in word.morphs:
            if morph.tag in need_mm:
                verb_list.append(morph.lex)

    # Return 1 if verb_list holds at least two words from the verb dictionary; 0 once the list is exhausted
    count = 0
    verb_count = 0
    while True:
        if verb_count >= 2:
            return 1
        elif count >= len(verb_list):
            return 0

        verb = verb_list[count]
        verb_first = j2hcj(h2j(verb))[0]
        if verb in verb_dic[verb_first]:
            verb_count += 1
        count += 1
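con_verb assumes an analyzer whose api.analyze(post) yields words carrying .morphs, each with .lex (surface form) and .tag (POS tag); this matches the khaiii API, although the example omits the setup. A sketch under that assumption (verb_dic must also exist, keyed by first jamo):

from khaiii import KhaiiiApi

api = KhaiiiApi()  # assumed analyzer behind api.analyze above
for word in api.analyze('디저트를 만들었다'):
    for morph in word.morphs:
        print(morph.lex, morph.tag)  # e.g. 만들 VV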
Example #6
def find_complement(input_string):  # (complement particles before '되다' are not yet detected)
    temp_string = input_string
    complementArr = []
    N_cnt = 0
    for i in range(len(temp_string)):
        if temp_string[i][1].find('JKC') != -1:  # look for a complement-case particle in the tagged result
            for j in range(0, i):  # from the start of the sentence up to the particle
                N_cnt = 0
                if (temp_string[j][1] == 'NNG' or temp_string[j][1] == 'NNP'
                        or temp_string[j][1] == 'NNB'
                        or temp_string[j][1] == 'NP'):
                    N_cnt = j  # remember the noun closest to the complement particle
            for k in range(N_cnt, i + 1):  # from that noun up to the particle
                complementArr.append(temp_string[k])  # store it
        if temp_string[i][1].find('JKS') != -1:
            do_jamo = j2hcj(h2j(temp_string[i + 1][0]))
            if (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅚ') or \
                    (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅙ'):
                for j in range(0, i):  # from the start of the sentence up to the particle
                    N_cnt = 0
                    if (temp_string[j][1] == 'NNG'
                            or temp_string[j][1] == 'NNP'
                            or temp_string[j][1] == 'NNB'
                            or temp_string[j][1] == 'NP'):
                        N_cnt = j  # remember the noun closest to the complement particle
                for k in range(N_cnt, i + 1):  # from that noun up to the particle
                    complementArr.append(temp_string[k])  # store it

    return complementArr  # a sentence can contain several complements, so return them as a list
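find_complement expects a POS-tagged sentence as a list of (surface, tag) pairs. A hypothetical input for "그가 의사가 되다", with the result the code should collect:

tagged = [('그', 'NP'), ('가', 'JKS'), ('의사', 'NNG'), ('가', 'JKC'),
          ('되', 'VV'), ('다', 'EF')]
print(find_complement(tagged))  # [('의사', 'NNG'), ('가', 'JKC')]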
Example #7
def con_menu(post):
    # Build nng_list with every common noun (NNG) in the post
    nng_list = []
    ####nn = ['NNG','NNP','NNB','NP']####
    for word in api.analyze(post):
        for morph in word.morphs:
            if morph.tag == 'NNG':
                nng_list.append(morph.lex)

    # Return 1 if nng_list holds at least three words from the dessert-menu dictionary; 0 once the list is exhausted
    count = 0
    nng_count = 0
    while True:
        if nng_count >= 3:
            return 1
        elif count >= len(nng_list):
            return 0

        nng_name = nng_list[count]
        nng_first = j2hcj(h2j(nng_name))[0]
        if nng_name in menu_dic[nng_first]:
            nng_count += 1
        count += 1
Example #8
    def page_text_finder(self, report_text):
        page_text = ''
        text = ''
        found = False

        company_name = self.file_nm.split('_')[3]
        company_num = self.file_nm.split('_')[4][1:]

        company_dict = {'LG상사': 'LG 상사'}

        # To resolve hangul encoding issue
        company_name = hangul.join_jamos(j2hcj(h2j(company_name)))

        if company_name in company_dict.keys():
            company_name = company_dict[company_name]

        for line in report_text.split('\n'):
            if "page_id" in line and '||Title||  ' + company_name in text and company_num in text:
                page_text = text
                found = True
                break

            elif "page_id" in line:
                text = ''
            else:
                text += line + '\n'

        return page_text, found, company_name, company_num
Example #9
def plot(alignment, info, text):
    char_len, audio_len = alignment.shape # 145, 200

    fig, ax = plt.subplots(figsize=(char_len/5, 5))
    im = ax.imshow(
            alignment.T,
            aspect='auto',
            origin='lower',
            interpolation='none')

    xlabel = 'Encoder timestep'
    ylabel = 'Decoder timestep'

    if info is not None:
        xlabel += '\n{}'.format(info)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    if text:
        jamo_text = j2hcj(h2j(normalize(text)))
        pad = [PAD] * (char_len - len(jamo_text) - 1)

        plt.xticks(range(char_len),
                [tok for tok in jamo_text] + [EOS] + pad)

    if text is not None:
        while True:
            if text[-1] in [EOS, PAD]:
                text = text[:-1]
            else:
                break
        plt.title(text)

    plt.tight_layout()
Example #10
    def pack_samples(batch):
        # Return val
        b_as_char_tensor = []
        b_as_jamo_tensor = []

        for e in batch:
            e_char_seq = [
                torch.LongTensor([c2i[c] for c in tok])
                for tok in e[0].split()
            ]
            e_jamo_seq = [
                torch.LongTensor([j2i[j] for j in jamo.j2hcj(jamo.h2j(tok))])
                for tok in e[0].split()
            ]

            b_as_char_tensor.append(e_char_seq)
            b_as_jamo_tensor.append(e_jamo_seq)

        b_lens = [len(t) for t in b_as_char_tensor]

        b_ch_padded = nn.utils.rnn.pad_sequence(sum(b_as_char_tensor, []),
                                                batch_first=True)
        b_jm_padded = nn.utils.rnn.pad_sequence(sum(b_as_jamo_tensor, []),
                                                batch_first=True)

        b_as_char_tensor = [
            b_ch_padded[x - y:x] for x, y in zip(accumulate(b_lens), b_lens)
        ]
        b_as_jamo_tensor = [
            b_jm_padded[x - y:x] for x, y in zip(accumulate(b_lens), b_lens)
        ]

        b_as_char_tensor = nn.utils.rnn.pad_sequence(b_as_char_tensor,
                                                     batch_first=True)
        b_as_jamo_tensor = nn.utils.rnn.pad_sequence(b_as_jamo_tensor,
                                                     batch_first=True)

        assert b_as_char_tensor.shape[0] == b_as_jamo_tensor.shape[
            0]  # Same batch size
        assert b_as_char_tensor.shape[1] == b_as_jamo_tensor.shape[
            1]  # Same max token count

        if batch[0][1] is not None:
            b_scores = torch.FloatTensor([float(e[1]) for e in batch])
        else:
            b_scores = None

        if len(cuda_device) > 0:
            b_as_char_tensor = b_as_char_tensor.to(f"cuda:{cuda_device[0]}")
            b_as_jamo_tensor = b_as_jamo_tensor.to(f"cuda:{cuda_device[0]}")

            if b_scores is not None:
                b_scores = b_scores.to(f"cuda:{cuda_device[0]}")

        b_lens = torch.LongTensor(b_lens)

        return b_as_char_tensor, b_as_jamo_tensor, b_lens, b_scores
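The pad-then-reslice trick above, shown in isolation: the variable-length token tensors are flattened across the batch, padded once, then cut back into per-example blocks. A self-contained sketch:

import torch
import torch.nn as nn
from itertools import accumulate

seqs = [[torch.tensor([1, 2]), torch.tensor([3])],  # example 0: two tokens
        [torch.tensor([4, 5, 6])]]                  # example 1: one token
lens = [len(s) for s in seqs]                       # tokens per example: [2, 1]
flat = nn.utils.rnn.pad_sequence(sum(seqs, []), batch_first=True)
per_ex = [flat[x - y:x] for x, y in zip(accumulate(lens), lens)]
print([t.shape for t in per_ex])  # [torch.Size([2, 3]), torch.Size([1, 3])]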
Example #11
def save_to_txt(file_nm, file_text):
    root_dir = '/Users/daniel/Desktop/test_2/after_inspec_txt/'
    path = root_dir + file_nm
    path = hangul.join_jamos(j2hcj(h2j(path)))
    print(file_nm)

    with open(path, 'w') as out_file:
        out_file.write(file_text)
Example #12
def normalizeToCompatJamo(s):
    out = ''
    for c in s:
        if isNonCompatibilityJamo(c):
            out += j2hcj(c)
        else:
            out += c
    assert len(s) == len(out)
    return out
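isNonCompatibilityJamo is not shown in this example. A plausible stand-in (an assumption, not the original helper) treats the U+1100-U+11FF conjoining-jamo block as non-compatibility jamo, since that is the range j2hcj maps to HCJ:

def isNonCompatibilityJamo(c):
    # assumed helper: conjoining jamo occupy U+1100..U+11FF
    return '\u1100' <= c <= '\u11FF'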
Example #13
def get_jongsung_TF(sample_word):
    sample_text_list = list(sample_word)
    last_word = sample_text_list[-1]
    last_word_jamo_list = list(j2hcj(h2j(last_word)))
    last_jamo = last_word_jamo_list[-1]
    jongsung_TF = "T"
    if last_jamo in ['ㅏ', 'ㅑ', 'ㅓ', 'ㅕ', 'ㅗ', 'ㅛ', 'ㅜ', 'ㅠ', 'ㅡ', 'ㅣ', 'ㅘ', 'ㅚ', 'ㅙ', 'ㅝ', 'ㅞ', 'ㅢ', 'ㅐ', 'ㅔ', 'ㅟ', 'ㅖ', 'ㅒ']:
        jongsung_TF = "F"
    return jongsung_TF
Example #14
def get_jongsung_TF(sentence):
    sentence = list(sentence)
    last_word = sentence[-1]
    last_word = list(j2hcj(h2j(last_word)))
    jongsung = "T"
    # The digits 2, 4, 5, 9 are read 이, 사, 오, 구, which end in vowels
    if last_word[-1] in ('ㅏ', 'ㅑ', 'ㅓ', 'ㅕ', 'ㅗ', 'ㅛ', 'ㅜ', 'ㅠ', 'ㅡ', 'ㅣ', 'ㅘ',
                         'ㅚ', 'ㅙ', 'ㅝ', 'ㅞ', 'ㅢ', 'ㅐ', 'ㅔ', 'ㅟ', 'ㅖ', 'ㅒ', '2',
                         '4', '5', '9'):
        jongsung = "F"

    return jongsung
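A typical use of either jongsung check (a sketch, not from these projects): pick the correct particle depending on whether the last syllable is closed.

word = '한글'  # 글 -> ㄱㅡㄹ, ends in a consonant
particle = '은' if get_jongsung_TF(word) == 'T' else '는'
print(word + particle)  # 한글은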
Example #15
def count_con_vow_num_spe(sentence):
    sentence = j2hcj(h2j(sentence))
    # print(sentence)

    # Leading consonant (choseong) list
    CHOSUNG_LIST = [
        'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ',
        'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    ]
    # Vowel (jungseong) list
    JUNGSUNG_LIST = [
        'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ',
        'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
    ]
    # Final consonant (jongseong) list
    JONGSUNG_LIST = [
        'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ',
        'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    ]
    # Digit list
    NUMBER_LIST = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    # Special character list
    SPECIAL_LIST = [
        '~', '@', '#', '$', '%', '&', '*', '(', ')', '_', '-', '+', '=',
        '`', ';', "'", ':', '>', '<', '/'
    ]

    count_consonant = []
    count_vowel = []
    count_number = []
    count_special = []

    for word in sentence:
        if word in CHOSUNG_LIST or word in JONGSUNG_LIST:
            count_consonant.append(word)
        elif word in JUNGSUNG_LIST:
            count_vowel.append(word)
        elif word in NUMBER_LIST:
            count_number.append(word)
        elif word in SPECIAL_LIST:
            count_special.append(word)

    # Check whether the text ends with a digit (computed once, outside the loop)
    end_with_number_flag = 0
    if sentence and sentence[-1] in NUMBER_LIST:
        end_with_number_flag = 1

    count_consonant = len(count_consonant)
    count_vowel = len(count_vowel)
    count_number = end_with_number_flag
    count_special = len(count_special)

    return count_consonant, count_vowel, count_number, count_special
Example #16
def splitOffFinalJamo(c):
    assert len(c) == 1
    assert isHangul(c)
    # important to check if there even is a tail!
    # otherwise, the following call won't work
    if jamoTail(c) > 0:
        # get the compatibility Jamo
        finalJamo = j2hcj(h2j(c)[-1])
        lead, vowel = jamoLead(c), jamoVowel(c)
        return assembleHangul(lead, vowel, 0) + finalJamo
    else:
        return c  # null final: nothing to split off
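splitOffFinalJamo and the irregular-operation helpers below depend on isHangul, jamoLead, jamoVowel, jamoTail and assembleHangul, none of which are shown. Plausible definitions based on Unicode syllable arithmetic, assuming (as the call sites suggest) that lead and vowel are 1-indexed and tail 0 means no final consonant:

def isHangul(c):
    return '\uAC00' <= c <= '\uD7A3'

def jamoLead(c):
    return (ord(c) - 0xAC00) // (21 * 28) + 1

def jamoVowel(c):
    return ((ord(c) - 0xAC00) % (21 * 28)) // 28 + 1

def jamoTail(c):
    return (ord(c) - 0xAC00) % 28

def assembleHangul(lead, vowel, tail):
    # syllable = 0xAC00 + ((lead-1)*21 + (vowel-1))*28 + tail
    return chr(0xAC00 + ((lead - 1) * 21 + (vowel - 1)) * 28 + tail)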
Example #17
def decomposition(sentence):
    sentence = j2hcj(h2j(sentence))
    index = []
    for item in sentence:
        if (not isHangul(item) and item.isalpha()):
            index.append(sentence.find(item))
            break
    if index:
        part1 = list(sentence[:index[0] - 1])
        part2 = sentence[index[0]:].split()
        return ''.join((part1 + part2))
    else:
        return sentence
Example #18
def find_tense(sentence):
    tense_table = [[
        'past',
    ], [
        'present',
    ], [
        'future',
    ]]  # table storing each sentence together with its tense
    # ____________________________
    # | past (row 0)    | sentence | ...
    # |__________________________
    # | present (row 1) | sentence | ...
    # |__________________________
    # | future (row 2)  | sentence | ...
    # |__________________________

    special_future = 0  # counter for the '것' + '이' pattern
    is_present_flag = True  # flag used to decide present tense
    for i in range(len(sentence)):
        # Future tense pattern 1: '것' + '이'
        if sentence[i][1].find('NNB') != -1 and sentence[i][0].find('것') != -1:
            do_jamo = j2hcj(h2j(sentence[i - 1][0]))  # decompose with jamo (할 -> ㅎㅏㄹ)
            if len(do_jamo) > 2 and do_jamo[2] == 'ㄹ':  # a final -ㄹ before '것이' marks the future form
                special_future = special_future + 1  # this NNB is '것', so count it
        if sentence[i][1].find('VCP') != -1 and sentence[i][0].find('이') != -1:
            special_future = special_future + 1  # this VCP is '이', so count it
        if special_future == 2:  # both '것' and '이' found: future tense
            tense_table[2].append(sentence)
            is_present_flag = False
            break
        # Skip honorific markers (시, 십, 세, 심, 실)
        if sentence[i][1].find('EP') != -1 \
                and not sentence[i][0].find('시') != -1 \
                and not sentence[i][0].find('십') != -1 \
                and not sentence[i][0].find('세') != -1 \
                and not sentence[i][0].find('실') != -1 \
                and not sentence[i][0].find('심') != -1:
            # Future tense pattern 2: '겠'
            if sentence[i][0].find('겠') != -1:
                tense_table[2].append(sentence)
                is_present_flag = False
            # Past tense
            else:
                tense_table[0].append(sentence)
                is_present_flag = False
            break
    # Present tense
    if is_present_flag:
        tense_table[1].append(sentence)
    return tense_table
Example #19
def AE_irregularOperation_1(c):
    assert len(c) == 1
    assert isHangul(c)
    # only applies when the vowel is 애 (vowel index 2)
    if jamoVowel(c) == 2:
        lead, tail = jamoLead(c), jamoTail(c)
        # lead null consonant is 12
        # 애->아+아, or 앴->아+았
        return assembleHangul(lead, 1, 0) + assembleHangul(12, 1, tail)
    else:
        return c  # vowel is not 애: nothing to do
Example #20
def dividehangul(string):
    realletter = 0
    realtail = 0
    headcounts = defaultdict(int)
    vowelcounts = defaultdict(int)
    tailcounts = defaultdict(int)
    headfound = set()
    vowelfound = set()
    tailfound = set()

    for letter in string:
        parts = jamo.j2hcj(jamo.h2j(letter))
        if len(parts) > 2:
            head = parts[0]
            vowel = parts[1]
            tail = parts[2]
            realletter += 1  # every counted syllable has a vowel
            realtail += 1  # this syllable also has a tail (final consonant)
            headfound.add(head)
            vowelfound.add(vowel)
            tailfound.add(tail)
            headcounts[head] += 1
            vowelcounts[vowel] += 1
            tailcounts[tail] += 1

        elif len(parts) > 1:
            head = parts[0]
            vowel = parts[1]
            realletter += 1
            headfound.add(head)
            vowelfound.add(vowel)
            headcounts[head] += 1
            vowelcounts[vowel] += 1

    headp = {}
    vowelp = {}
    tailp = {}

    with codecs.open('headjamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            headp[x] = headcounts[x] / realletter if realletter != 0 else 0
    with codecs.open('voweljamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            vowelp[x] = vowelcounts[x] / realletter if realletter != 0 else 0
    with codecs.open('tailjamo.txt', encoding='utf-8', mode='r') as f:
        for x in f.read().strip():
            tailp[x] = tailcounts[x] / realtail if realtail != 0 else 0
    return (headp, vowelp, tailp)
Example #21
def EU_irregularOperation(c):
    assert len(c) == 1
    assert isHangul(c)
    # only applies when the vowel is 아 (1) or 어 (5)
    if jamoVowel(c) == 1 or jamoVowel(c) == 5:
        lead, vowel, tail = jamoLead(c), jamoVowel(c), jamoTail(c)
        # lead null consonant is 12
        # 아->으+아, or 았->으+았
        # 어->으+어, or 었->으+었
        return assembleHangul(lead, 19, 0) + assembleHangul(12, vowel, tail)
    else:
        return c  # vowel is not 아/어: nothing to do
Example #22
    def _load_dataset(self):

        kor_path = '/Data/FoodDetection/data/text_recognition/Korean/synthetic_data/data'

        kor_images_labels = []
        with open(os.path.join(kor_path, 'gt.txt'), 'r') as f:
            files = f.readlines()

        if self.need_samples is None:
            self.need_samples = len(files)
            print(f'{self.need_samples} files will be loaded')

        random_ids = np.random.choice(range(len(files)),
                                      size=self.need_samples,
                                      replace=False)
        for idx, file in enumerate(tqdm(np.asarray(files)[random_ids])):
            try:
                img_path, label = file.split(' ')
                img = os.path.join(kor_path, f'{img_path}.jpg')
                label = label.strip('\n')
                if self.mode == 'jamo':

                    top_tmp = []
                    middle_tmp = []
                    bottom_tmp = []

                    for char in label:
                        decomposed = j2hcj(h2j(char))
                        # Distribute choseong/jungseong/jongseong; pad with
                        # ' ' when a syllable has no jongseong.
                        for i, part in enumerate(
                                [top_tmp, middle_tmp, bottom_tmp]):
                            try:
                                part.append(decomposed[i])
                            except IndexError:
                                part.append(' ')

                    kor_images_labels.append(
                        [img, top_tmp, middle_tmp, bottom_tmp])
                elif self.mode == 'syllable':
                    kor_images_labels.append([img, label])

            except Exception as e:
                print(e)
                continue
        return kor_images_labels
Example #23
def find_s(sentence):
    s_table = []  # table storing only the subjects
    for k in range(len(sentence)):  # over the whole tagged sentence
        if ((sentence[k][0] == '가' and sentence[k][1] == 'JKS')
                or (sentence[k][0] == '이' and sentence[k][1] == 'JKS')):
            do_jamo = j2hcj(h2j(sentence[k + 1][0]))  # if '되'/'돼' follows, it must be treated as a complement
            if (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅚ') or \
                    (do_jamo[0] == 'ㄷ' and do_jamo[1] == 'ㅙ'):
                break
            # only for 가/이 acting as subject-case particles
            cnt = 0
            for m in range(0, k):  # among the tokens before the subject particle
                if (sentence[m][1] == 'NNG' or sentence[m][1] == 'NNP'
                        or sentence[m][1] == 'NNB' or sentence[m][1] == 'NP'):
                    # among those that are nouns
                    cnt = m  # keep the one closest to the particle
            s_table.append(sentence[cnt])  # store it as the subject
            s_table.append(sentence[k])  # store the particle after it (for verification)

        if ((sentence[k][0] == '은' and sentence[k][1] == 'JX')
                or (sentence[k][0] == '는' and sentence[k][1] == 'JX')):
            # only for 은/는 acting as topic particles
            jks_cnt = -1  # subject-case particle counter
            jx_cnt = -1
            for x in range(len(sentence)):  # over the whole tagged sentence
                if (sentence[x][1] == 'JKS'):  # if there is a subject-case particle (JKS)
                    jks_cnt += 1  # increment the counter
            for jx in range(0, k):
                if ((sentence[jx][0] == '은' and sentence[jx][1] == 'JX') or
                    (sentence[jx][0] == '는' and sentence[jx][1] == 'JX')):
                    jx_cnt += 1
            if (jks_cnt < 0 and jx_cnt < 0):  # if there is no subject particle
                N_cnt = 0
                for z in range(0, k):  # among the tokens before 은/는
                    if (sentence[z][1] == 'NNG' or sentence[z][1] == 'NNP'
                            or sentence[z][1] == 'NNB'
                            or sentence[z][1] == 'NP'):
                        # among those that are nouns
                        N_cnt = z  # keep the one closest to the particle

                s_table.append(sentence[N_cnt])  # store it as the subject
                s_table.append(sentence[k])  # store the particle after it (for verification)

    return s_table
Example #24
    def convert_pdf_to_txt(self, pdf_file):
        """Convert a PDF file to text.

        Args:
            pdf_file: name of the PDF file

        Returns:
            tuple: the extracted report text, company name, and company number
        """

        output_string = StringIO()
        self.file_nm = pdf_file.split(".")[0]
        file_ex = pdf_file.split(".")[1]

        self.pdf_path = self.report_pdf_dir + pdf_file
        self.pdf_path = hangul.join_jamos(j2hcj(h2j(self.pdf_path)))

        laparams = LAParams(line_overlap=.5,
                            char_margin=1.35,
                            line_margin=1.0,
                            word_margin=0.01,
                            boxes_flow=.5,
                            detect_vertical=False,
                            all_texts=False)

        rsrcmgr = PDFResourceManager()
        device = FinanceConverter(rsrcmgr, output_string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Extract text
        found = False
        with open(self.pdf_path, 'rb') as in_file:

            for page_num, page in enumerate(PDFPage.get_pages(in_file, check_extractable=True)):
                interpreter.process_page(page)
                page_text = output_string.getvalue()
                report_text, found, company_nm, company_num = self.page_text_finder(
                    page_text)
                if found:
                    break

            if not found:
                report_text = None

        return report_text, company_nm, company_num
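convert_pdf_to_txt builds on pdfminer.six, whose imports the example omits; FinanceConverter appears to be the project's own converter subclass. The remaining names resolve as:

from io import StringIO

from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage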
Example #25
    def test_j2hcj(self):
        """j2hcj tests
        Arguments may be iterables or single characters.

        j2hcj should convert every U+11xx jamo character into U+31xx HCJ in a
        given input. Anything else is unchanged.
        """

        test_strings = ["", "test123", "ᄀᄁᄂᄃᇹᇫ"]
        target_strings = ["", "test123", "ㄱㄲㄴㄷㆆㅿ"]

        all_tests = itertools.chain(zip(test_strings, target_strings))

        for test, target in all_tests:
            trial = jamo.j2hcj(test)
            assert trial == target,\
                ("Matched {test} to {trial}, but "
                 "expected {target}.").format(test=''.join(test),
                                              trial=trial,
                                              target=target)
Example #27
async def on_message(message):
    if message.content.startswith("ㅃ"):
        t = ""
        ctx = message.content
        for i in ctx[1:]:
            if 44032 > ord(i) or ord(i) > 55203:  # outside the Hangul syllable block U+AC00-U+D7A3
                t = t + i
            else:
                i = j2hcj(h2j(i))
                i = "ㅃ" + i[1:]
                t = t + join_jamos(i)
        await message.channel.send(f"{message.author.name}:{t}")
    if message.content.startswith("!호에"):
        a = message.content
        result = "호"
        for i in a[3:]:
            s = bin(ord(i))[2:]
            s = s.replace("1", "ㅇ")
            s = s.replace("0", "ㅔ")
            result += (s)
        result = join_jamos(result)
        await message.channel.send(f"{message.author.name}:{result}")
Example #28
def plot(alignment, info, text, isKorean=True):
    char_len, audio_len = alignment.shape  # 145, 200

    fig, ax = plt.subplots(figsize=(char_len / 5, 5))
    im = ax.imshow(alignment.T,
                   aspect='auto',
                   origin='lower',
                   interpolation='none')

    xlabel = 'Encoder timestep'
    ylabel = 'Decoder timestep'

    if info is not None:
        xlabel += '\n{}'.format(info)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # plt.legend('19000step',fontsize=15, loc='upper left')

    if text:
        if isKorean:
            jamo_text = j2hcj(h2j(normalize(text)))
        else:
            jamo_text = text
        pad = [PAD] * (char_len - len(jamo_text) - 1)
        A = [tok for tok in jamo_text] + [EOS] + pad
        A = [x if x != ' ' else '' for x in A]  # a space tick label stops the labels after it from rendering, so blank it out
        plt.xticks(range(char_len), A)

    if text is not None:
        while True:
            if text[-1] in [EOS, PAD]:
                text = text[:-1]
            else:
                break
        plt.title('90000 step inna \n' + text)
        #plt.title('90000 step kss \n' + text)

    plt.tight_layout()
Example #29
def insert_dot(text, dot):
    pattern1 = re.compile(r'\S(ㄷㅏ)$')   # word ending in 다
#     pattern2 = re.compile(r'(.ㅔ|ㅏ|ㅓ|ㅐ|ㅗ|ㅜ)(ㅇㅛ)$') # ㅔ요, ㅏ요, ㅓ요
#     pattern3 = re.compile(r'(ㅆ)(ㅈ|ㅊ)(ㅛ)$') 
#     pattern4 = re.compile(r'(ㅂ|ㅣ)(ㄴㅣㄲㅏ)$')  #ㅂ니까
#     pattern5 = re.compile(r'(ㄴㄷㅔ)$')
    text_list = []
    _1 = 0
    _2 = 0
    _3 = 0
    _4 = 0
    _5 = 0
    _6 = 0
    for _ in text.split(' '):         
        new_ = j2hcj(h2j(_))
        if pattern1.findall(new_):
            text_list.append(_.replace(_, _+dot))
            _1 += 1
#         elif pattern2.findall(new_):
#             text_list.append(_.replace(_, _+dot))
#             _2 += 1
#             list2.append(_)
#         elif pattern3.findall(new_):
#             text_list.append(_.replace(_, _+dot))
#             _3 += 1
#             list3.append(_)
#         elif pattern4.findall(new_):
#             text_list.append(_.replace(_, _+dot))
#             _4 += 1
#         elif pattern5.findall(new_):
#             text_list.append(_.replace(_,_+dot))
#             _5 += 1
        else:
            text_list.append(_)
            _6 += 1
#     print('pattern1 = {}, pattern2= {}, pattern3 = {}, pattern5 = {}, pattern6 = {}'.format(_1,_2,_3,_4,_5,_6)) 
    #print('pattern2 = ',list2)#'\n','list5 = ',list5)
    return text_list    
Example #30
def sori(text):
    text_list = np.array(list(text))
    text_list = text_list[np.where(text_list!=' ')]
    decompose = pd.Series(text_list).apply(lambda x: j2hcj(h2j(x))).tolist()

    # Final-consonant (batchim) neutralization rules
    end_sound =  ['ㄱ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅇ']
    convert_end_sound = {'ㄲ': 'ㄱㄱ', 'ㄳ': 'ㄱㅅ', 'ㄶ':'ㄴㅎ', 'ㄵ': 'ㄴㅈ', 'ㄺ': 'ㄹㄱ', 'ㄻ': 'ㄹㅁ', 'ㄼ': 'ㄹㅂ', 'ㄽ': 'ㄹㅅ', 'ㄾ': 'ㄹㅌ', 'ㄿ': 'ㄹㅍ', 'ㅀ': 'ㄹㅎ', 'ㅄ': 'ㅂㅅ', 'ㅆ': 'ㅅㅅ'}
    end_simplize = {'ㅅ': 'ㄷ', 'ㅈ': 'ㄷ', 'ㅊ': 'ㄷ', 'ㅋ': 'ㄱ', 'ㅌ': 'ㄷ', 'ㅍ': 'ㅂ', 'ㅎ': 'ㄷ', 'ㅅㅅ':'ㄷ'}
    for idx, word in enumerate(decompose):
        if len(word)==3 and word[-1] in convert_end_sound.keys():
            decompose[idx] = word[:-1] + convert_end_sound[word[-1]]
    for again in range(10):  # apply the rules repeatedly so chained changes settle
        for idx in range(len(decompose)-1):
            f_idx = idx
            b_idx = f_idx + 1
            forth, back = decompose[f_idx], decompose[b_idx]
            if (back[0]=='ㅇ' and forth[-2:]=='ㄹㅎ') or (back[0]=='ㅇ' and forth[-2:]=='ㄴㅎ'):
                decompose[f_idx] = forth[:-1]
            if back[0]=='ㅇ' and forth[-1] in end_sound and forth[-1] != 'ㅇ': # liaison: the final consonant moves onto the empty ㅇ onset
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = forth[-1] + back[1:]
            if back[0]=='ㅎ' and forth[-1] == 'ㄱ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅋ' + back[1:]
            if (back[0]=='ㅈ' and forth[-1] == 'ㄱ') or (back[0]=='ㅈ' and forth[-1]=='ㅂ') or (back[0]=='ㅈ' and forth[-1]=='ㅍ') or (back[0]=='ㅈ' and forth[-1]=='ㄷ'):
                decompose[b_idx] = 'ㅉ' + back[1:]
            if back[0]=='ㅈ' and forth[-2:]=='ㄹㅌ':
                decompose[b_idx] = 'ㅉ' + back[1:]
                decompose[f_idx] = forth[:-1]
            if back[0]=='ㅈ' and forth[-2:]=='ㅅㅅ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅉ' + back[1:]
            if (back[0]=='ㄷ' and forth[-1]=='ㅅ') or (back[0]=='ㄷ' and forth[-1]=='ㄷ'):
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㄸ' + back[1:]
            if back[0]=='ㄲ' and forth[-2:]=='ㅅㅅ':
                decompose[f_idx] = forth[:-1]
            if back[0]=='ㄱ' and forth[-1]=='ㅎ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅋ' + back[1:]
            if (back[0] == 'ㄱ' and forth[-1] == 'ㅅ') or (back[0]=='ㄱ' and forth[-1] == 'ㄱ') or (back[0]=='ㄱ' and forth[-1]=='ㅍ'):
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㄲ' + back[1:]
            if back[0]=='ㄱ' and forth[-1]=='ㅂ':
                decompose[b_idx] = 'ㄲ' + back[1:]
            if (back[0] == 'ㅅ' and forth[-1] == 'ㅂ') or (back[0]=='ㅅ' and forth[-1]=='ㅅ') or (back[0]=='ㅅ' and forth[-1]=='ㄱ') or (back[0]=='ㅅ' and forth[-1]=='ㄹ') or (back[0] == 'ㅅ' and forth[-1] == 'ㅍ'):
                decompose[b_idx] = 'ㅆ' + back[1:]
            if back[0]=='ㄷ' and forth[-1]=='ㅎ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅌ' + back[1:]
            if back[0]=='ㅎ' and forth[-1]=='ㅅ':
                decompose[b_idx] = 'ㅌ' + back[1:]
                decompose[f_idx] = forth[:-1]
            if back[0]=='ㅎ' and forth[-1]=='ㄷ':
                decompose[b_idx] = 'ㅊ' + back[1:]
            if (back[0]=='ㄱ' and forth[-1]=='ㅅ') or (back[0]=='ㄱ' and forth[-1]=='ㄷ'):
                decompose[b_idx] = 'ㄲ' + back[1:]
            if back[0]=='ㅎ' and forth[-1]=='ㅂ':
                decompose[f_idx] = forth[:-1]
                decompose[b_idx] = 'ㅍ' + back[1:]
            if back[0]=='ㅅ' and forth[-2:]=='ㄴㅈ':
                decompose[b_idx] = 'ㅆ' + back[1:]
                decompose[f_idx] = forth[:-1]
            if (back[0]=='ㅈ' and forth[-2:]=='ㄴㅎ') or (back[0]=='ㅈ' and forth[-1]=='ㅎ'):
                decompose[b_idx] = 'ㅊ' + back[1:]
                decompose[f_idx] = forth[:-1]
            if back[0]=='ㄷ' and forth[-1]=='ㄱ':
                decompose[b_idx] = 'ㄸ' + back[1:]
            if back[0]=='ㅂ' and forth[-1]=='ㄱ':
                decompose[b_idx] = 'ㅃ' + back[1:]
            if back[0]=='ㄷ' and forth[-2:]=='ㄹㅁ':
                decompose[b_idx] = 'ㄸ' + back[1:]
                decompose[f_idx] = forth[:-2] + 'ㅁ'
            if back[0]=='ㄷ' and forth[-2:]=='ㄹㅌ':
                decompose[b_idx] = 'ㄸ' + back[1:]
                decompose[f_idx] = forth[:-2] + 'ㄹ'
            if back[0]=='ㄹ' and forth[-1]=='ㄱ':
                decompose[b_idx] = 'ㄴ' + back[1:]
                decompose[f_idx] = forth[:-1] + 'ㅇ'
            if back[0]=='ㄹ' and forth[-1]=='ㄴ':
                decompose[f_idx] = forth[:-1] + 'ㄹ'
            if back[0]=='ㄹ' and forth[-1]=='ㅇ':
                decompose[b_idx] = 'ㄴ' + back[1:]
            if back[0]=='ㅁ' and forth[-1]=='ㄱ':
                decompose[f_idx] = forth[:-1] + 'ㅇ'
            if back[0]=='ㄴ' and forth[-1]=='ㄹ':
                decompose[b_idx] = 'ㄹ' + back[1:]
            if back[0]=='ㅅ' and forth[-1]=='ㄱ':
                decompose[b_idx] = 'ㅆ' + back[1:]
            if back[0]=='ㄹ' and forth[-1]=='ㄱ':
                decompose[f_idx] = forth[:-1] + 'ㅇ'
    for idx, word in enumerate(decompose):
        if len(word)==3 and word[-1] not in end_sound:
            decompose[idx] = word[:-1] + end_simplize[word[-1]]
        elif word[-2:]=='ㅅㅅ':
            decompose[idx] = word[:-2] + 'ㄷ'
        elif word[-2:]=='ㅂㅅ':
            decompose[idx] = word[:-2] + 'ㅂ'
        elif word[-2:]=='ㄴㅎ':
            decompose[idx] = word[:-2] + 'ㄴ'
        elif word[-2:]=='ㄱㅅ':
            decompose[idx] = word[:-2] + 'ㄱ'
        elif word[-2:]=='ㄹㅁ':
            decompose[idx] = word[:-2] + 'ㅁ'
        elif word[-2:]=='ㄹㅂ':
            decompose[idx] = word[:-2] + 'ㅂ'
        elif word[-2:]=='ㄱㄱ':
            decompose[idx] = word[:-2] + 'ㄱ'
        elif word[-2:]=='ㄴㅈ':
            decompose[idx] = word[:-2] + 'ㄴ'
        elif word[-2:]=='ㄹㄱ':
            decompose[idx] = word[:-2] + 'ㄱ'
        elif word[-2:]=='ㄹㅅ':
            decompose[idx] = word[:-2] + 'ㄷ'
        elif word[-2:]=='ㄹㅌ':
            decompose[idx] = word[:-2] + 'ㄷ'
        elif word[-2:]=='ㄹㅍ':
            decompose[idx] = word[:-2] + 'ㅂ'
        elif word[-2:]=='ㄹㅎ':
            decompose[idx] = word[:-2] + 'ㄹ'
            
    return decompose
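A hand-traced smoke test of the rules above (a sketch; it assumes the numpy, pandas and jamo imports the example relies on): for 국물, the ㄱ + ㅁ nasal-assimilation rule fires.

print(sori('국물'))  # ['ㄱㅜㅇ', 'ㅁㅜㄹ'] -> pronounced 궁물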
Example #31
def decompose(s):
    return jamo.j2hcj(jamo.h2j(s))
Example #32
    def tokenize(self, sentence):
        tokenized_sentence = [j for j in j2hcj(h2j(sentence))]
        return tokenized_sentence
Example #33
import sys
from jamo import h2j, j2hcj
from collections import Counter

string = sys.stdin.readline().strip()

divided = []
for x in j2hcj(h2j(string)):
    divided.append(x)

counts = Counter()
for letter in divided:
    counts[letter] += 1

print(counts)