Esempi in Python per decompose, esempi in Python per soynlp.hangle.decompose

Esempio n. 1

0

Mostra file

 def check_suffix_prefix(stem, eomi):
     """Tell whether a stem/ending boundary avoids two disallowed jamo patterns.

     The last character of ``stem`` and the first character of ``eomi`` are
     passed to ``decompose`` and the results are indexed as
     (initial, vowel, final).

     Returns ``False`` when either of these holds, ``True`` otherwise:

     * the stem's final consonant is ㄹ or ㅂ and the ending's initial is one
       of ㄴ, ㄹ, ㅁ, ㅂ;
     * the stem's vowel is ㅡ with an empty (' ') final, and the ending
       starts with ㅇ followed by ㅓ or ㅏ.
     """
     stem_tail = decompose(stem[-1])
     eomi_head = decompose(eomi[0])

     if stem_tail[2] in ('ㄹ', 'ㅂ') and eomi_head[0] in ('ㄴ', 'ㄹ', 'ㅁ', 'ㅂ'):
         return False

     # Kept as one short-circuiting expression so eomi_head is only indexed
     # once the stem-side conditions already matched.
     return not (
         stem_tail[1] == 'ㅡ'
         and stem_tail[2] == ' '
         and eomi_head[0] == 'ㅇ'
         and eomi_head[1] in ('ㅓ', 'ㅏ')
     )

Esempio n. 2

0

Mostra file

File: _normalizer.py Progetto: tobby2002/soynlp

def emoticon_normalize(sent, n_repeats=2):
    """Split syllables caught in a 'ㅋ쿠ㅜ'-style pattern, then normalize repeats.

    A character is rewritten only when it is a complete syllable sitting
    between a lone consonant and a lone vowel, and ``decompose`` shows that
    its initial equals the previous character, its vowel equals the next
    character, and it has no final (' ').  Such a syllable is replaced by its
    initial followed by its vowel.  The rebuilt string is handed to
    ``repeat_normalize`` together with ``n_repeats`` and that result is
    returned.

    A falsy ``sent`` is returned unchanged.
    """
    if not sent:
        return sent

    def char_class(code):
        # 0: consonant (jaum), 1: vowel (moum), 2: complete syllable, -1: other
        if 12593 <= code <= 12622:
            return 0
        if 12623 <= code <= 12643:
            return 1
        if 44032 <= code <= 55203:
            return 2
        return -1

    classes = [char_class(ord(ch)) for ch in sent]
    pieces = []
    # The last character is never rewritten (it has no right neighbour), so
    # it is appended separately after the loop.
    for pos, ch in enumerate(sent[:-1]):
        sandwiched = (
            pos > 0
            and classes[pos - 1] == 0
            and classes[pos] == 2
            and classes[pos + 1] == 1
        )
        if sandwiched:
            cho, jung, jong = decompose(ch)
            if cho == sent[pos - 1] and jung == sent[pos + 1] and jong == ' ':
                pieces.extend((cho, jung))
                continue
        pieces.append(ch)
    pieces.append(sent[-1])
    return repeat_normalize(''.join(pieces), n_repeats)

Esempio n. 3

0

Mostra file

File: _normalizer.py Progetto: owlur/soynlp

def emoticon_normalize(sent, num_repeats=2):
    """Rewrite 'ㅋ쿠ㅜ'-like runs and pass the result to ``repeat_normalize``.

    For every character except the last: when the previous character is a
    lone consonant, the character itself is a complete syllable and the next
    character is a lone vowel, the syllable is decomposed.  If its initial
    equals the previous character, its vowel equals the next character and
    its final is empty (' '), the syllable is replaced by initial + vowel.

    Returns ``repeat_normalize(rebuilt, num_repeats)``; a falsy ``sent`` is
    returned as is.
    """
    if not sent:
        return sent

    JAUM, MOUM, SYLLABLE, OTHER = 0, 1, 2, -1

    def classify(codepoint):
        # Code point ranges: lone consonants, lone vowels, complete syllables.
        if 12593 <= codepoint <= 12622:
            return JAUM
        if 12623 <= codepoint <= 12643:
            return MOUM
        if 44032 <= codepoint <= 55203:
            return SYLLABLE
        return OTHER

    kinds = [classify(ord(ch)) for ch in sent]
    rebuilt = []
    for i, ch in enumerate(sent[:-1]):
        emitted = [ch]
        if i > 0 and kinds[i - 1] == JAUM and kinds[i] == SYLLABLE \
                and kinds[i + 1] == MOUM:
            cho, jung, jong = decompose(ch)
            if cho == sent[i - 1] and jung == sent[i + 1] and jong == ' ':
                emitted = [cho, jung]
        rebuilt += emitted
    # The final character has no right neighbour and is copied verbatim.
    rebuilt.append(sent[-1])
    return repeat_normalize(''.join(rebuilt), num_repeats)

Esempio n. 4

0

Mostra file

 def transform(char):
     """Turn one character into its jamo string, with '-' for empty slots.

     A space is returned untouched.  Any other character goes through
     ``decompose``: a length-1 result is returned as is, a longer result is
     joined into a single string in which every ' ' element becomes '-'.
     """
     if char != ' ':
         jamo = decompose(char)
         if len(jamo) != 1:
             return ''.join('-' if piece == ' ' else piece for piece in jamo)
         return jamo
     return char

Esempio n. 5

0

Mostra file

File: transform_data.py Progetto: dandyhug/kookey

    def transform(char):
        """Return the jamo spelling of ``char`` with '-' marking empty slots.

        Spaces pass through unchanged.  The result of ``decompose(char)`` is
        returned directly when it has length 1; otherwise its elements are
        concatenated, substituting "-" for each element equal to " ".
        """
        if char == " ":
            return char

        parts = decompose(char)
        if len(parts) == 1:
            return parts

        filled = ["-" if part == " " else part for part in parts]
        return "".join(filled)

Esempio n. 6

0

Mostra file

File: utils.py Progetto: SaewonY/Flask_Pytorch_Server

 def transform(char):
     """Return the jamo of ``char`` concatenated, with empty slots dropped.

     * A space is returned unchanged.
     * If ``decompose(char)`` yields something that has no length (the
       original code guarded against this; presumably ``None`` for
       characters it cannot decompose - verify against ``decompose``), a
       single space is returned.
     * A length-1 result is returned as is.
     * Otherwise the elements are joined, skipping every ' ' element.
     """
     if char == ' ':
         return char
     cjj = decompose(char)
     try:
         n_parts = len(cjj)
     except TypeError:
         # Only "object has no len()" is expected here; a bare except would
         # also have swallowed KeyboardInterrupt/SystemExit.
         return ' '
     if n_parts == 1:
         return cjj
     return ''.join(c for c in cjj if c != ' ')

Esempio n. 7

0

Mostra file

    def _post_processing(self, eomis):
        eomis_ = {}
        for eomi, score in eomis.items():
            # 어미의 첫 종성이 ㅎ 으로 끝나는 경우
            if decompose(eomi[0])[2] == 'ㅎ':
                continue
            # TODO
            # Remove E + V + E : -서가지고
            # Remove V + E : -싶구나
            eomis_[eomi] = score

        return eomis_

Esempio n. 8

0

Mostra file

File: _noun_news.py Progetto: tobby2002/soynlp

 def _hardrule_dang_hada_filter(self, l, max_h_proportion=0.5):
     """Hard rule for words ending in '당': check how often ㅎ follows ``l``.

     Returns ``True`` immediately when ``l`` does not end with '당' and
     ``l[:-1]`` is found in ``self._noun_scores_`` or
     ``self.noun_dictionary``.  Otherwise the right-side counts stored in
     ``self.lrgraph[l]`` are inspected: the result is ``True`` when the
     total of all truthy counts is not positive, or when the share of counts
     whose right side starts with initial consonant ㅎ is strictly below
     ``max_h_proportion``.
     """
     from soynlp.hangle import decompose # TODO check import path
     head = l[:-1]
     # NOTE(review): ``not`` binds only to the '당' test here, so words that
     # do not end in '당' and are not known nouns still fall through to the
     # proportion check - confirm this is the intended grouping.
     if l[-1] != '당' and (head in self._noun_scores_ or head in self.noun_dictionary):
         return True

     rdict = self.lrgraph.get(l, {})
     n_base = sum(count for count in rdict.values() if count)
     n_h = 0
     for r, count in rdict.items():
         if r:
             jamo = decompose(r[0])
             if jamo and jamo[0] == 'ㅎ':
                 n_h += count

     if n_base <= 0:
         return True
     return n_h / n_base < max_h_proportion

Esempio n. 9

0

Mostra file

 def _hardrule_dang_hada_filter(self, l, max_h_proportion=0.5):
     """Accept or reject ``l`` based on how often a ㅎ-initial right side follows it.

     * ``True`` straight away if ``l`` does not end in '당' and ``l[:-1]`` is
       in ``self._noun_scores_`` or ``self.noun_dictionary``.
     * Otherwise, with ``rdict = self.lrgraph.get(l, {})``: ``True`` if the
       sum of truthy counts is <= 0, else whether
       (sum of counts for right sides starting with initial ㅎ) / (sum of
       counts) is strictly less than ``max_h_proportion``.
     """
     from soynlp.hangle import decompose # TODO check import path

     def starts_with_hieut(right):
         # Empty right sides are ignored; decompose() may give a falsy result.
         if not right:
             return False
         parts = decompose(right[0])
         return bool(parts) and parts[0] == 'ㅎ'

     # NOTE(review): precedence makes this "(not ends-with-당) and (known
     # noun)"; verify that a single negated conjunction was not intended.
     ends_with_dang = l[-1] == '당'
     if not ends_with_dang and (l[:-1] in self._noun_scores_ or l[:-1] in self.noun_dictionary):
         return True

     rdict = self.lrgraph.get(l, {})
     n_base = sum(c for c in rdict.values() if c)
     n_h = sum(c for r, c in rdict.items() if starts_with_hieut(r))
     return True if n_base <= 0 else (n_h / n_base < max_h_proportion)

Esempio n. 10

0

Mostra file

File: basic_test.py Progetto: tobby2002/soynlp

def hangle_test():
    """Smoke-test the helpers imported from ``soynlp.hangle``.

    Each check compares one call against a fixed expected value and raises
    ``ValueError`` (with the offending call and its actual result in the
    message) on the first mismatch.  When every check passes a confirmation
    line is printed.
    """
    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    if not (('ㄱ', 'ㅏ', 'ㄴ') == decompose('간')):
        raise ValueError('decompose("간") -> {}'.format(decompose('간')))

    if not ((' ', 'ㅗ', ' ') == decompose('ㅗ')):
        raise ValueError('decompose("ㅗ") -> {}'.format(decompose('ㅗ')))

    if not (('ㅋ', ' ', ' ') == decompose('ㅋ')):
        raise ValueError('decompose("ㅋ") -> {}'.format(decompose('ㅋ')))

    if not ('감' == compose('ㄱ', 'ㅏ', 'ㅁ')):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(compose('ㄱ', 'ㅏ', 'ㅁ')))

    if not character_is_korean('감'):
        raise ValueError('character_is_korean("감") -> {}'.format(character_is_korean('감')))

    if character_is_korean('a'):
        raise ValueError('character_is_korean("a") -> {}'.format(character_is_korean('a')))

    if not character_is_jaum('ㅋ'):
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(character_is_jaum('ㅋ')))

    if character_is_jaum('a'):
        raise ValueError('character_is_jaum("a") -> {}'.format(character_is_jaum('a')))

    # The two vowel checks report the function they actually test; the
    # messages previously named (and re-evaluated) character_is_jaum.
    if not character_is_moum('ㅗ'):
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(character_is_moum('ㅗ')))

    if character_is_moum('a'):
        raise ValueError('character_is_moum("a") -> {}'.format(character_is_moum('a')))

    if not (to_base('ㄱ') == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(to_base('ㄱ')))

    if 1 != levenshtein('가나', '가남'):
        raise ValueError("levenshtein('가나', '가남') -> {}".format(levenshtein('가나', '가남')))

    # Third argument: per-pair substitution cost overrides.
    if 0.1 != levenshtein('가나', '가남', {('나', '남'):0.1}):
        raise ValueError("levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(levenshtein('가나', '가남', {('나', '남'):0.1})))

    if 1/3 != jamo_levenshtein('가나', '가남'):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(jamo_levenshtein('가나', '가남')))

    print('all hangle tests have been successed\n\n')

Esempio n. 11

0

Mostra file

File: _adjective_vs_verb.py Progetto: zeroday0619/soynlp

def conjugate_as_imperative(stem):
    """Conjugate ``stem`` into the imperative to tell verbs from adjectives.

    Idea (from the original notes): if the base form still makes sense once
    conjugated as an imperative it is a verb, otherwise an adjective.
    먹다 -> 먹어라, 파랗다 -> 파래라 (used)
    먹다 -> 먹어, 파랗다 -> 파래 (not used): '-어' can also express a state
    and would be ambiguous.

    The ending is picked from the vowel of the stem's last character:
    '어라' when it is ㅓ or ㅕ, '아라' otherwise.  Returns whatever
    ``_conjugate(stem, [ending])`` returns.
    """
    _, jung, _ = decompose(stem[-1])
    eomis = ['어라'] if jung in ('ㅓ', 'ㅕ') else ['아라']
    return _conjugate(stem, eomis)

Esempio n. 12

0

Mostra file

 def transform(char):
     """Map a character to a jamo string ('-' for empty slots) or itself.

     * space -> returned unchanged
     * ``is_korean(char)`` -> ``decompose(char)``
     * ``is_english(char)`` -> the character itself (decomposing into jamo
       makes no sense for English letters)
     * anything else -> ' '

     A length-1 intermediate value is returned as is; longer values are
     joined with every ' ' element replaced by '-'.
     """
     if char == ' ':
         return char

     if is_korean(char):
         parts = decompose(char)
     else:
         if not is_english(char):
             return ' '
         parts = char

     if len(parts) == 1:
         return parts
     return ''.join('-' if part == ' ' else part for part in parts)

Esempio n. 13

0

Mostra file

def conjugate_chat(stem, ending, enforce_moum_harmoney=False, debug=False):
    """Conjugate ``stem`` + ``ending``, also merging a bare-consonant ending.

    Starts from the candidates produced by
    ``conjugate(stem, ending, enforce_moum_harmoney, debug)`` (expected to be
    a mutable set, since ``.add`` is called on it).  When the first character
    of ``ending`` is a lone consonant (its decomposition has an initial but
    no vowel, e.g. -ㄴ, -ㄹ, -ㅂ, -ㅆ), that consonant is written as the final
    consonant of the stem's last syllable: 이 + ㅂ니다 -> 입니다.

    :param stem: non-empty stem string
    :param ending: ending string; when falsy ``{stem}`` is returned
    :param enforce_moum_harmoney: forwarded to ``conjugate``
    :param debug: forwarded to ``conjugate``; also prints the merged surface
    :return: set of candidate surface forms
    """
    if not ending:
        return {stem}

    candidates = conjugate(stem, ending, enforce_moum_harmoney, debug)

    stem_cho, stem_jung, _ = decompose(stem[-1])
    end_cho, end_jung, _ = decompose(ending[0])

    # Ending starts with a bare consonant: fold it into the stem's last
    # syllable.  (A nested "vowel is not empty" branch used to live here; it
    # could never run because this block requires the vowel to be empty.)
    if end_jung == ' ' and end_cho != ' ':
        surface = stem[:-1] + compose(stem_cho, stem_jung, end_cho) + ending[1:]
        candidates.add(surface)
        if debug:
            print('어미의 첫 글자가 자음인 경우: {}'.format(surface))

    return candidates

Esempio n. 14

0

Mostra file

File: _adjective_vs_verb.py Progetto: zeroday0619/soynlp

def conjugate_as_present(stem):
    """Conjugate ``stem`` into the present tense to tell verbs from adjectives.

    Idea (from the original notes): if the base form still makes sense in
    the present tense it is a verb, otherwise an adjective.
    먹다 -> 먹는다, 파랗다 -> 파란다 (used)
    먹다 -> 먹는다고, 파랗다 -> 파란다고 (used; every ending built on -다고)
    먹다 -> 먹는, 파랗다 -> 파란 (not used): '-는' can also express a state
    and would be ambiguous.

    Stems whose last character has no final consonant get the ㄴ-initial
    endings, all others the 는-initial endings.  Returns the result of
    ``_conjugate``.
    """
    _, _, jong = decompose(stem[-1])
    if jong != ' ':
        return _conjugate(stem, ['는다', '는다고', '고있는'])
    return _conjugate(stem, ['ㄴ다', 'ㄴ다고', '고있는'])

Esempio n. 15

0

Mostra file

 def process(c):
     """Encode one token as a three-slot jamo string padded with '-'.

     * Input matching the character-class pattern at its start is returned
       wrapped in dashes: '-' + c + '-'.
     * If ``decompose(c)`` is ``None`` a single space is returned.
     * Otherwise (initial, vowel, final) are concatenated, with '-' standing
       in for an empty final; when the vowel is empty too the result is
       '-' + initial + '-', and an empty initial before a vowel also becomes
       '-'.
     """
     # NOTE(review): inside [...] the '|' is a literal, so this pattern also
     # accepts the pipe character - confirm whether that is intended.
     if re.match('[0-9|a-z|A-Z|.?!]+', c):
         return '-' + c + '-'

     jamo = decompose(c)
     if jamo is None:
         return ' '

     cho, jung, jong = jamo
     if jong != ' ':
         return cho + jung + jong
     # From here on the final slot is empty and is written as '-'.
     if jung == ' ':
         return '-' + cho + '-'
     if cho == ' ':
         cho = '-'
     return cho + jung + '-'

Esempio n. 16

0

Mostra file

File: _normalizer.py Progetto: parksjin01/soynlp-2.7

def _normalize_emoji(token):
    if len(token) <= 1:
        return token
    token_ = []
    decomposeds = [decompose(c) for c in token]
    for char, cd, nd in zip(token, decomposeds, decomposeds[1:]):
        if cd == None or nd == None:
            token_.append(char)
            continue
        # 앜ㅋㅋㅋㅋ -> 아ㅋㅋㅋㅋㅋ
        if (nd[1] == ' ') and (cd[2] == nd[0]):
            token_.append(compose(cd[0], cd[1], ' ') + nd[0])
        # ㅋ쿠ㅜㅜ -> ㅋㅋㅜㅜㅜ
        elif (cd[2] == ' ') and (nd[0] == ' ') and (cd[1] == nd[1]):
            token_.append((cd[0] + cd[1]) if cd [0] != ' ' else cd[1])
        else:
            token_.append(char)
    return ''.join(token_) + token[-1]

Esempio n. 17

0

Mostra file

    def decode(self, input, opt = 'morphs'):
        """Tokenize ``input`` and decompose every Hangul token into jamo.

        :param input: text to process
        :param opt: tokenization mode - 'morphs' (``mecab.morphs``),
            'noun' (``mecab.nouns``) or 'space' (``str.split``)
        :return: list in which tokens that do not start with a Hangul
            character are kept as strings and every character of the other
            tokens is replaced by the result of ``decompose``
        :raises ValueError: if ``opt`` is not one of the supported modes
            (previously this surfaced as an unbound-variable error)

        When ``self.sample`` equals ``True`` a progress line is printed for
        every ``for_sample``-th index.
        """
        if opt == 'morphs':
            tokens = mecab.morphs(input)
        elif opt == 'noun':
            tokens = mecab.nouns(input)
        elif opt == 'space':
            tokens = input.split()
        else:
            raise ValueError(
                "opt must be 'morphs', 'noun' or 'space', got {!r}".format(opt))

        # Sampling stride: about a third of the input length, but never 0 -
        # the modulo below would otherwise divide by zero on short inputs.
        for_sample = max(1, round(len(input) / 3))

        result = []
        for idx, word in enumerate(tokens):
            # Tokens that do not begin with a Hangul jamo/syllable are kept.
            if re.match('[ㄱ-ㅣ가-힣]', word) is None:
                result.append(word)
                if self.sample == True and idx % for_sample == 0:
                    print(f'>>>> Decode 안됨: {word}')
            else:
                for idxx, letter in enumerate(word):
                    temp1 = decompose(letter)
                    result.append(temp1)
                    if self.sample == True and idxx % for_sample == 0:
                        print(f'>>>> Decode: {temp1}')

        return result

Esempio n. 18

0

Mostra file

File: _lemmatizer.py Progetto: zeroday0619/soynlp

def lemma_candidate_chat(l, r, predefined=None, debug=False):
    """Lemma candidates for chat text, tolerating an emoticon-like final consonant.

    Begins with ``lemma_candidate(l, r, predefined, debug)``.  If ``r`` is
    empty and the final consonant of the last character of ``l`` is one of
    ㄷ, ㅂ, ㅅ, ㅇ, ㅋ, ㅎ (e.g. 아닏, 아닙, 아닛, 아닝, 아닠, 아닣 or
    그랟, 그랩, 그랫, 그랭, 그랰, 그랳), that consonant is treated as a possible
    emoticon: it is stripped and the candidates for the stripped form are
    merged into the result.

    :return: the candidate collection returned by ``lemma_candidate``,
        updated in place
    """
    emoticon_jongs = frozenset('ㄷㅂㅅㅇㅋㅎ')

    candidates = lemma_candidate(l, r, predefined, debug)
    last_jamo = decompose(l[-1])

    # Nothing more to do unless the ending is empty and the final consonant
    # looks like an emoticon.
    if r or last_jamo[2] not in emoticon_jongs:
        return candidates

    stripped = l[:-1] + compose(last_jamo[0], last_jamo[1], ' ')
    if debug:
        debug_message('마지막 종성이 이모티콘으로 의심되는 경우', stripped, '()')
    candidates.update(lemma_candidate(stripped, r, predefined, debug))
    return candidates

Esempio n. 19

0

Mostra file

File: _lemmatizer.py Progetto: tobby2002/soynlp

    def _candidates(self, l, r):
        """Enumerate (root, ending) lemma candidates for the split ``l`` + ``r``.

        The last syllable of ``l`` and the first syllable of ``r`` are
        decomposed and a series of independent rules, one per irregular
        conjugation type, each add a candidate to the result when their
        pattern matches.  The unmodified pair ``(l, r)`` is always included,
        and entries listed under ``(l, r)`` in ``self._predefined`` are added
        last.

        :param l: non-empty left part (surface stem)
        :param r: right part (surface ending), may be empty
        :return: set of candidates
        """
        candidates = {(l, r)}

        front = l[:-1]
        cho, jung, jong = decompose(l[-1])
        l_open = compose(cho, jung, ' ')      # last syllable without final
        if r:
            r_cho, r_jung, r_jong = decompose(r[0])
            r_open = compose(r_cho, r_jung, ' ')
        else:
            r_cho = r_jung = r_jong = ''
            r_open = ' '

        # ㄷ-irregular: 깨달 + 아 -> 깨닫 + 아
        if jong == 'ㄹ' and r_cho == 'ㅇ':
            candidates.add((front + compose(cho, jung, 'ㄷ'), r))

        # 르-irregular: 굴 + 러 -> 구르 + 어
        if jong == 'ㄹ' and r_open in ('러', '라'):
            root = front + compose(cho, jung, ' ') + '르'
            ending = compose('ㅇ', r_jung, r_jong) + r[1:]
            candidates.add((root, ending))

        # ㅂ-irregular: 더러 + 워서 -> 더럽 + 어서
        if jong == ' ' and r_open in ('워', '와'):
            root = front + compose(cho, jung, 'ㅂ')
            vowel = 'ㅏ' if r_open == '와' else 'ㅓ'
            candidates.add((root, compose('ㅇ', vowel, r_jong) + r[1:]))

        # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅅ):
        # 입 + 니다 -> 이 + ㅂ니다
        if jong in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
            candidates.add((front + compose(cho, jung, ' '), jong + r))

        # ㅅ-irregular: 부 + 어 -> 붓 + 어
        # exception: 벗 + 어 -> 벗어
        if jong == ' ' and l[-1] != '벗' and r_cho == 'ㅇ':
            candidates.add((front + compose(cho, jung, 'ㅅ'), r))

        # 우-irregular: 똥퍼 + '' -> 똥푸 + 어
        if l_open == '퍼':
            candidates.add((front + '푸', compose('ㅇ', jung, jong) + r))

        # 우-irregular: 줬 + 어 -> 주 + 었어
        # 오-irregular: 왔 + 어 -> 오 + 았어
        for merged, stem_vowel, ending_vowel in (('ㅝ', 'ㅜ', 'ㅓ'), ('ㅘ', 'ㅗ', 'ㅏ')):
            if jung == merged:
                root = front + compose(cho, stem_vowel, ' ')
                candidates.add((root, compose('ㅇ', ending_vowel, jong) + r))

        # ㅡ-dropping irregular: 꺼 + '' -> 끄 + 어 / 텄 + 어 -> 트 + 었어
        if jung in ('ㅓ', 'ㅏ'):
            root = front + compose(cho, 'ㅡ', ' ')
            candidates.add((root, compose('ㅇ', jung, jong) + r))

        # 거라/너라-irregular: regular once '-거라/-너라' are treated as
        # endings.  TODO: handle l ending in '가' followed by '라' / '거라'.

        # 러-irregular: 이르 + 러 -> 이르다
        # TODO: needs a root check for r starting with ㄹ + ㅓ.

        # 여-irregular: 하 + 였다 -> 하 + 았다 -> 하다
        # (regular once '였다' is treated as an ending)

        # 여-irregular (2):
        # 했 + 다 -> 하 + 았다 / 해 + 라니깐 -> 하 + 아라니깐 / 했 + 었다 -> 하 + 았었다
        if cho == 'ㅎ' and jung == 'ㅐ':
            candidates.add((front + '하', compose('ㅇ', 'ㅏ', jong) + r))

        # ㅎ-dropping irregular: 파라 + 면 -> 파랗 + 면
        if jong in (' ', 'ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
            root = front + compose(cho, jung, 'ㅎ')
            candidates.add((root, r if jong == ' ' else jong + r))

        # ㅎ-contraction irregular:
        # 시퍼렜 + 다 -> 시퍼렇 + 었다, 파랬 + 다 -> 파랗 + 았다
        if jung in ('ㅐ', 'ㅔ'):
            restored = 'ㅓ' if jung == 'ㅔ' else 'ㅏ'
            # exception: 그렇 + 아 -> 그래
            if len(l) >= 2 and l[-2] == '그' and cho == 'ㄹ':
                root = front + '렇'
            else:
                root = front + compose(cho, restored, 'ㅎ')
            candidates.add((root, compose('ㅇ', restored, jong) + r))

        # Pre-defined set
        if (l, r) in self._predefined:
            candidates.update(self._predefined[(l, r)])

        return candidates

Esempio n. 20

0

Mostra file

File: _conjugation.py Progetto: tobby2002/soynlp

def _conjugate_root(root):
    """Generate the surface variants a root can take before an ending.

    Each rule below covers one (ir)regular conjugation type and, when the
    last syllable of ``root`` matches, adds the correspondingly modified
    root to the result.  The unmodified ``root`` is always included.

    :param root: non-empty root string
    :return: set of candidate root surfaces
    """
    front = root[:-1]
    l_last_ = root[-1]
    cho, jung, jong = decompose(l_last_)

    candidates = {root}

    # ㄷ-irregular: 깨닫 + 아 -> 깨달아
    if jong == 'ㄷ':
        candidates.add(front + compose(cho, jung, 'ㄹ'))

    # 르-irregular: 구르 + 어 -> 굴러
    if l_last_ == '르' and len(root) >= 2:
        c0, c1, _ = decompose(root[-2])
        candidates.add(root[:-2] + compose(c0, c1, 'ㄹ'))

    # ㅂ-irregular:
    # (vowel harmony kept) 더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (vowel harmony broken) 아름답 + 아 -> 아름다워 / (-답, -꼽, -깝, -롭)
    if jong == 'ㅂ':
        candidates.add(front + compose(cho, jung, ' '))

    # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ):
    # 이 + ㅂ니다 -> 입니다
    if jong == ' ':
        for new_jong in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
            candidates.add(front + compose(cho, jung, new_jong))

    # ㅅ-irregular: 붓 + 어 -> 부어
    # exception: 벗 + 어 -> 벗어
    if jong == 'ㅅ' and l_last_ != '벗':
        candidates.add(front + compose(cho, jung, ' '))

    # 우-irregular: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if jung == 'ㅜ' and jong == ' ':
        if l_last_ == '푸':
            # The 퍼 form used to be computed but never stored.
            candidates.add(front + '퍼')
        else:
            candidates.add(front + compose(cho, 'ㅝ', ' '))
            candidates.add(front + compose(cho, 'ㅝ', 'ㅆ'))

    # 오 conjugation: 오 + 았어 -> 왔어
    if jung == 'ㅗ' and jong == ' ':
        candidates.add(front + compose(cho, 'ㅘ', ' '))
        candidates.add(front + compose(cho, 'ㅘ', 'ㅆ'))

    # ㅡ-dropping irregular: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if l_last_ in ('끄', '크', '트'):
        candidates.add(front + compose(cho, 'ㅓ', ' '))
        candidates.add(front + compose(cho, 'ㅓ', 'ㅆ'))

    # 거라/너라-irregular: regular once '-거라/-너라' are treated as endings.

    # 러-irregular: 이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다 (not handled)

    # 여-irregular:
    # 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    # 하 + 았다 -> 했다
    if l_last_ == '하':
        candidates.add(front + '해')
        candidates.add(front + '했')

    if jong == 'ㅎ' and l_last_ != '좋':
        # ㅎ-dropping irregular: 파랗 + 면 -> 파라면 / 동그랗 + ㄴ -> 동그란
        # (the ㅂ variant was disabled in the original code)
        for new_jong in (' ', 'ㄴ', 'ㄹ', 'ㅆ'):
            candidates.add(front + compose(cho, jung, new_jong))
        # ㅎ-contraction irregular: 파랗 + 았다 -> 파랬다
        # (the ㅔ variant, 시퍼렇 + 었다 -> 시퍼렜다, was disabled in the
        # original code)
        candidates.add(front + compose(cho, 'ㅐ', 'ㅆ'))

    # ㅎ + 네: both dropping and keeping ㅎ are correct.

    return candidates

Esempio n. 21

0

Mostra file

File: _lemmatizer.py Progetto: songys/soynlp

def _lemma_candidate(l, r, predefined=None):
    """Enumerate (stem, ending) lemma candidates for the split ``l`` + ``r``.

    The last syllable of ``l`` and the first syllable of ``r`` are decomposed
    and a series of independent rules, one per conjugation type, each add a
    candidate when their pattern matches.  The unmodified pair ``(l, r)`` is
    always included.

    :param l: non-empty left part (surface stem)
    :param r: right part (surface ending), may be empty
    :param predefined: optional mapping ``(l, r) -> iterable of candidates``
        whose entries are added to the result
    :return: set of candidates
    """
    def add_lemma(stem, ending):
        candidates.add((stem, ending))

    candidates = {(l, r)}

    l_last = decompose(l[-1])
    l_last_ = compose(l_last[0], l_last[1], ' ')
    l_front = l[:-1]
    r_first = decompose(r[0]) if r else ('', '', '')
    r_first_ = compose(r_first[0], r_first[1], ' ') if r else ' '
    r_end = r[1:]

    # ㄷ-irregular: 깨달 + 아 -> 깨닫 + 아
    if l_last[2] == 'ㄹ' and r_first[0] == 'ㅇ':
        l_stem = l_front + compose(l_last[0], l_last[1], 'ㄷ')
        add_lemma(l_stem, r)

    # 르-irregular: 굴 + 러 -> 구르 + 어
    if l_last[2] == 'ㄹ' and r_first_ in ('러', '라'):
        l_stem = l_front + compose(l_last[0], l_last[1], ' ') + '르'
        r_canon = compose('ㅇ', r_first[1], r_first[2]) + r_end
        add_lemma(l_stem, r_canon)

    # ㅂ-irregular: 더러 + 워서 -> 더럽 + 어서
    if l_last[2] == ' ' and r_first_ in ('워', '와'):
        l_stem = l_front + compose(l_last[0], l_last[1], 'ㅂ')
        r_canon = compose('ㅇ', 'ㅏ' if r_first_ == '와' else 'ㅓ',
                          r_first[2]) + r_end
        add_lemma(l_stem, r_canon)

    # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅅ):
    # 입 + 니다 -> 이 + ㅂ니다
    if l_last[2] in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
        l_stem = l_front + compose(l_last[0], l_last[1], ' ')
        r_canon = l_last[2] + r
        add_lemma(l_stem, r_canon)

    # ㅅ-irregular: 부 + 어 -> 붓 + 어
    # exception: 벗 + 어 -> 벗어
    if (l_last[2] == ' ' and l[-1] != '벗') and (r_first[0] == 'ㅇ'):
        l_stem = l_front + compose(l_last[0], l_last[1], 'ㅅ')
        add_lemma(l_stem, r)

    # 우-irregular: 똥퍼 + '' -> 똥푸 + 어
    if l_last_ == '퍼':
        l_stem = l_front + '푸'
        r_canon = compose('ㅇ', l_last[1], l_last[2]) + r
        add_lemma(l_stem, r_canon)

    # 우-irregular: 줬 + 어 -> 주 + 었어
    if l_last[1] == 'ㅝ':
        l_stem = l_front + compose(l_last[0], 'ㅜ', ' ')
        r_canon = compose('ㅇ', 'ㅓ', l_last[2]) + r
        add_lemma(l_stem, r_canon)

    # 오-irregular: 왔 + 어 -> 오 + 았어
    if l_last[1] == 'ㅘ':
        l_stem = l_front + compose(l_last[0], 'ㅗ', ' ')
        r_canon = compose('ㅇ', 'ㅏ', l_last[2]) + r
        add_lemma(l_stem, r_canon)

    # ㅡ-dropping irregular: 꺼 + '' -> 끄 + 어 / 텄 + 어 -> 트 + 었어
    if l_last[1] in ('ㅓ', 'ㅏ'):
        l_stem = l_front + compose(l_last[0], 'ㅡ', ' ')
        r_canon = compose('ㅇ', l_last[1], l_last[2]) + r
        add_lemma(l_stem, r_canon)

    # 거라/너라-irregular: regular once '-거라/-너라' are treated as endings.
    # TODO: handle l ending in '가' followed by '라' / '거라'.

    # 러-irregular: 이르 + 러 -> 이르다
    # TODO: needs a stem check for r starting with ㄹ + ㅓ.

    # 여-irregular: 하 + 였다 -> 하 + 았다 -> 하다
    # (regular once '였다' is treated as an ending)

    # 여-irregular (2):
    # 했 + 다 -> 하 + 았다 / 해 + 라니깐 -> 하 + 아라니깐 / 했 + 었다 -> 하 + 았었다
    if l_last[0] == 'ㅎ' and l_last[1] == 'ㅐ':
        l_stem = l_front + '하'
        r_canon = compose('ㅇ', 'ㅏ', l_last[2]) + r
        add_lemma(l_stem, r_canon)

    # ㅎ irregulars only apply to these final consonants.
    if l_last[2] in (' ', 'ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
        # ㅎ-dropping: 파라 + 면 -> 파랗 + 면
        if l_last[1] in ('ㅏ', 'ㅓ'):
            l_stem = l_front + compose(l_last[0], l_last[1], 'ㅎ')
            r_canon = r if l_last[2] == ' ' else l_last[2] + r
            add_lemma(l_stem, r_canon)
        # ㅎ-contraction:
        # 시퍼렜 + 다 -> 시퍼렇 + 었다, 파랬 + 다 -> 파랗 + 았다
        if l_last[1] in ('ㅐ', 'ㅔ'):
            restored = 'ㅓ' if l_last[1] == 'ㅔ' else 'ㅏ'
            # exception: 그렇 + 아 -> 그래
            if len(l) >= 2 and l[-2] == '그' and l_last[0] == 'ㄹ':
                l_stem = l_front + '렇'
            else:
                l_stem = l_front + compose(l_last[0], restored, 'ㅎ')
            r_canon = compose('ㅇ', restored, l_last[2]) + r
            add_lemma(l_stem, r_canon)

    # 이었 -> 였 (regular contraction):
    # 좋아졌 + 어 -> 좋아지 + 었어, 좋아졋 + 던 -> 좋아지 + 었던
    # A final ㅆ misspelled as ㅅ is accepted as well.
    # The vowel alternatives are grouped so that the onset and
    # final-consonant guards apply to both ㅕ and ㅓ; previously "or ㅓ"
    # escaped the conjunction and matched every ㅓ syllable.
    if ((l_last[0] != 'ㅇ') and
            (l_last[2] in ('ㅆ', 'ㅅ', ' ')) and
            (l_last[1] in ('ㅕ', 'ㅓ'))):
        l_stem = l_front + compose(l_last[0], 'ㅣ', ' ')
        r_canon = compose('ㅇ', 'ㅓ', l_last[2]) + r
        add_lemma(l_stem, r_canon)

    # Pre-defined set
    if predefined and (l, r) in predefined:
        for stem in predefined[(l, r)]:
            candidates.add(stem)

    return candidates

Esempio n. 22

0

Mostra file

File: _conjugation.py Progetto: songys/soynlp

def _conjugate_stem(stem):
    """Generate the surface variants a stem can take before an ending.

    Each rule below covers one (ir)regular conjugation type and, when the
    last syllable of ``stem`` matches, adds the correspondingly modified
    stem to the result.  The unmodified ``stem`` is always included.

    :param stem: non-empty stem string
    :return: set of candidate stem surfaces
    """
    front = stem[:-1]
    l_last_ = stem[-1]
    cho, jung, jong = decompose(l_last_)

    candidates = {stem}

    # ㄷ-irregular: 깨닫 + 아 -> 깨달아
    if jong == 'ㄷ':
        candidates.add(front + compose(cho, jung, 'ㄹ'))

    # 르-irregular: 구르 + 어 -> 굴러
    if l_last_ == '르' and len(stem) >= 2:
        c0, c1, _ = decompose(stem[-2])
        candidates.add(stem[:-2] + compose(c0, c1, 'ㄹ'))

    # ㅂ-irregular:
    # (vowel harmony kept) 더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (vowel harmony broken) 아름답 + 아 -> 아름다워 / (-답, -꼽, -깝, -롭)
    if jong == 'ㅂ':
        candidates.add(front + compose(cho, jung, ' '))

    # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ):
    # 이 + ㅂ니다 -> 입니다
    if jong == ' ':
        for new_jong in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
            candidates.add(front + compose(cho, jung, new_jong))

    # ㅅ-irregular: 붓 + 어 -> 부어
    # exception: 벗 + 어 -> 벗어
    if jong == 'ㅅ' and l_last_ != '벗':
        candidates.add(front + compose(cho, jung, ' '))

    # 우-irregular: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if jung == 'ㅜ' and jong == ' ':
        if l_last_ == '푸':
            # The 퍼 form used to be computed but never stored.
            candidates.add(front + '퍼')
        else:
            candidates.add(front + compose(cho, 'ㅝ', ' '))
            candidates.add(front + compose(cho, 'ㅝ', 'ㅆ'))

    # 오 conjugation: 오 + 았어 -> 왔어
    if jung == 'ㅗ' and jong == ' ':
        candidates.add(front + compose(cho, 'ㅘ', ' '))
        candidates.add(front + compose(cho, 'ㅘ', 'ㅆ'))

    # ㅡ-dropping irregular: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if l_last_ in ('끄', '크', '트'):
        candidates.add(front + compose(cho, 'ㅓ', ' '))
        candidates.add(front + compose(cho, 'ㅓ', 'ㅆ'))

    # 거라/너라-irregular: regular once '-거라/-너라' are treated as endings.

    # 러-irregular: 이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다 (not handled)

    # 여-irregular:
    # 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    # 하 + 았다 -> 했다
    if l_last_ == '하':
        candidates.add(front + '해')
        candidates.add(front + '했')

    if jong == 'ㅎ' and l_last_ != '좋':
        # ㅎ-dropping irregular: 파랗 + 면 -> 파라면 / 동그랗 + ㄴ -> 동그란
        # (the ㅂ variant was disabled in the original code)
        for new_jong in (' ', 'ㄴ', 'ㄹ', 'ㅆ'):
            candidates.add(front + compose(cho, jung, new_jong))
        # ㅎ-contraction irregular: 파랗 + 았다 -> 파랬다
        # (the ㅔ variant, 시퍼렇 + 었다 -> 시퍼렜다, was disabled in the
        # original code)
        candidates.add(front + compose(cho, 'ㅐ', 'ㅆ'))

    # ㅎ + 네: both dropping and keeping ㅎ are correct.

    # 이었 -> 였 (regular contraction)
    if jung == 'ㅣ' and jong == ' ':
        candidates.add(front + compose(cho, 'ㅕ', 'ㅆ'))

    return candidates

Esempio n. 23

0

Mostra file

def pos(s, remove_tag=[], c_tag=[], _opt = '-tip1+sw', _in = 'sample.in', _out = 'sample.out', thread=True):
    """POS-tag ``s`` by running the external ``kma.exe`` analyzer on temp files.

    The text is written to ``_in`` (KSC5601-encoded, at most 50 words per
    line), ``kma.exe`` is run with ``_opt`` to produce ``_out``, and the
    parenthesised tokens of the output are parsed with ``_parse``.  A
    particle chunk (``<t|c><e>``) is merged back into one token by composing
    the second part into the first syllable as its final consonant.

    :param s: text to analyze
    :param remove_tag: tags whose tokens are dropped (never mutated here)
    :param c_tag: NOTE(review): immediately overwritten by the module-level
        ``c_tags``, so the argument is effectively ignored - confirm intent
    :param _opt: option string passed to ``kma.exe``
    :param _in: input file name (replaced by a random name if ``thread``)
    :param _out: output file name (replaced by a random name if ``thread``)
    :param thread: use random 6-character file names so concurrent sessions
        do not collide on file names
    :return: list of ``(word, tag)`` tuples (or replacements from
        ``c_tags``) with the hard-coded stop words removed; ``[]`` if
        anything fails (best effort)
    """
    try:
        c_tag = c_tags
        # Concurrent sessions fail when their file names collide.
        if thread:
            def _idGenerator():
                return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6))
            _in = _idGenerator()+'.in'
            _out = _idGenerator()+'.out'

        # NOTE(review): this sweeps every *.in / *.out in the working
        # directory, including files of other running sessions - verify
        # this is acceptable together with ``thread=True``.
        for stale in gl('*.in'):
            os.remove(stale)
        for stale in gl('*.out'):
            os.remove(stale)

        # Break the line every 50 words (KLT emits a WARNING otherwise).
        words = s.split()
        s = '\n'.join(' '.join(words[i:i + 50]) for i in range(0, len(words), 50))
        with codecs.open(_in, 'w+', encoding='KSC5601') as f:
            f.write(s)

        command = ["kma.exe",_opt,_in,_out]
        check_call(command, stdout=DEVNULL, stderr=STDOUT)

        os.remove(_in)   # delete the input file

        with codecs.open(_out, encoding='KSC5601') as f:
            tokend_text = f.read()

        os.remove(_out)  # delete the output file

        # Raw string: same pattern text as before, without invalid escapes.
        str_token = re.findall(pattern=r'\([\w ]+\)', string=tokend_text)
        poses = list(map(_parse, str_token))

        # Remove unwanted tags.
        if len(remove_tag):
            poses = [(w,t) for w,t in poses if t not in remove_tag ]

        chunker = RegexpParser('JOSA:{<t|c><e>}')
        chunks  = chunker.parse(poses)
        chunks  = [chunk if isinstance(chunk, tuple) else chunk.leaves() for chunk in chunks]
        poses = []
        for item in chunks:
            if isinstance(item, list):
                # A matched chunk: [(word, tag), (particle, tag)].
                w1, t1 = item[0]
                jong, t2 = item[1]
                try:
                    chojung = decompose(w1)
                    w = compose(chojung[0], chojung[1], jong)
                except Exception:
                    # Could not merge as a final consonant: concatenate.
                    w = w1+jong

                if w1 == '하' and jong == '어':
                    w = '해'
                item = (w, t1+t2)
            # Apply custom tag replacements in order.
            for org,cus in c_tag:
                if org == item:
                    item = cus
            poses.append(item)

        # Remove stop words.
        stop_words = [('의','N'),('을','N'),('를','N'),('대한','N'),('인해','N'),('중','N'),('등','N')]
        poses = [ item for item in poses if item not in stop_words]
        return poses
    except Exception:
        # Best effort by design: any failure yields an empty result, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return []

Esempio n. 24

0

Mostra file

    def _candidates(self, l, r):
        """Collect (root, ending) lemma candidates for the split ``l`` + ``r``.

        The last syllable of ``l`` and the first syllable of ``r`` are
        decomposed; every rule below handles one irregular conjugation type
        and contributes a candidate when it matches.  ``(l, r)`` itself is
        always part of the result, and entries found under ``(l, r)`` in
        ``self._predefined`` are added at the end.

        :param l: non-empty left part (surface stem)
        :param r: right part (surface ending), may be empty
        :return: set of candidates
        """
        candidates = {(l, r)}
        add = candidates.add

        head = l[:-1]
        cho, jung, jong = decompose(l[-1])
        l_open = compose(cho, jung, ' ')      # last syllable without final
        if r:
            r_cho, r_jung, r_jong = decompose(r[0])
            r_open = compose(r_cho, r_jung, ' ')
        else:
            r_cho = r_jung = r_jong = ''
            r_open = ' '

        # ㄷ-irregular: 깨달 + 아 -> 깨닫 + 아
        if jong == 'ㄹ' and r_cho == 'ㅇ':
            add((head + compose(cho, jung, 'ㄷ'), r))

        # 르-irregular: 굴 + 러 -> 구르 + 어
        if jong == 'ㄹ' and r_open in ('러', '라'):
            l_root = head + compose(cho, jung, ' ') + '르'
            add((l_root, compose('ㅇ', r_jung, r_jong) + r[1:]))

        # ㅂ-irregular: 더러 + 워서 -> 더럽 + 어서
        if jong == ' ' and r_open in ('워', '와'):
            l_root = head + compose(cho, jung, 'ㅂ')
            r_vowel = 'ㅏ' if r_open == '와' else 'ㅓ'
            add((l_root, compose('ㅇ', r_vowel, r_jong) + r[1:]))

        # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅅ):
        # 입 + 니다 -> 이 + ㅂ니다
        if jong in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
            add((head + compose(cho, jung, ' '), jong + r))

        # ㅅ-irregular: 부 + 어 -> 붓 + 어
        # exception: 벗 + 어 -> 벗어
        if jong == ' ' and l[-1] != '벗' and r_cho == 'ㅇ':
            add((head + compose(cho, jung, 'ㅅ'), r))

        # 우-irregular: 똥퍼 + '' -> 똥푸 + 어
        if l_open == '퍼':
            add((head + '푸', compose('ㅇ', jung, jong) + r))

        # 우-irregular: 줬 + 어 -> 주 + 었어
        if jung == 'ㅝ':
            l_root = head + compose(cho, 'ㅜ', ' ')
            add((l_root, compose('ㅇ', 'ㅓ', jong) + r))

        # 오-irregular: 왔 + 어 -> 오 + 았어
        if jung == 'ㅘ':
            l_root = head + compose(cho, 'ㅗ', ' ')
            add((l_root, compose('ㅇ', 'ㅏ', jong) + r))

        # ㅡ-dropping irregular: 꺼 + '' -> 끄 + 어 / 텄 + 어 -> 트 + 었어
        if jung in ('ㅓ', 'ㅏ'):
            l_root = head + compose(cho, 'ㅡ', ' ')
            add((l_root, compose('ㅇ', jung, jong) + r))

        # 거라/너라-irregular: regular once '-거라/-너라' are treated as
        # endings.  TODO: handle l ending in '가' followed by '라' / '거라'.

        # 러-irregular: 이르 + 러 -> 이르다
        # TODO: needs a root check for r starting with ㄹ + ㅓ.

        # 여-irregular: 하 + 였다 -> 하 + 았다 -> 하다
        # (regular once '였다' is treated as an ending)

        # 여-irregular (2):
        # 했 + 다 -> 하 + 았다 / 해 + 라니깐 -> 하 + 아라니깐 / 했 + 었다 -> 하 + 았었다
        if cho == 'ㅎ' and jung == 'ㅐ':
            add((head + '하', compose('ㅇ', 'ㅏ', jong) + r))

        # ㅎ-dropping irregular: 파라 + 면 -> 파랗 + 면
        if jong in (' ', 'ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
            l_root = head + compose(cho, jung, 'ㅎ')
            add((l_root, r if jong == ' ' else jong + r))

        # ㅎ-contraction irregular:
        # 시퍼렜 + 다 -> 시퍼렇 + 었다, 파랬 + 다 -> 파랗 + 았다
        if jung in ('ㅐ', 'ㅔ'):
            base_vowel = 'ㅓ' if jung == 'ㅔ' else 'ㅏ'
            # exception: 그렇 + 아 -> 그래
            is_geureo = len(l) >= 2 and l[-2] == '그' and cho == 'ㄹ'
            if is_geureo:
                l_root = head + '렇'
            else:
                l_root = head + compose(cho, base_vowel, 'ㅎ')
            add((l_root, compose('ㅇ', base_vowel, jong) + r))

        # Pre-defined set
        if (l, r) in self._predefined:
            for root in self._predefined[(l, r)]:
                add(root)

        return candidates

Esempio n. 25

0

Mostra file

File: basic_test.py Progetto: ysseo91/soynlp

def hangle_test():
    """Smoke-test the helpers exported by ``soynlp.hangle``.

    Each check calls one helper with a fixed input and compares the result
    with a hard-coded expectation. The first mismatch raises ``ValueError``
    whose message names the call and shows the value actually returned.
    A confirmation line is printed when every check passes.

    Raises:
        ValueError: If any helper returns an unexpected value.
    """
    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    # normalize: digits and latin letters are expected to disappear, leaving
    # the two Hangul runs separated by a single space.
    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    # decompose: a full syllable yields (initial, vowel, final); a bare jamo
    # is expected to fill the missing slots with a single space.
    if not (('ㄱ', 'ㅏ', 'ㄴ') == decompose('간')):
        raise ValueError('decompose("간") -> {}'.format(decompose('간')))

    if not ((' ', 'ㅗ', ' ') == decompose('ㅗ')):
        raise ValueError('decompose("ㅗ") -> {}'.format(decompose('ㅗ')))

    if not (('ㅋ', ' ', ' ') == decompose('ㅋ')):
        raise ValueError('decompose("ㅋ") -> {}'.format(decompose('ㅋ')))

    # compose: inverse of decompose for a full syllable.
    if not ('감' == compose('ㄱ', 'ㅏ', 'ㅁ')):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(
            compose('ㄱ', 'ㅏ', 'ㅁ')))

    # Character-class predicates: one positive and one negative case each.
    if not character_is_korean('감'):
        raise ValueError('character_is_korean("감") -> {}'.format(
            character_is_korean('감')))

    if character_is_korean('a'):
        raise ValueError('character_is_korean("a") -> {}'.format(
            character_is_korean('a')))

    if not character_is_jaum('ㅋ'):
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(
            character_is_jaum('ㅋ')))

    if character_is_jaum('a'):
        raise ValueError('character_is_jaum("a") -> {}'.format(
            character_is_jaum('a')))

    # The failure messages below must report character_is_moum (the function
    # under test), not character_is_jaum.
    if not character_is_moum('ㅗ'):
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(
            character_is_moum('ㅗ')))

    if character_is_moum('a'):
        raise ValueError('character_is_moum("a") -> {}'.format(
            character_is_moum('a')))

    # to_base: 12593 is the code point of 'ㄱ'.
    if not (to_base('ㄱ') == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(to_base('ㄱ')))

    # Edit distances: syllable level with default cost, syllable level with a
    # custom substitution cost, and jamo level.
    if 1 != levenshtein('가나', '가남'):
        raise ValueError("levenshtein('가나', '가남') -> {}".format(
            levenshtein('가나', '가남')))

    if 0.1 != levenshtein('가나', '가남', {('나', '남'): 0.1}):
        raise ValueError(
            "levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(
                levenshtein('가나', '가남', {('나', '남'): 0.1})))

    # NOTE(review): exact float comparison; this only holds if the library
    # computes the value as the same 1 / 3 division.
    if 1 / 3 != jamo_levenshtein('가나', '가남'):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(
            jamo_levenshtein('가나', '가남')))

    print('all hangle tests have been successed\n')

Esempio n. 26

0

Mostra file

File: _lemmatizer.py Progetto: parksjin01/soynlp-2.7

    def _candidates(self, l, r):
        """Collect dictionary-form lemma candidates for the split ``l + r``.

        The last syllable of ``l`` and the first syllable of ``r`` are
        decomposed into jamo, and each Korean irregular-conjugation rule that
        matches proposes a restored stem. A stem is kept (with '다' appended)
        only when ``self.is_root`` accepts it; the 거라 rule and the entries
        of ``self._predefined`` are added unconditionally.

        Args:
            l: Left part of the word (surface stem); must be non-empty because
                its last character is decomposed.
            r: Right part of the word (surface ending); may be empty.

        Returns:
            set: Candidate lemmas. Rule-generated ones are strings ending in
            '다'; predefined ones are whatever ``self._predefined[l + r]``
            holds.
        """
        candidates = set()

        def add_if_root(root):
            # Keep a proposed stem only if the root dictionary knows it.
            if self.is_root(root):
                candidates.add(root + '다')

        # Regular case: l is already a known root.
        add_if_root(l)

        l_front = l[:-1]
        cho, jung, jong = decompose(l[-1])
        # Last syllable of l with its final consonant removed.
        l_last_ = compose(cho, jung, ' ')
        r_first = decompose(r[0]) if r else ('', '', '')
        # First syllable of r with its final consonant removed.
        r_first_ = compose(r_first[0], r_first[1], ' ') if r else ' '

        ## 1. Irregular conjugations that change the stem
        # 1.1. ㄷ irregular: 깨닫 + 아 -> 깨달아
        if jong == 'ㄹ' and r_first[0] == 'ㅇ':
            add_if_root(l_front + compose(cho, jung, 'ㄷ'))

        # 1.2. 르 irregular: 굴 + 러 -> 구르다
        if jong == 'ㄹ' and r_first_ in ('러', '라'):
            add_if_root(l_front + l_last_ + '르')

        # 1.3. ㅂ irregular: 더러 + 워서 -> 더럽다
        if jong == ' ' and r_first_ == '워':
            add_if_root(l_front + compose(cho, jung, 'ㅂ'))

        # 1.3. ㅂ irregular: 도 + 왔다 -> 돕다 (l is a single syllable here,
        # so there is no prefix to keep)
        if l in ('도', '고') and r_first_ == '와':
            add_if_root(compose(cho, jung, 'ㅂ'))

        # 1.3. (extra) ㅂ belongs to the ending: 입 + 니다 -> 이다,
        # 합 + 니다 -> 하다. The leading syllables of l are kept so that
        # multi-syllable stems are restored as a whole (사랑합 -> 사랑하다).
        if jong == 'ㅂ':
            add_if_root(l_front + l_last_)

        # 1.4. ㅅ irregular: 부 + 었다 -> 붓다
        if jong == ' ' and r_first[0] == 'ㅇ':
            add_if_root(l_front + compose(cho, jung, 'ㅅ'))

        # 1.5. 우 irregular: 똥퍼 + '' -> 똥푸다
        if l_last_ == '퍼':
            add_if_root(l_front + '푸')

        # 1.5. 우 irregular: 줬 + 어 -> 주다
        if jung == 'ㅝ':
            add_if_root(l_front + compose(cho, 'ㅜ', ' '))

        # 1.6. ㅡ drop: 꺼 + '' -> 끄다 / 텄 + 어 -> 트다
        if jung in ('ㅓ', 'ㅏ'):
            add_if_root(l_front + compose(cho, 'ㅡ', ' '))

        ## 2. Irregular conjugations that change the ending
        # 2.1. 거라 irregular: added without consulting is_root
        if (l[-1] == '가') and (r and (r[0] == '라' or r[:2] == '거라')):
            candidates.add(l + '다')

        # 2.2. 너라 irregular
        # 2.2.1: regular cases (돌아오 + 너라, 돌아오 + 라고) are covered by
        #        the plain is_root(l) check above
        # 2.2.2: 돌아 + 왔다 -> 돌아오다
        if jung == 'ㅘ':
            add_if_root(l_front + compose(cho, 'ㅗ', ' '))

        # 2.3. 러 irregular: 이르 + 러 -> 이르다
        if r_first[0] == 'ㄹ' and r_first[1] == 'ㅓ':
            add_if_root(l)

        # 2.4. 여 irregular
        # 하 + 였다 -> 하다: solved by registering '였다' as an ending.

        # 2.5. 오 irregular
        # 달 + 아라 -> 다오, 걸 + 어라 -> 거오: literary style, rare in
        # colloquial text, therefore omitted.

        ## 3. Irregular conjugations that change both stem and ending
        # 3.1. ㅎ irregular
        # 3.1.1: 파라 + 면 -> 파랗다
        if jong == ' ':
            add_if_root(l_front + compose(cho, jung, 'ㅎ'))

        # 3.1.2: 시퍼렜 + 다 -> 시퍼렇다, 파랬 + 다 -> 파랗다,
        # 파래 + '' -> 파랗다
        if jung in ('ㅐ', 'ㅔ'):
            restored = 'ㅓ' if jung == 'ㅔ' else 'ㅏ'
            add_if_root(l_front + compose(cho, restored, 'ㅎ'))

        # (extra) 3.2. The ending is a bare final consonant:
        # 간 + '' -> 가다, 푸른 + '' -> 푸르다, 한 + '' -> 하다
        if (not r) and jong in ('ㄴ', 'ㄹ'):
            add_if_root(l_front + compose(cho, jung, ' '))
            # 노란 -> 노랗다
            add_if_root(l_front + compose(cho, jung, 'ㅎ'))

        ## Pre-defined set, keyed by the whole surface form
        if l + r in self._predefined:
            for root in self._predefined[l + r]:
                candidates.add(root)

        return candidates

Esempio n. 27

0

Mostra file

File: _lemmatizer.py Progetto: zeroday0619/soynlp

def lemma_candidate(l, r, predefined=None, debug=False):
    """Return the (stem, ending) pairs that can conjugate into ``l + r``.

    The surface split ``(l, r)`` is expanded with every canonical
    (stem, ending) pair proposed by the irregular-conjugation rules below.
    Each proposal is then verified by calling ``conjugate(stem, ending)`` and
    kept only if the surface word ``l + r`` is among the generated forms.

    Args:
        l: Left part of the word (surface stem); must be non-empty because
            its last character is decomposed.
        r: Right part of the word (surface ending); may be empty.
        predefined: Optional mapping from ``(l, r)`` to an iterable of
            ``(stem, ending)`` pairs that are added as extra proposals.
        debug: When true, every proposal is reported via ``debug_message``.

    Returns:
        set: The verified ``(stem, ending)`` tuples.
    """
    def add_lemma(stem, ending):
        candidates.add((stem, ending))

    # The unchanged split is always a proposal (regular conjugation).
    candidates = {(l, r)}
    word = l + r

    l_last = decompose(l[-1])
    # Last syllable of l without its final consonant.
    l_last_ = compose(l_last[0], l_last[1], ' ')
    l_front = l[:-1]
    r_first = decompose(r[0]) if r else ('', '', '')
    # First syllable of r without its final consonant.
    r_first_ = compose(r_first[0], r_first[1], ' ') if r else ' '
    r_end = r[1:]

    # ㄷ irregular: 깨달 + 아 -> 깨닫 + 아
    if l_last[2] == 'ㄹ' and r_first[0] == 'ㅇ':
        l_stem = l_front + compose(l_last[0], l_last[1], 'ㄷ')
        add_lemma(l_stem, r)
        if debug:
            debug_message('ㄷ 불규칙 활용', l_stem, r)

    # 르 irregular: 굴 + 러 -> 구르 + 어
    if (l_last[2] == 'ㄹ') and (r_first_ == '러' or r_first_ == '라'):
        l_stem = l_front + compose(l_last[0], l_last[1], ' ') + '르'
        r_canon = compose('ㅇ', r_first[1], r_first[2]) + r_end
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('르 불규칙 활용', l_stem, r_canon)

    # ㅂ irregular: 더러 + 워서 -> 더럽 + 어서
    if (l_last[2] == ' '):
        l_stem = l_front + compose(l_last[0], l_last[1], 'ㅂ')
        if (r_first_ == '워' or r_first_ == '와'):
            r_canon = compose('ㅇ', 'ㅏ' if r_first_ == '와' else 'ㅓ',
                              r_first[2] if r_first[2] else ' ') + r_end
        elif (r_end and r_end[0] == '려'):
            r_canon = compose('ㅇ', 'ㅜ',
                              r_first[2] if r_first[2] else ' ') + r_end
        else:
            r_canon = r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('ㅂ 불규칙 활용', l_stem, r_canon)

    # The ending starts with a bare final consonant (-ㄴ, -ㄹ, -ㅁ-, -ㅂ, -ㅆ)
    # 입 + 니다 -> 이 + ㅂ니다
    # The final consonant of l is moved to the ending and the stem is tried
    # with no final consonant and with ㄹ, ㅂ, ㅎ in its place.
    if l_last[2] == 'ㄴ' or l_last[2] == 'ㄹ' or l_last[2] == 'ㅁ' or l_last[
            2] == 'ㅂ' or l_last[2] == 'ㅆ':
        for jongsung in ' ㄹㅂㅎ':
            if l_last[2] == jongsung:
                continue
            l_stem = l_front + compose(l_last[0], l_last[1], jongsung)
            r_canon = l_last[2] + r
            add_lemma(l_stem, r_canon)
            if debug:
                debug_message('어미의 첫글자가 종성일 경우 (%s)' % jongsung, l_stem,
                              r_canon)

    # ㅅ irregular: 부 + 어 -> 붓 + 어
    # exception: 벗 + 어 -> 벗어
    if (l_last[2] == ' ' and l[-1] != '벗') and (r_first[0] == 'ㅇ'):
        l_stem = l_front + compose(l_last[0], l_last[1], 'ㅅ')
        add_lemma(l_stem, r)
        if debug:
            debug_message('ㅅ 불규칙 활용', l_stem, r)

    # 우 irregular: 똥퍼 + '' -> 똥푸 + 어
    if l_last_ == '퍼':
        l_stem = l_front + '푸'
        r_canon = compose('ㅇ', l_last[1], l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('우 불규칙 활용 (퍼)', l_stem, r_canon)

    # 우 irregular: 줬 + 어 -> 주 + 었어
    if l_last[1] == 'ㅝ':
        l_stem = l_front + compose(l_last[0], 'ㅜ', ' ')
        r_canon = compose('ㅇ', 'ㅓ', l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('우 불규칙 활용', l_stem, r_canon)

    # 오 irregular: 왔 + 어 -> 오 + 았어
    if l_last[1] == 'ㅘ':
        l_stem = l_front + compose(l_last[0], 'ㅗ', ' ')
        r_canon = compose('ㅇ', 'ㅏ', l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('오 불규칙 활용', l_stem, r_canon)

    # ㅡ drop: 꺼 + '' -> 끄 + 어 / 텄 + 어 -> 트 + 었어
    if (l_last[1] == 'ㅓ' or l_last[1] == 'ㅏ'):
        l_stem = l_front + compose(l_last[0], 'ㅡ', ' ')
        r_canon = compose('ㅇ', l_last[1], l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('ㅡ 탈락 불규칙 활용 (꺼)', l_stem, r_canon)

    # ㅡ drop: 모 + 았다 -> 모으 + 았다
    if l_last[2] == ' ' and r_first[0] == 'ㅇ' and (r_first[1] == 'ㅏ'
                                                   or r_first[1] == 'ㅓ'):
        l_stem = l + '으'
        r_canon = r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('ㅡ 탈락 불규칙 활용 (모으)', l_stem, r_canon)

    # 거라 / 너라 irregular: regular once '-거라/-너라' are treated as
    # endings, so no rule is needed here.

    # 러 irregular (이르 + 러 -> 이르다): not implemented.

    # 여 irregular
    # 하 + 였다 -> 하 + 았다: regular once '였다' is treated as an ending.

    # 여 irregular (2)
    # 했 + 다 -> 하 + 았다 / 해 + 라니깐 -> 하 + 아라니깐 /
    # 했 + 었다 -> 하 + 았었다
    if l_last[0] == 'ㅎ' and l_last[1] == 'ㅐ':
        l_stem = l_front + '하'
        r_canon = compose('ㅇ', 'ㅏ', l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('여 불규칙 활용', l_stem, r_canon)

    # ㅎ irregular (drop)
    if (l_last[2] == ' ' or l_last[2] == 'ㄴ' or l_last[2] == 'ㄹ'
            or l_last[2] == 'ㅂ' or l_last[2] == 'ㅆ'):
        # 파라 + 면 -> 파랗 + 면
        if (l_last[1] == 'ㅏ' or l_last[1] == 'ㅓ'):
            l_stem = l_front + compose(l_last[0], l_last[1], 'ㅎ')
            r_canon = r if l_last[2] == ' ' else l_last[2] + r
            add_lemma(l_stem, r_canon)
            if debug:
                debug_message('ㅎ 탈락 불규칙 활용', l_stem, r_canon)
        # ㅎ irregular (contraction)
        # 시퍼렜 + 다 -> 시퍼렇 + 었다, 파랬 + 다 -> 파랗 + 았다
        if (l_last[1] == 'ㅐ') or (l_last[1] == 'ㅔ'):
            # exception: 그렇 + 아 -> 그래
            if len(l) >= 2 and l[-2] == '그' and l_last[0] == 'ㄹ':
                l_stem = l_front + '렇'
            else:
                l_stem = l_front + compose(
                    l_last[0], 'ㅓ' if l_last[1] == 'ㅔ' else 'ㅏ', 'ㅎ')
            r_canon = compose('ㅇ', 'ㅓ' if l_last[1] == 'ㅔ' else 'ㅏ',
                              l_last[2]) + r
            add_lemma(l_stem, r_canon)
            if debug:
                debug_message('ㅎ 축약 불규칙 활용', l_stem, r_canon)

    # 이었 -> 였 (regular contraction)
    # 좋아졌 + 어 -> 좋아지 + 었어, 좋아졋 + 던 -> 좋아지 + 었던,
    # 좋아져 + 서 -> 좋아지 + 어서, 였 + 어 -> 이 + 었어
    # A final ㅅ written in place of ㅆ (a frequent misspelling) is accepted.
    # The original nested test "(initial is ㅇ and vowel is ㅕ) or initial is
    # not ㅇ" is always true once the vowel is known to be ㅕ, so it is
    # folded into this single condition.
    if ((l_last[2] == 'ㅆ' or l_last[2] == 'ㅅ' or l_last[2] == ' ')
            and (l_last[1] == 'ㅕ')):
        l_stem = l_front + compose(l_last[0], 'ㅣ', ' ')
        r_canon = compose('ㅇ', 'ㅓ', l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('이었 -> 였 규칙 활용', l_stem, r_canon)

    ## Pre-defined set
    if predefined and (l, r) in predefined:
        for stem in predefined[(l, r)]:
            candidates.add(stem)
            if debug:
                # Report the predefined pair itself. The rule-local variables
                # l_stem / r_canon may be stale or unbound at this point.
                debug_message('Predefined', stem[0], stem[1])

    # Keep only the proposals that really conjugate into the surface word.
    candidates_ = set()
    for stem, eomi in candidates:
        if not eomi:
            continue
        # hard rule: an ending never starts with a syllable whose final
        # consonant is ㅎ
        if decompose(eomi[0])[2] == 'ㅎ':
            continue
        surfaces = conjugate(stem, eomi)
        if word in surfaces:
            candidates_.add((stem, eomi))
    return candidates_

Esempio n. 28

0

Mostra file

File: _conjugation.py Progetto: songys/soynlp

def conjugate(stem, ending):
    """Generate the surface forms obtained by attaching ``ending`` to ``stem``.

    The last syllable of ``stem`` and the first syllable of ``ending`` are
    decomposed into jamo; every Korean conjugation rule that matches adds one
    surface form. If no rule matches and the ending starts with a full
    syllable, the plain concatenation is returned.

    Args:
        stem: Verb/adjective stem; must be non-empty.
        ending: Ending (eomi); must be non-empty. Its first character may be
            a bare consonant jamo such as 'ㄴ' or 'ㅂ'.

    Returns:
        set: Candidate surface strings (possibly empty when the ending starts
        with a bare jamo that no rule handles).

    Raises:
        AssertionError: If ``ending`` is empty.
    """

    assert ending # ending must be inserted

    l_len = len(stem)
    l_last = decompose(stem[-1])
    l_last_ = stem[-1]
    r_first = decompose(ending[0])
    # First syllable of the ending without its final consonant; a bare jamo
    # is kept unchanged.
    r_first_ = compose(r_first[0], r_first[1], ' ') if r_first[1] != ' ' else ending[0]

    candidates = set()

    # ㄷ irregular: 깨닫 + 아 -> 깨달아
    if l_last[2] == 'ㄷ' and r_first[0] == 'ㅇ':
        l = stem[:-1] + compose(l_last[0], l_last[1], 'ㄹ')
        candidates.add(l + ending)

    # 르 irregular: 구르 + 어 -> 굴러
    if (l_last_ == '르') and (r_first_ == '아' or r_first_ == '어') and l_len >= 2:
        c0, c1, _ = decompose(stem[-2])
        l = stem[:-2] + compose(c0, c1, 'ㄹ')
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        candidates.add(l + r)

    # ㅂ irregular:
    # (vowel harmony) 더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (harmony broken) 아름답 + 아 -> 아름다워 / (-답, -꼽, -깝, -롭)
    if (l_last[2] == 'ㅂ') and (r_first_ == '어' or r_first_ == '아'):
        l = stem[:-1] + compose(l_last[0], l_last[1], ' ')
        if l_len >= 2 and (l_last_ == '답' or l_last_ == '곱' or l_last_ == '깝' or l_last_ == '롭'):
            c1 = 'ㅝ'
        elif r_first[1] == 'ㅗ':
            c1 = 'ㅘ'
        elif r_first[1] == 'ㅜ':
            c1 = 'ㅝ'
        elif r_first_ == '어':
            c1 = 'ㅝ'
        else: # r_first_ == '아'
            c1 = 'ㅘ'
        r = compose('ㅇ', c1, r_first[2]) + ending[1:]
        candidates.add(l + r)

    # The ending starts with a bare final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ)
    # 이 + ㅂ니다 -> 입니다
    if l_last[2] == ' ' and r_first[1] == ' ' and (r_first[0] == 'ㄴ' or r_first[0] == 'ㄹ' or r_first[0] == 'ㅂ' or r_first[0] == 'ㅆ'):
        l = stem[:-1] + compose(l_last[0], l_last[1], r_first[0])
        r = ending[1:]
        candidates.add(l + r)

    # ㅅ irregular: 붓 + 어 -> 부어
    # exception: 벗 + 어 -> 벗어
    if (l_last[2] == 'ㅅ') and (r_first[0] == 'ㅇ'):
        if stem[-1] == '벗':
            l = stem
        else:
            l = stem[:-1] + compose(l_last[0], l_last[1], ' ')
        candidates.add(l + ending)

    # 우 irregular: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if l_last[1] == 'ㅜ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[1] == 'ㅓ':
        if l_last_ == '푸':
            # Keep the leading syllables of the stem (똥푸 + 어 -> 똥퍼) and
            # the final consonant of the ending (푸 + 었다 -> 펐다).
            l = stem[:-1] + compose(l_last[0], 'ㅓ', r_first[2])
        else:
            l = stem[:-1] + compose(l_last[0], 'ㅝ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # 오 contraction: 오 + 았어 -> 왔어
    if l_last[1] == 'ㅗ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[1] == 'ㅏ':
        l = stem[:-1] + compose(l_last[0], 'ㅘ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # ㅡ drop: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if (l_last_ == '끄' or l_last_ == '크' or l_last_ == '트') and (r_first[0] == 'ㅇ') and (r_first[1] == 'ㅓ'):
        l = stem[:-1] + compose(l_last[0], r_first[1], r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # 거라 / 너라 irregular
    # (regular once '-거라/-너라' are treated as endings)
    if ending[:2] == '어라' or ending[:2] == '아라':
        if l_last[1] == 'ㅏ':
            r = '거' + ending[1:]
        elif l_last[1] == 'ㅗ':
            r = '너' + ending[1:]
        else:
            r = ending
        candidates.add(stem + r)

    # 러 irregular: 이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다
    if l_last_ == '르' and r_first[0] == 'ㅇ' and r_first[1] == 'ㅓ':
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        candidates.add(stem + r)

    # 여 irregular
    # 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    if l_last_ == '하' and r_first[0] == 'ㅇ' and (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ'):
        # case 1: 하였-
        r = compose(r_first[0], 'ㅕ', r_first[2]) + ending[1:]
        candidates.add(stem + r)
        # case 2: 했-
        l = stem[:-1] + compose('ㅎ', 'ㅐ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # ㅎ irregular (drop)
    # 파랗 + 면 -> 파라면 / 동그랗 + ㄴ -> 동그란
    if l_last[2] == 'ㅎ' and l_last_ != '좋' and not (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ'):
        if r_first[1] == ' ':
            # A bare-consonant ending replaces the dropped ㅎ.
            l = stem[:-1] + compose(l_last[0], l_last[1], r_first[0])
        else:
            l = stem[:-1] + compose(l_last[0], l_last[1], ' ')
        if r_first_ == '으' or r_first[1] == ' ':
            # The first character of the ending is consumed (으 is elided,
            # a bare consonant was merged into the stem); the remainder of
            # the ending must be kept: 파랗 + ㅂ니다 -> 파랍니다.
            r = ending[1:]
        else:
            r = ending
        candidates.add(l + r)

    # ㅎ irregular (contraction)
    # 파랗 + 았다 -> 파랬다 / 시퍼렇 + 었다 -> 시퍼렜다
    if l_last[2] == 'ㅎ' and l_last_ != '좋' and (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ'):
        l = stem[:-1] + compose(l_last[0], 'ㅐ' if r_first[1] == 'ㅏ' else 'ㅔ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # ㅎ + 네: both dropping and keeping ㅎ are accepted
    if l_last[2] == 'ㅎ' and r_first[0] == 'ㄴ' and r_first[1] != ' ':
        candidates.add(stem + ending)

    # 이었 -> 였 (regular contraction)
    if ending[0] == '었' and l_last[1] == 'ㅣ' and l_last[2] == ' ':
        candidates.add(stem[:-1] + compose(l_last[0], 'ㅕ', 'ㅆ') + ending[1:])

    # Fallback: plain concatenation when no rule applied and the ending
    # starts with a full syllable.
    if not candidates and r_first[1] != ' ':
        candidates.add(stem + ending)

    return candidates

Esempio n. 29

0

Mostra file

def conjugate(stem, ending, enforce_moum_harmoney=False, debug=False):
    """Generate the surface forms obtained by attaching ``ending`` to ``stem``.

    The last syllable of ``stem`` and the first syllable of ``ending`` are
    decomposed into jamo; every Korean conjugation rule that matches adds one
    or more surface forms. When no rule matches and the ending starts with a
    full syllable, a regular concatenation is added instead.

    Args:
        stem: Verb/adjective stem; must be non-empty.
        ending: Ending (eomi); must be non-empty. Its first character may be
            a bare consonant jamo such as 'ㄴ' or 'ㅂ'.
        enforce_moum_harmoney: When true, the vowel of an ending starting
            with ㅇ is first rewritten to agree with the stem's last vowel,
            using the module-level vowel tables.
        debug: When true, the decomposed jamo and each generated surface are
            printed.

    Returns:
        set: Candidate surface strings (possibly empty when the ending starts
        with a bare jamo that no rule handles).

    Raises:
        AssertionError: If ``ending`` is empty.
    """

    assert ending  # ending must be inserted

    l_len = len(stem)
    l_last = list(decompose(stem[-1]))
    l_last_ = stem[-1]
    r_first = list(decompose(ending[0]))

    # Check whether the vowels are positive or negative.
    # ㅂ-irregular stems are skipped because they may break vowel harmony.
    if enforce_moum_harmoney:
        if ((l_last[2] != 'ㅂ' and l_last[1] in positive_moum)
                and (r_first[0] == 'ㅇ' and r_first[1] in negative_moum)):
            r_first[1] = neg_to_pos[r_first[1]]
            ending = compose(*r_first) + ending[1:]
        if ((l_last[2] != 'ㅂ' and l_last[1] in negative_moum)
                and (r_first[0] == 'ㅇ' and r_first[1] in positive_moum)):
            r_first[1] = pos_to_neg[r_first[1]]
            ending = compose(*r_first) + ending[1:]
        if (l_last[1] in neuter_moum) and (r_first[1] in positive_moum):
            r_first[1] = pos_to_neg[r_first[1]]
            ending = compose(*r_first) + ending[1:]

    # First syllable of the (possibly harmonised) ending without its final
    # consonant; a bare jamo is kept unchanged.
    r_first_ = compose(r_first[0], r_first[1],
                       ' ') if r_first[1] != ' ' else ending[0]

    candidates = set()

    if debug:
        print('l_last = {}'.format(l_last))
        print('r_first = {}'.format(r_first))

    # Endings starting with '다' attach without any change.
    if ending[0] == '다':
        surface = stem + ending
        candidates.add(surface)
        if debug:
            print('\'다\'로 시작하는 어미: {}'.format(surface))

    # ㄷ irregular: 깨닫 + 아 -> 깨달아
    if l_last[2] == 'ㄷ' and r_first[0] == 'ㅇ':
        l = stem[:-1] + compose(l_last[0], l_last[1], 'ㄹ')
        surface = l + ending
        candidates.add(surface)
        candidates.add(stem + ending)  # regular ㄷ stems: 받 + 았다 -> 받았다
        if debug:
            print('ㄷ 불규칙: {}'.format(surface))

    # 르 irregular: 구르 + 어 -> 굴러 (푸르 follows the 러 rule instead)
    if ((l_last_ == '르' and stem[-2:] != '푸르')
            and (r_first_ == '아' or r_first_ == '어') and l_len >= 2):
        c0, c1, _ = decompose(stem[-2])
        l = stem[:-2] + compose(c0, c1, 'ㄹ')
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('르 불규칙: {}'.format(surface))

    # ㅂ irregular:
    # (vowel harmony) 더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (harmony broken) 아름답 + 아 -> 아름다워 / (-답, -꼽, -깝, -롭)
    if (l_last[2] == 'ㅂ'):
        l = stem[:-1] + compose(l_last[0], l_last[1], ' ')
        if (r_first_ == '어' or r_first_ == '아'):
            if l_len >= 2 and (l_last_ == '답' or l_last_ == '곱'
                               or l_last_ == '깝' or l_last_ == '롭'):
                c1 = 'ㅝ'
            elif r_first[1] == 'ㅗ':
                c1 = 'ㅘ'
            elif r_first[1] == 'ㅜ':
                c1 = 'ㅝ'
            elif r_first_ == '어':
                c1 = 'ㅝ'
            else:  # r_first_ == '아'
                c1 = 'ㅘ'
            r = compose('ㅇ', c1, r_first[2]) + ending[1:]
            surface = l + r
            candidates.add(surface)
            if debug:
                print('ㅂ 불규칙: {}'.format(surface))
        elif r_first[0] == 'ㅇ':  # 돕 + 울까 = 도울까, 답 + 울까 = 다울까
            surface = l + ending
            candidates.add(surface)
            if debug:
                print('ㅂ 불규칙: {}'.format(surface))

    # The ending starts with a bare final consonant (-ㄴ, -ㄹ, -ㅁ-, -ㅂ, -ㅆ)
    # 이 + ㅂ니다 -> 입니다
    # The consonant becomes the final consonant of the stem's last syllable.
    # (The original also had a nested `r_first[1] != ' '` branch here, which
    # could never run inside this condition and has been dropped.)
    if r_first[1] == ' ' and (r_first[0] == 'ㄴ' or r_first[0] == 'ㄹ'
                              or r_first[0] == 'ㅁ' or r_first[0] == 'ㅂ'
                              or r_first[0] == 'ㅆ'):
        l = stem[:-1] + compose(l_last[0], l_last[1], r_first[0])
        r = ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('어미의 첫 글자가 -ㄴ, -ㄹ, -ㅁ-, -ㅂ, -ㅆ 인 경우: {}'.format(surface))

    # ㅅ irregular: 붓 + 어 -> 부어
    # exception: 벗 + 어 -> 벗어
    if (l_last[2] == 'ㅅ') and (r_first[0] == 'ㅇ'):
        if stem[-1] == '벗':
            l = stem
        else:
            l = stem[:-1] + compose(l_last[0], l_last[1], ' ')
        surface = l + ending
        candidates.add(surface)
        if debug:
            print('ㅅ 불규칙: {}'.format(surface))

    # 우 irregular: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if l_last[1] == 'ㅜ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[
            1] == 'ㅓ':
        if l_last_ == '푸':
            # Carry over the ending's final consonant: 푸 + 었다 -> 펐다.
            l = stem[:-1] + compose(l_last[0], 'ㅓ', r_first[2])
        else:
            l = stem[:-1] + compose(l_last[0], 'ㅝ', r_first[2])
        r = ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('우 불규칙: {}'.format(surface))

    # 오 contraction: 오 + 았어 -> 왔어
    if l_last[1] == 'ㅗ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[
            1] == 'ㅏ':
        l = stem[:-1] + compose(l_last[0], 'ㅘ', r_first[2])
        r = ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('오 활용: {}'.format(surface))

    # ㅡ drop: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if ((l_last[1] == 'ㅡ') and (l_last[2] == ' ') and (r_first[0] == 'ㅇ')):
        if l_last[0] == 'ㅇ' and len(stem) > 1:
            # The stem ends in '으': the whole syllable disappears.
            surface = stem[:-1] + ending
        elif l_last[0] != 'ㄹ':
            surface = stem[:-1] + compose(l_last[0], r_first[1],
                                          r_first[2]) + ending[1:]
        else:
            # Stems ending in '르' are left to the 르 / 러 rules.
            surface = None
        if surface is not None:
            candidates.add(surface)
        if debug and surface is not None:
            print('ㅡ 탈락 불규칙: {}'.format(surface))

    # 거라 / 너라 irregular
    # (regular once '-거라/-너라' are treated as endings; no longer accepted
    # as a rule in recent grammar)
    if ending[:2] == '어라' or ending[:2] == '아라':
        # 돌아오 + 아라 -> 돌아와라
        if stem[-1] == '오':
            l = stem[:-1]
            r = '와' + ending[1:]
        # 그리우 + 어라 -> 그리워라
        elif stem[-1] == '우':
            l = stem[:-1]
            r = '워' + ending[1:]
        # 가 + 아라 -> 가라
        elif stem[-1] == '가':
            l = stem
            r = ending[1:]
        else:
            if l_last[1] in negative_moum:
                l = stem
                r = '어' + ending[1:]
            else:
                l = stem
                r = '아' + ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('거라/너라 불규칙: {}'.format(surface))

    # 러 irregular: 이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다
    # (구르 follows the 르 rule instead)
    if ((l_last_ == '르' and stem[-2:] != '구르')
            and (r_first[0] == 'ㅇ' and r_first[1] == 'ㅓ')):
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        surface = stem + r
        candidates.add(surface)
        if debug:
            print('러 불규칙: {}'.format(surface))

    # 여 irregular
    # 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    if l_last_ == '하' and r_first[0] == 'ㅇ' and (r_first[1] == 'ㅏ'
                                                 or r_first[1] == 'ㅓ'):
        # case 1: 하였-
        r = compose(r_first[0], 'ㅕ', r_first[2]) + ending[1:]
        surface0 = stem + r
        candidates.add(surface0)
        # case 2: 했-
        l = stem[:-1] + compose('ㅎ', 'ㅐ', r_first[2])
        r = ending[1:]
        surface1 = l + r
        candidates.add(surface1)
        if debug:
            print('여 불규칙: {}, {}'.format(surface0, surface1))

    # ㅎ irregular (drop)
    # 파랗 + 면 -> 파라면 (좋 and 놓 keep their ㅎ)
    if l_last[2] == 'ㅎ' and r_first[1] != ' ':
        if l_last_ == '좋' or l_last_ == '놓':
            l = stem
        else:
            l = stem[:-1] + compose(l_last[0], l_last[1], ' ')
        r = ending
        surface = l + r
        candidates.add(surface)
        if debug:
            print('ㅎ 탈락 불규칙: {}'.format(surface))

    # ㅎ irregular (contraction)
    # 파랗 + 았다 -> 파랬다 / 시퍼렇 + 었다 -> 시퍼렜다
    # The ending must start with ㅇ for BOTH vowels. Without the inner
    # parentheses `and` binds tighter than `or`, and any ending whose first
    # vowel is ㅓ (e.g. 거든, 더니) was contracted as well.
    if ((l_last[2] == 'ㅎ' and l_last_ != '좋') and
        (r_first[0] == 'ㅇ' and (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ'))):
        l = stem[:-1] + compose(l_last[0], 'ㅐ' if r_first[1] == 'ㅏ' else 'ㅔ',
                                r_first[2])
        r = ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('ㅎ 축약 불규칙: {}'.format(surface))

    # ㅎ + 네: both dropping and keeping ㅎ are accepted
    if l_last[2] == 'ㅎ' and r_first[0] == 'ㄴ' and r_first[1] != ' ':
        surface = stem + ending
        candidates.add(surface)
        if debug:
            print('ㅎ + 네 불규칙: {}'.format(surface))

    # 이 + 어 -> 여 (regular contraction): 만지 + 었어 -> 만졌어,
    # 만지 + 어서 -> 만져서. The uncontracted form is added too.
    if r_first_ == '어' and l_last[1] == 'ㅣ' and l_last[2] == ' ':
        surface = stem[:-1] + compose(l_last[0], 'ㅕ', r_first[2]) + ending[1:]
        candidates.add(surface)
        surface = stem + ending
        candidates.add(surface)
        if debug:
            print('이 + 어 -> 여 규칙: {}'.format(surface))

    # Fallback: regular attachment when no rule applied and the ending starts
    # with a full syllable.
    if not candidates and r_first[1] != ' ':
        if (l_last[2] == ' ') and (r_first[0] == 'ㅇ') and (r_first[1]
                                                           == l_last[1]):
            # Same vowel on both sides: the ending's first syllable merges
            # into the stem and only its final consonant survives.
            l = stem[:-1] + compose(l_last[0], l_last[1], r_first[2])
            r = ending[1:]
            surface = l + r
            candidates.add(surface)
        else:
            surface = stem + ending
            candidates.add(surface)
        if debug:
            print('L + R 규칙 결합: {}'.format(surface))

    return candidates

Esempio n. 30

0

Mostra file

File: _conjugation.py Progetto: tobby2002/soynlp

def conjugate(root, ending):
    """Generate candidate surface forms for a Korean stem joined to an ending.

    Every conjugation rule below is tried independently and each form it
    yields is collected, so one input may produce several candidates. When no
    rule fires and the ending starts with a syllable that has a vowel, the
    plain concatenation ``root + ending`` is returned as the only candidate.

    The code relies on ``decompose(ch)`` returning a 3-item sequence
    (initial consonant, vowel, final consonant) with ``' '`` for an absent
    part, and on ``compose`` being its inverse -- both are defined outside
    this block.

    Args:
        root: Stem of the verb/adjective. Its last character is decomposed,
            so it must be non-empty.
        ending: Ending (eomi) to attach. Must be non-empty. Its first
            character may be a bare consonant (e.g. 'ㄴ', 'ㅂ니다').

    Returns:
        A set of candidate surface strings. It can be empty when no rule
        applies and the ending starts with a bare consonant.

    Raises:
        AssertionError: If ``ending`` is empty.
    """
    assert ending  # an ending must be supplied

    l_len = len(root)
    l_last = decompose(root[-1])
    l_last_ = root[-1]
    r_first = decompose(ending[0])
    # First syllable of the ending with its final consonant stripped, or the
    # raw character when the ending starts with a bare consonant.
    r_first_ = compose(r_first[0], r_first[1], ' ') if r_first[1] != ' ' else ending[0]

    candidates = set()

    # ㄷ irregular: 깨닫 + 아 -> 깨달아
    if l_last[2] == 'ㄷ' and r_first[0] == 'ㅇ':
        l = root[:-1] + compose(l_last[0], l_last[1], 'ㄹ')
        candidates.add(l + ending)

    # 르 irregular: 구르 + 어 -> 굴러
    if (l_last_ == '르') and (r_first_ == '아' or r_first_ == '어') and l_len >= 2:
        c0, c1, c2 = decompose(root[-2])
        l = root[:-2] + compose(c0, c1, 'ㄹ')
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        candidates.add(l + r)

    # ㅂ irregular:
    # (vowel harmony kept)   더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (vowel harmony broken) 아름답 + 아 -> 아름다워 for -답, -꼽, -깝, -롭
    if (l_last[2] == 'ㅂ') and (r_first_ == '어' or r_first_ == '아'):
        l = root[:-1] + compose(l_last[0], l_last[1], ' ')
        # The original tested '곱' here, contradicting the rule's own notes
        # (곱 + 아 -> 고와 keeps harmony; -꼽 is the suffix that breaks it).
        breaks_harmony = l_len >= 2 and l_last_ in ('답', '꼽', '깝', '롭')
        # r_first_ is '어' or '아' here, so the ending vowel is ㅓ or ㅏ only.
        c1 = 'ㅝ' if (breaks_harmony or r_first_ == '어') else 'ㅘ'
        r = compose('ㅇ', c1, r_first[2]) + ending[1:]
        candidates.add(l + r)

    # Ending that starts with a bare final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ)
    # 이 + ㅂ니다 -> 입니다
    if l_last[2] == ' ' and r_first[1] == ' ' and r_first[0] in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
        l = root[:-1] + compose(l_last[0], l_last[1], r_first[0])
        r = ending[1:]
        candidates.add(l + r)

    # ㅅ irregular: 붓 + 어 -> 부어
    # exception: 벗 + 어 -> 벗어
    if (l_last[2] == 'ㅅ') and (r_first[0] == 'ㅇ'):
        if root[-1] == '벗':
            l = root
        else:
            l = root[:-1] + compose(l_last[0], l_last[1], ' ')
        candidates.add(l + ending)

    # 우 irregular: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if l_last[1] == 'ㅜ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[1] == 'ㅓ':
        if l_last_ == '푸':
            # Keep the root prefix and carry the ending's final consonant,
            # so 푸 + 었다 -> 펐다 (a fixed '퍼' would yield '퍼다').
            l = root[:-1] + compose('ㅍ', 'ㅓ', r_first[2])
        else:
            l = root[:-1] + compose(l_last[0], 'ㅝ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # 오 contraction: 오 + 았어 -> 왔어
    if l_last[1] == 'ㅗ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[1] == 'ㅏ':
        l = root[:-1] + compose(l_last[0], 'ㅘ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # ㅡ-drop irregular: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if l_last_ in ('끄', '크', '트') and (r_first[0] == 'ㅇ') and (r_first[1] == 'ㅓ'):
        l = root[:-1] + compose(l_last[0], r_first[1], r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # 거라 / 너라 irregular
    # (regular if '-거라/-너라' are themselves treated as endings)
    if ending[:2] == '어라' or ending[:2] == '아라':
        if l_last[1] == 'ㅏ':
            r = '거' + ending[1:]
        elif l_last[1] == 'ㅗ':
            r = '너' + ending[1:]
        else:
            r = ending
        candidates.add(root + r)

    # 러 irregular: 이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다
    if l_last_ == '르' and r_first[0] == 'ㅇ' and r_first[1] == 'ㅓ':
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        candidates.add(root + r)

    # 여 irregular
    # 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    if l_last_ == '하' and r_first[0] == 'ㅇ' and (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ'):
        # case 1: long form, 하였다
        r = compose(r_first[0], 'ㅕ', r_first[2]) + ending[1:]
        candidates.add(root + r)
        # case 2: contracted form, 했다
        l = root[:-1] + compose('ㅎ', 'ㅐ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # ㅎ-drop irregular
    # 파랗 + 면 -> 파라면 / 동그랗 + ㄴ -> 동그란
    if l_last[2] == 'ㅎ' and l_last_ != '좋' and not (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ'):
        if r_first[1] == ' ':
            # Bare-consonant ending: the consonant takes ㅎ's place and the
            # remainder of the ending is kept (동그랗 + ㄴ데 -> 동그란데).
            l = root[:-1] + compose(l_last[0], l_last[1], r_first[0])
            r = ending[1:]
        elif r_first_ == '으':
            # The 으 syllable is absorbed; its final consonant, if any, moves
            # onto the stem (파랗 + 은 -> 파란, 파랗 + 으면 -> 파라면).
            l = root[:-1] + compose(l_last[0], l_last[1], r_first[2])
            r = ending[1:]
        else:
            l = root[:-1] + compose(l_last[0], l_last[1], ' ')
            r = ending
        candidates.add(l + r)

    # ㅎ-contraction irregular
    # 파랗 + 았다 -> 파랬다 / 시퍼렇 + 었다 -> 시퍼렜다
    # Only endings that start with 아/어 contract; without the 'ㅇ' check an
    # ending such as '다' would lose its first syllable (파랗 + 다 -> 파래).
    if (l_last[2] == 'ㅎ' and l_last_ != '좋' and r_first[0] == 'ㅇ'
            and (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ')):
        l = root[:-1] + compose(l_last[0], 'ㅐ' if r_first[1] == 'ㅏ' else 'ㅔ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # ㅎ + 네 irregular
    # both the ㅎ-dropped and the ㅎ-kept form are accepted
    if l_last[2] == 'ㅎ' and r_first[0] == 'ㄴ' and r_first[1] != ' ':
        candidates.add(root + ending)

    # Fallback: no rule matched, so join the parts as they are. Skipped for a
    # bare-consonant ending, which cannot simply be appended.
    if not candidates and r_first[1] != ' ':
        candidates.add(root + ending)

    return candidates