Example #1
0
def hangle_test():
    """Smoke-test the helpers exported by ``soynlp.hangle``.

    Every check calls one helper with a fixed input, compares the result with
    a hard-coded expected value, and raises on the first mismatch. A success
    message is printed when all checks pass.

    Raises:
        ValueError: If a helper returns an unexpected value. The message names
            the call that failed and shows the value it returned.
    """
    import math

    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    # decompose: a full syllable, a lone vowel and a lone consonant.
    result = decompose('간')
    if not (('ㄱ', 'ㅏ', 'ㄴ') == result):
        raise ValueError('decompose("간") -> {}'.format(result))

    result = decompose('ㅗ')
    if not ((' ', 'ㅗ', ' ') == result):
        raise ValueError('decompose("ㅗ") -> {}'.format(result))

    result = decompose('ㅋ')
    if not (('ㅋ', ' ', ' ') == result):
        raise ValueError('decompose("ㅋ") -> {}'.format(result))

    result = compose('ㄱ', 'ㅏ', 'ㅁ')
    if not ('감' == result):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(result))

    # Character-class predicates: one positive and one negative case each.
    result = character_is_korean('감')
    if not result:
        raise ValueError('character_is_korean("감") -> {}'.format(result))

    result = character_is_korean('a')
    if result:
        raise ValueError('character_is_korean("a") -> {}'.format(result))

    result = character_is_jaum('ㅋ')
    if not result:
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(result))

    result = character_is_jaum('a')
    if result:
        raise ValueError('character_is_jaum("a") -> {}'.format(result))

    # The failure messages below used to name (and call) character_is_jaum,
    # which made a character_is_moum failure report the wrong helper.
    result = character_is_moum('ㅗ')
    if not result:
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(result))

    result = character_is_moum('a')
    if result:
        raise ValueError('character_is_moum("a") -> {}'.format(result))

    # 12593 == ord('ㄱ')
    result = to_base('ㄱ')
    if not (result == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(result))

    result = levenshtein('가나', '가남')
    if 1 != result:
        raise ValueError("levenshtein('가나', '가남') -> {}".format(result))

    # Float-valued distances are compared with a tolerance rather than with
    # exact equality, so harmless rounding differences do not fail the test.
    result = levenshtein('가나', '가남', {('나', '남'):0.1})
    if not math.isclose(0.1, result):
        raise ValueError("levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(result))

    result = jamo_levenshtein('가나', '가남')
    if not math.isclose(1/3, result):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(result))

    print('all hangle tests have been successed\n\n')
Example #2
0
def jamo_to_word(jamo):
    """Rebuild text from a jamo sequence.

    The input is scanned left to right. An item for which
    ``character_is_korean`` is false is copied through unchanged. Otherwise
    the next three items are taken as one group and passed to ``compose`` as
    its three arguments; a ``"-"`` in the third position is replaced by a
    single space before the call.

    Args:
        jamo: The jamo sequence to convert (presumably the output of a
            sentence-to-jamo helper that uses "-" as a placeholder -- TODO
            confirm against the caller).

    Returns:
        The concatenation of the copied items and the composed results.

    Raises:
        IndexError: If the input ends with a Korean group of exactly two
            items, because the third position is read unconditionally.
    """
    # Step 1: cut the input into single items and three-item groups.
    units = []
    pos = 0
    while pos < len(jamo):
        if character_is_korean(jamo[pos]):
            units.append(jamo[pos:pos + 3])
            pos += 3
            continue
        units.append(jamo[pos])
        pos += 1

    # Step 2: compose every group; one-item units are kept verbatim.
    pieces = []
    for unit in units:
        if len(unit) == 1:
            pieces.append(unit)
            continue
        third = " " if unit[2] == "-" else unit[2]
        pieces.append(compose(unit[0], unit[1], third))
    return "".join(pieces)
Example #3
0
def jamo_sentence(sent):
    """Replace each Korean character of ``sent`` with its decomposed form.

    Characters for which ``character_is_korean`` is false are kept as they
    are. Korean characters are passed to ``decompose``; the returned parts are
    joined into one string, with every ``' '`` part written as ``'-'``.
    Finally ``doublespace_pattern`` matches are replaced by a single space.

    Args:
        sent: The text to convert.

    Returns:
        The converted text.
    """
    def _to_jamo(syllable):
        # Spaces pass through untouched.
        if syllable == ' ':
            return syllable
        parts = decompose(syllable)
        # A one-element result is handed back as-is.
        if len(parts) == 1:
            return parts
        return ''.join('-' if part == ' ' else part for part in parts)

    pieces = [
        _to_jamo(ch) if character_is_korean(ch) else ch
        for ch in sent
    ]
    return doublespace_pattern.sub(' ', ''.join(pieces))
Example #4
0
def sent_to_jamo(sent):
    """Replace each Korean character of ``sent`` with its decomposed form.

    Characters for which ``character_is_korean`` is false are kept as they
    are. Korean characters are passed to ``decompose``; the returned parts are
    joined into one string, with every ``" "`` part written as ``"-"``.
    Finally every run of whitespace is collapsed to a single space.

    Args:
        sent: The text to convert.

    Returns:
        The converted text.
    """
    def transform(char):
        # Spaces pass through untouched.
        if char == " ":
            return char

        cjj = decompose(char)
        # A one-element result is handed back as-is.
        if len(cjj) == 1:
            return cjj

        # "-" marks an empty position in the decomposition.
        return "".join(c if c != " " else "-" for c in cjj)

    sent_ = [
        transform(char) if character_is_korean(char) else char
        for char in sent
    ]

    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions. The pattern is unchanged.
    doublespace_pattern = re.compile(r"\s+")
    return doublespace_pattern.sub(" ", "".join(sent_))
Example #5
0
def hangle_test():
    """Smoke-test the helpers exported by ``soynlp.hangle``.

    Every check calls one helper with a fixed input, compares the result with
    a hard-coded expected value, and raises on the first mismatch. A success
    message is printed when all checks pass.

    Raises:
        ValueError: If a helper returns an unexpected value. The message names
            the call that failed and shows the value it returned.
    """
    import math

    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    # decompose: a full syllable, a lone vowel and a lone consonant.
    result = decompose('간')
    if not (('ㄱ', 'ㅏ', 'ㄴ') == result):
        raise ValueError('decompose("간") -> {}'.format(result))

    result = decompose('ㅗ')
    if not ((' ', 'ㅗ', ' ') == result):
        raise ValueError('decompose("ㅗ") -> {}'.format(result))

    result = decompose('ㅋ')
    if not (('ㅋ', ' ', ' ') == result):
        raise ValueError('decompose("ㅋ") -> {}'.format(result))

    result = compose('ㄱ', 'ㅏ', 'ㅁ')
    if not ('감' == result):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(result))

    # Character-class predicates: one positive and one negative case each.
    result = character_is_korean('감')
    if not result:
        raise ValueError(
            'character_is_korean("감") -> {}'.format(result))

    result = character_is_korean('a')
    if result:
        raise ValueError(
            'character_is_korean("a") -> {}'.format(result))

    result = character_is_jaum('ㅋ')
    if not result:
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(result))

    result = character_is_jaum('a')
    if result:
        raise ValueError('character_is_jaum("a") -> {}'.format(result))

    # The failure messages below used to name (and call) character_is_jaum,
    # which made a character_is_moum failure report the wrong helper.
    result = character_is_moum('ㅗ')
    if not result:
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(result))

    result = character_is_moum('a')
    if result:
        raise ValueError('character_is_moum("a") -> {}'.format(result))

    # 12593 == ord('ㄱ')
    result = to_base('ㄱ')
    if not (result == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(result))

    result = levenshtein('가나', '가남')
    if 1 != result:
        raise ValueError(
            "levenshtein('가나', '가남') -> {}".format(result))

    # Float-valued distances are compared with a tolerance rather than with
    # exact equality, so harmless rounding differences do not fail the test.
    result = levenshtein('가나', '가남', {('나', '남'): 0.1})
    if not math.isclose(0.1, result):
        raise ValueError(
            "levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(
                result))

    result = jamo_levenshtein('가나', '가남')
    if not math.isclose(1 / 3, result):
        raise ValueError(
            "jamo_levenshtein('가나', '가남') -> {}".format(result))

    print('all hangle tests have been successed\n')