Ejemplo n.º 1
0
def hangle_test():
    """Smoke-test the soynlp.hangle utilities.

    Checks normalize, compose/decompose, the character-class predicates
    (korean / jaum / moum), to_base, and both levenshtein variants.
    Raises ValueError on the first failing check.
    """
    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    # normalize should strip digits/latin characters, leaving Hangul phrases.
    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    # decompose returns (chosung, jungsung, jongsung); a lone jamo gets
    # padded with spaces in the unused slots.
    if not (('ㄱ', 'ㅏ', 'ㄴ') == decompose('간')):
        raise ValueError('decompose("간") -> {}'.format(decompose('간')))

    if not ((' ', 'ㅗ', ' ') == decompose('ㅗ')):
        raise ValueError('decompose("ㅗ") -> {}'.format(decompose('ㅗ')))

    if not (('ㅋ', ' ', ' ') == decompose('ㅋ')):
        raise ValueError('decompose("ㅋ") -> {}'.format(decompose('ㅋ')))

    if not ('감' == compose('ㄱ', 'ㅏ', 'ㅁ')):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(compose('ㄱ', 'ㅏ', 'ㅁ')))

    if not character_is_korean('감'):
        raise ValueError('character_is_korean("감") -> {}'.format(character_is_korean('감')))

    if character_is_korean('a'):
        raise ValueError('character_is_korean("a") -> {}'.format(character_is_korean('a')))

    if not character_is_jaum('ㅋ'):
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(character_is_jaum('ㅋ')))

    if character_is_jaum('a'):
        raise ValueError('character_is_jaum("a") -> {}'.format(character_is_jaum('a')))

    # BUGFIX: these two error messages previously reported (and re-called)
    # character_is_jaum even though the checks test character_is_moum.
    if not character_is_moum('ㅗ'):
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(character_is_moum('ㅗ')))

    if character_is_moum('a'):
        raise ValueError('character_is_moum("a") -> {}'.format(character_is_moum('a')))

    # to_base exposes the unicode code point of a jamo.
    if not (to_base('ㄱ') == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(to_base('ㄱ')))

    if 1 != levenshtein('가나', '가남'):
        raise ValueError("levenshtein('가나', '가남') -> {}".format(levenshtein('가나', '가남')))

    # A custom substitution-cost dict lowers the distance for that pair.
    if 0.1 != levenshtein('가나', '가남', {('나', '남'):0.1}):
        raise ValueError("levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(levenshtein('가나', '가남', {('나', '남'):0.1})))

    # jamo distance: the two syllables differ in one of three jamo slots.
    if 1/3 != jamo_levenshtein('가나', '가남'):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(jamo_levenshtein('가나', '가남')))

    print('all hangle tests have been successed\n\n')
Ejemplo n.º 2
0
def get_correction(test_word):
    """Try to correct a single-character typo in *test_word*.

    Builds edit-distance-1 variants of the word, keeps those (plus the word
    itself) that never occur in the corpus counter, locates the character
    shared by every unseen variant (the presumed typo), removes it, and
    returns the first known product word at edit distance 1 from the
    remainder. Falls back to returning *test_word* unchanged.
    """
    words_ed1 = get_ed1_words(test_word)
    test_words = words_ed1 + [test_word]

    # Candidates (including the original word) that never occur in the corpus.
    wrong_words = [candidate for candidate in test_words
                   if word_count_ed[0][candidate] == 0]
    print(wrong_words)
    if not wrong_words:
        return test_word

    longest_word = max(wrong_words, key=len)

    # The typo is a character present in *every* unseen candidate.
    # BUGFIX: the original compared the match count against
    # len(longest_word) instead of len(wrong_words), and could leave
    # `typo` unbound (NameError) when no character qualified.
    typo = None
    for char in longest_word:
        if all(char in word for word in wrong_words):
            typo = char
    if typo is None:
        # No common character found; nothing safe to correct.
        return test_word

    typo_idx = test_word.index(typo)
    # Drop the presumed typo character and search for a known word one
    # edit away from what remains.
    exception_word = test_word[:typo_idx] + test_word[typo_idx + 1:]

    for product in word_count[0].keys():
        if levenshtein(exception_word, product) == 1:
            return product

    return test_word
Ejemplo n.º 3
0
 def search_book(self, search_text, n=10):
     """Return the *n* rows of the book table whose titles best match *search_text*.

     Ranking uses the mean of jamo-level and character-level Levenshtein
     distances between each BookTitle and the query.
     """
     result = self.book_table.copy()
     titles = result["BookTitle"].tolist()
     result["JamoEditDis"] = [soyh.jamo_levenshtein(title, search_text) for title in titles]
     result["levEditDis"] = [soyh.levenshtein(title, search_text) for title in titles]
     result["EditDis"] = np.mean(result[["JamoEditDis", "levEditDis"]], axis=1)
     return result.sort_values("EditDis").head(n)
Ejemplo n.º 4
0
def get_editdistance1(combination, bigram_lexicon):
    """Collect lexicon words close to *combination* by Levenshtein distance.

    Returns ``[combination]`` as soon as an exact match (distance 0) is
    found in the lexicon; otherwise returns every lexicon word within
    distance 2, in lexicon order.

    NOTE(review): despite the name suggesting distance 1, the threshold
    here is <= 2 — confirm this is intentional.
    """
    close_words = []
    for lexicon_word in bigram_lexicon:
        distance = levenshtein(combination, lexicon_word)
        if distance == 0:
            return [combination]
        if distance <= 2:
            close_words.append(lexicon_word)
    return close_words
Ejemplo n.º 5
0
# Score each preprocessed lexicon entry against the query terms in `example`
# using a length-normalized Levenshtein similarity, caching per-term scores.
start_time = time.time()
ed_list = []
eps = 10e-4  # small positive floor so cached scores are never exactly 0 (0 means "not cached")
dynamic_dict = defaultdict(Counter)  # dynamic_dict[idx][word] -> cached similarity for example[idx]
for word in lexicon['preprocess']:
    min_ed = []
    word_split = word.split('_')
    for idx, each in enumerate(example):
        each_ed = []
        for word_ in word_split:
            if dynamic_dict[idx][word_] !=0:  
                # Cache hit: reuse the previously computed similarity.
                er = dynamic_dict[idx][word_]
                each_ed.append(er)
            else:
                ed = levenshtein(each.lower(), word_.lower())
                # Similarity in [0, 1]: 1 - distance / longer string length.
                er = 1- (ed/(max(len(word_), len(each))))
                if er <= 0.4:
                    er = 0 + eps  # clamp weak matches down to the eps floor
#             print(f'"{each.lower()}"와 "{word.lower()}"속 단어 "{word_.lower()}" 의  raw ED 값 : {ed},  1- levenshtein 값  : {er}')
#             print(f'"{each.lower()}"와 "{word_.lower()}" 의  raw ED 값 : {ed},  1- levenshtein 값  : {er}')
                each_ed.append(er)
                dynamic_dict[idx][word_] = er
                ### added 2021-01-28 ##
                if er == 1.0:
                    break  # perfect match: skip the remaining tokens of this entry
# Weight each query term's best token match; total_ratio is defined elsewhere —
# presumably per-term weights, TODO confirm.
#         min_ed.append(ed_ratio[idx] * max(each_ed))
        min_ed.append(total_ratio[idx] * max(each_ed))
#         min_ed.append(min(each_ed))
    ed_list.append(sum(min_ed)/len(example))
Ejemplo n.º 6
0
def hangle_test():
    """Smoke-test the soynlp.hangle utilities.

    Checks normalize, compose/decompose, the character-class predicates
    (korean / jaum / moum), to_base, and both levenshtein variants.
    Raises ValueError on the first failing check.
    """
    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    # normalize should strip digits/latin characters, leaving Hangul phrases.
    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    # decompose returns (chosung, jungsung, jongsung); a lone jamo gets
    # padded with spaces in the unused slots.
    if not (('ㄱ', 'ㅏ', 'ㄴ') == decompose('간')):
        raise ValueError('decompose("간") -> {}'.format(decompose('간')))

    if not ((' ', 'ㅗ', ' ') == decompose('ㅗ')):
        raise ValueError('decompose("ㅗ") -> {}'.format(decompose('ㅗ')))

    if not (('ㅋ', ' ', ' ') == decompose('ㅋ')):
        raise ValueError('decompose("ㅋ") -> {}'.format(decompose('ㅋ')))

    if not ('감' == compose('ㄱ', 'ㅏ', 'ㅁ')):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(
            compose('ㄱ', 'ㅏ', 'ㅁ')))

    if not character_is_korean('감'):
        raise ValueError('character_is_korean("감") -> {}'.format(
            character_is_korean('감')))

    if character_is_korean('a'):
        raise ValueError('character_is_korean("a") -> {}'.format(
            character_is_korean('a')))

    if not character_is_jaum('ㅋ'):
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(
            character_is_jaum('ㅋ')))

    if character_is_jaum('a'):
        raise ValueError('character_is_jaum("a") -> {}'.format(
            character_is_jaum('a')))

    # BUGFIX: these two error messages previously reported (and re-called)
    # character_is_jaum even though the checks test character_is_moum.
    if not character_is_moum('ㅗ'):
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(
            character_is_moum('ㅗ')))

    if character_is_moum('a'):
        raise ValueError('character_is_moum("a") -> {}'.format(
            character_is_moum('a')))

    # to_base exposes the unicode code point of a jamo.
    if not (to_base('ㄱ') == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(to_base('ㄱ')))

    if 1 != levenshtein('가나', '가남'):
        raise ValueError("levenshtein('가나', '가남') -> {}".format(
            levenshtein('가나', '가남')))

    # A custom substitution-cost dict lowers the distance for that pair.
    if 0.1 != levenshtein('가나', '가남', {('나', '남'): 0.1}):
        raise ValueError(
            "levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(
                levenshtein('가나', '가남', {('나', '남'): 0.1})))

    # jamo distance: the two syllables differ in one of three jamo slots.
    if 1 / 3 != jamo_levenshtein('가나', '가남'):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(
            jamo_levenshtein('가나', '가남')))

    print('all hangle tests have been successed\n')
Ejemplo n.º 7
0
def get_ed_1(test_word,word_count_split):
    """Return every word in *word_count_split* at Levenshtein distance
    exactly 1 from *test_word*, preserving input order."""
    return [candidate for candidate in word_count_split
            if levenshtein(test_word, candidate) == 1]
Ejemplo n.º 8
0

# In[7]:


word_count[0]


# In[112]:


# test ed search
test_word = '아몬디'
ed_list = {}
for lexicon_word in word_count[0].keys():
    if levenshtein(test_word, lexicon_word)==1:
        print(lexicon_word)
#     ed = jamo_levenshtein(test_word, lexicon_word)
#     ed_list[lexicon_word] = round(ed,3)
    
# ed_df = pd.DataFrame({'food' : list(ed_list.keys()), 'ed' : list(ed_list.values())}, index=range(len(ed_list))).sort_values(by='ed')
# min_ed = ed_df['ed'].values[0]
# if min_ed > 0.667: #한글자만 다른경우 0.667 마지노선
#     print('better to keep rather than correct')
# else:
#     corr = ed_df['food'].values[0]


# In[113]:

Ejemplo n.º 9
0
#     print(MIN, '         ', MIN/(len(ebs_stnc)+len(google_stnc)))
#     print()
#     print('ebs : ',ebs_stnc)
#     print()
#     print('google : ',MIN_stnc)
#     print()
#     print('*******'*10)


# # Check similar sentences


# In[18]:


# Report a length-normalized edit distance for each EBS/Google sentence pair.
for e, g in zip(ws_removed_EBS, most_similar_GOOGLE):
    # Normalize the raw distance by the average length of the two sentences.
    nor_num = levenshtein(e, g)/((len(e)+len(g))/2)
    print('*********'*10)
    # Label pairs: < 0.1 is a good match, > 0.5 is a bad one.
    if nor_num < 0.1:
        print(levenshtein(e, g), '       ', nor_num, '     GOOD')
    elif nor_num > 0.5:
        print(levenshtein(e, g), '       ', nor_num, '     BAAD')
    else:
        print(levenshtein(e, g), '       ', nor_num)  
    print()
    # NOTE(review): looks up g in ws_removed_GOOGLE while iterating
    # most_similar_GOOGLE — confirm the two lists are aligned.
    print('index : ', ws_removed_EBS.index(e),  ' ',ws_removed_GOOGLE.index(g))
    print()
    print('ebs : ', e)
    print()
    print('google : ', g)
    print()