Example #1
async def a_query_func(retrieval, vvoca_docs_d, q_noun, target):
    # look up a jamo-decomposed noun in the retrieval index and, when a close
    # match survives the edit-distance checks, append (word, correction) to target
    s0 = timeit.default_timer()
    try:
        jamo_v = vvoca_docs_d.get(q_noun, None)
        print('===================================================')
        print('========= start q_noun:%s ==========' % jamo_to_word(q_noun))
        print('#### start q_noun:%s #####' % q_noun)
        if not jamo_v:
            rtvoca = None  # initialized up front so the final log below cannot raise NameError
            vocas, score = retrieval.query(q_noun, return_scores=True, k=10)
            print('vocas, score')
            print(vocas, score)
            if vocas.any():
                voca_ls = [jamo_to_word(x) for x in vocas]
                print('voca_ls:%s' % voca_ls)
                one_edit_apart_similar_l = get_one_edit_apart_words(
                    voca_ls, jamo_to_word(q_noun))
                print('1.similar_l:%s' % one_edit_apart_similar_l)

                leven_min_voca, leven_min_score = get_jamo_levenshtein_words(
                    voca_ls, jamo_to_word(q_noun))
                print('2.leven_min_voca:%s' % leven_min_voca)

                jamo_leven_morphs_l = mecab.morphs(leven_min_voca)
                print('3. jamo_leven_morphs_l:%s' % jamo_leven_morphs_l)
                q_word_morphs_l = mecab.morphs(jamo_to_word(q_noun))
                print('4. q_word_morphs_l:%s' % q_word_morphs_l)

                vocas_intersec = list(
                    set(jamo_leven_morphs_l) & set(q_word_morphs_l))
                voca_intersec = ' '.join(vocas_intersec)
                print('5. voca_intersec:%s' % voca_intersec)
                if one_edit_apart_similar_l and leven_min_voca:
                    if one_edit_apart_similar_l[0] == leven_min_voca:
                        if len(leven_min_voca) >= len(jamo_to_word(q_noun)):
                            print('-------- q_noun:%s --> %s' %
                                  (q_noun, leven_min_voca))
                            rtvoca = leven_min_voca
                            target.append((jamo_to_word(q_noun), rtvoca))
                            print('#### end q_noun:%s || %s #####' %
                                  (q_noun, rtvoca))
                            print(
                                '========= end q_noun:%s || %s ==== end ======'
                                % (q_noun, rtvoca))
                            print(
                                '==================================================='
                            )
                            return
            print('========= end q_noun:%s || %s ==== end ======' %
                  (q_noun, rtvoca))
            print('===================================================')
    except Exception as e:
        print('a_query_func q_noun(%s) exception:%s' % (q_noun, e))
    ttime = timeit.default_timer() - s0
    print('%s time:%s' % (q_noun, ttime))
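Since a_query_func is a coroutine, callers need an event loop. Below is a minimal invocation sketch, assuming the retrieval index, vvoca_docs_d mapping, and query list come from the surrounding project; run_queries and jamo_queries are hypothetical names introduced here for illustration.

import asyncio

async def run_queries(retrieval, vvoca_docs_d, q_nouns):
    # collect the (original word, proposed correction) pairs found by a_query_func
    target = []
    # a_query_func never awaits internally, so gather() effectively runs the
    # calls in sequence; it still keeps the call site uniform if awaits are added
    await asyncio.gather(*(a_query_func(retrieval, vvoca_docs_d, q, target)
                           for q in q_nouns))
    return target

# corrections = asyncio.run(run_queries(retrieval, vvoca_docs_d, jamo_queries))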
Example #2
def tokenize_by_morpheme_sentence(s):
    o = str(s)  # keep the original sentence; it is appended to the result below
    r = ' '.join(DEFAULT_ANALYZER(o))
    r = ' '.join(DEFAULT_ANALYZER(preprocess_clean_text(r)))
    r = ' '.join(mecab.morphs(r))
    # append the raw sentence and its nouns so surface forms are indexed alongside morphemes
    r = r + ' ' + o + ' ' + ' '.join(mecab.nouns(o))
    return r
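A quick smoke test of the tokenizer, assuming DEFAULT_ANALYZER, preprocess_clean_text, and mecab are set up as in the surrounding module; the sample sentence is illustrative and the exact output depends on the installed dictionary.

s = '자연어 처리는 재미있다'
print(tokenize_by_morpheme_sentence(s))
# expected shape: '<morphemes of s> <original s> <nouns of s>'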
Example #3
import sys
from mecab import MeCab

mecab = MeCab()

# redirect print() output to the new corpus file
sys.stdout = open("NEW_EXOBRAIN_NE_CORPUS_10000.txt", "w", -1, "utf-8")
f = open("EXOBRAIN_NE_CORPUS_10000.txt", "rt", -1, "utf-8")

for line in f:
    tokened_str = mecab.morphs(line)

    check = 0
    lemma = []
    types = []  # renamed from `type` so the builtin is not shadowed

    for i in range(len(tokened_str)):
        if '<' in tokened_str[i]:
            check = 1  # entered an entity marker
        elif check == 0:
            print(tokened_str[i] + "\t0")  # plain token outside any marker
        elif check == 1:
            lemma.append(tokened_str[i])
            # use < rather than <= so the look-ahead cannot index past the end
            if (i + 1) < len(tokened_str):
                if ':' in tokened_str[i + 1]:
                    pass  # the source snippet breaks off at this point
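The snippet above breaks off inside the marker-handling branch. The sketch below shows one plausible completion, assuming the corpus marks entities as <surface:TYPE>; the state machine and the B-/I- output labels are assumptions introduced here, not taken from the truncated original.

    # hypothetical continuation of the per-line loop above
    check = 0
    lemma, types = [], []
    for tok in tokened_str:
        if '<' in tok:
            check = 1                      # entered an entity marker
            lemma, types = [], []
        elif check == 0:
            print(tok + "\t0")             # plain token outside any marker
        elif check == 1:
            if ':' in tok:
                check = 2                  # the entity type follows the colon
            else:
                lemma.append(tok)          # surface tokens of the entity
        elif check == 2:
            if '>' in tok:
                # flush the entity: first token labeled B-, the rest I-
                for j, l in enumerate(lemma):
                    print(l + "\t" + ('B-' if j == 0 else 'I-') + ''.join(types))
                check = 0
            else:
                types.append(tok)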
Example #4
def q2propose(q):
    q = q.strip()  # strip once instead of on every call below
    pos = mecab.pos(q)
    nouns = mecab.nouns(q)
    morphs = mecab.morphs(q)
    return pos, nouns, morphs
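The three analyses unpack directly. A minimal call, assuming mecab = MeCab() from python-mecab-ko is in scope; the query string is illustrative and the outputs depend on the installed dictionary.

pos, nouns, morphs = q2propose('서울 날씨 알려줘')
# pos    -> list of (surface, POS-tag) tuples
# nouns  -> nouns only, e.g. ['서울', '날씨']
# morphs -> every morpheme in order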
Example #5
def tokenize_by_morpheme_char(s):
    r = ' '.join(DEFAULT_ANALYZER(str(s).strip()))
    r = ' '.join(DEFAULT_ANALYZER(preprocess_clean_text(r)))
    r = mecab.morphs(r)  # returns a list of morphemes, not a joined string
    return r
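Unlike tokenize_by_morpheme_sentence above, this variant returns the raw list from mecab.morphs rather than a joined string, which suits analyzers that expect token lists. A one-line check, assuming the same module-level setup:

assert isinstance(tokenize_by_morpheme_char('자연어 처리'), list)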