import timeit

# jamo_to_word, get_one_edit_apart_words, get_jamo_levenshtein_words and the
# mecab object are assumed to be defined elsewhere in this module.
async def a_query_func(retrieval, vvoca_docs_d, q_noun, target):
    """Look up a jamo-decomposed noun in the retrieval index and, when a close
    spelling variant is found, append (original word, proposed word) to target."""
    s0 = timeit.default_timer()
    rtvoca = None  # initialised up front so the trailing prints never hit an unbound name
    try:
        jamo_v = vvoca_docs_d.get(q_noun, None)
        print('===================================================')
        print('========= start q_noun:%s ==========' % jamo_to_word(q_noun))
        print('#### start q_noun:%s #####' % q_noun)
        if not jamo_v:
            # The noun is not in the known-vocabulary dict: ask the index for candidates.
            vocas, score = retrieval.query(q_noun, return_scores=True, k=10)
            print('vocas, score')
            print(vocas, score)
            # voca_ls = []
            if vocas.any():
                voca_ls = list(map(lambda x: jamo_to_word(x), list(vocas)))
                print('voca_ls:%s' % voca_ls)
                # 1. candidates within one edit of the query word
                one_edit_apart_similar_l = get_one_edit_apart_words(
                    voca_ls, jamo_to_word(q_noun))
                print('1.similar_l:%s' % one_edit_apart_similar_l)
                # 2. candidate with the smallest jamo-level Levenshtein distance
                leven_min_voca, leven_min_score = get_jamo_levenshtein_words(
                    voca_ls, jamo_to_word(q_noun))
                print('2.leven_min_voca:%s' % leven_min_voca)
                jamo_leven_morphs_l = mecab.morphs(leven_min_voca)
                print('3. jamo_leven_morphs_l:%s' % jamo_leven_morphs_l)
                q_word_morphs_l = mecab.morphs(jamo_to_word(q_noun))
                print('4. q_word_morphs_l:%s' % q_word_morphs_l)
                vocas_intersec = list(
                    set(jamo_leven_morphs_l) & set(q_word_morphs_l))
                voca_intersec = ' '.join(vocas_intersec)
                print('5. voca_intersec:%s' % voca_intersec)
                # Accept the candidate only when both similarity checks agree
                # and it is at least as long as the query word.
                if one_edit_apart_similar_l and leven_min_voca:
                    if one_edit_apart_similar_l[0] == leven_min_voca:
                        if len(leven_min_voca) >= len(jamo_to_word(q_noun)):
                            print('-------- q_noun:%s --> %s' % (q_noun, leven_min_voca))
                            rtvoca = leven_min_voca
                            target.append((jamo_to_word(q_noun), rtvoca))
                print('#### end q_noun:%s || %s #####' % (q_noun, rtvoca))
                print('========= end q_noun:%s || %s ==== end ======' % (q_noun, rtvoca))
                print('===================================================')
                return
        print('========= end q_noun:%s || %s ==== end ======' % (q_noun, rtvoca))
        print('===================================================')
    except Exception as e:
        print('a_query_func q_noun(%s) exception:%s' % (q_noun, e))
    ttime = timeit.default_timer() - s0
    print('%s time:%s' % (q_noun, ttime))
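# A minimal usage sketch (not part of the source): because a_query_func is a
# coroutine, many query nouns can be processed concurrently with asyncio.gather.
# The names propose_corrections and q_nouns, and the already-built retrieval /
# vvoca_docs_d objects, are assumptions used only for illustration.
import asyncio

async def propose_corrections(retrieval, vvoca_docs_d, q_nouns):
    corrections = []  # filled in place by a_query_func through its `target` argument
    await asyncio.gather(
        *(a_query_func(retrieval, vvoca_docs_d, q, corrections) for q in q_nouns))
    return corrections  # list of (original word, proposed replacement) pairs

# corrections = asyncio.run(propose_corrections(retrieval, vvoca_docs_d, q_nouns))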
def tokenize_by_morpheme_sentence(s):
    o = s
    r = ' '.join(DEFAULT_ANALYZER(str(s)))
    r = ' '.join(DEFAULT_ANALYZER(preprocess_clean_text(r)))
    # r = ' '.join(DEFAULT_ANALYZER(txtclean.lemmatize_raw_text(txtclean.preprocess_raw_text(r))))
    r = ' '.join(mecab.morphs(r))
    r = r + ' ' + o + ' ' + ' '.join(mecab.nouns(o))
    return r
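# A minimal usage sketch (raw_docs and the sample strings are assumptions):
# each document is expanded into "analyzer tokens + original text + MeCab
# nouns", so both surface forms and morphemes become searchable.
# raw_docs = ['오늘 서울 날씨가 맑다', '내일은 비가 온다']
# docs_tokenized = [tokenize_by_morpheme_sentence(d) for d in raw_docs]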
import os
import sys
import mecab

os.chdir(os.getcwd())
mecab = mecab.MeCab()

# Write the converted corpus by redirecting stdout to the output file.
sys.stdout = open("NEW_EXOBRAIN_NE_CORPUS_10000.txt", "w", -1, "utf-8")
f = open("EXOBRAIN_NE_CORPUS_10000.txt", "rt", -1, "utf-8")

while True:
    line = f.readline()
    if not line:
        break
    tokened_str = mecab.morphs(line)
    check = 0
    lemma = []
    type = []
    for i in range(0, len(tokened_str)):
        if '<' in tokened_str[i]:
            # Entering a named-entity annotation in the corpus markup.
            check = 1
        elif check == 0:
            # Plain token outside an annotation: emit it with the "0" (no-entity) tag.
            print(tokened_str[i] + "\t0")
        elif check == 1:
            # Inside an annotation: collect the surface tokens of the entity.
            lemma.append(tokened_str[i])
            if (i + 1) < len(tokened_str):  # guard against indexing past the last token
                if ':' in tokened_str[i + 1]:
def q2propose(q):
    pos = mecab.pos(q.strip())
    nouns = mecab.nouns(q.strip())
    morphs = mecab.morphs(q.strip())
    return pos, nouns, morphs
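# A minimal usage sketch (the sample query is an assumption): q2propose bundles
# MeCab's three views of one query string; the exact tags depend on the
# installed MeCab dictionary.
if __name__ == '__main__':
    pos, nouns, morphs = q2propose('서울 날씨 알려줘')
    print('pos:%s' % pos)        # (surface, POS-tag) pairs
    print('nouns:%s' % nouns)    # nouns only
    print('morphs:%s' % morphs)  # all morpheme surfaces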
def tokenize_by_morpheme_char(s):
    r = ' '.join(DEFAULT_ANALYZER(str(s).strip()))
    r = ' '.join(DEFAULT_ANALYZER(preprocess_clean_text(r)))
    # r = ' '.join(DEFAULT_ANALYZER(txtclean.lemmatize_raw_text(txtclean.preprocess_raw_text(r))))
    r = mecab.morphs(r)
    return r
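# A minimal usage sketch (raw_docs and the indexing use are assumptions): unlike
# tokenize_by_morpheme_sentence, this returns a list of morpheme tokens rather
# than a joined string, which suits tokenizer callbacks of token-based indexes.
# corpus_tokens = [tokenize_by_morpheme_char(doc) for doc in raw_docs]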