def count_eng(source, target):
    # Score how the Latin-script (e.g. English) tokens of the source are handled in the target.
    print('Source: ', source, '\n')
    print('Target: ', target, '\n')
    source_tokens = [morph for morph, _ in mecab.pos(source)]
    source_is_latin = [hgtk.checker.is_latin1(tok) for tok in source_tokens]
    target_tokens = [morph for morph, _ in mecab.pos(target)]
    target_is_latin = [hgtk.checker.is_latin1(tok) for tok in target_tokens]
    if sum(source_is_latin) == 0:
        # No Latin-script tokens in the source, so there is nothing to measure.
        return 1
    else:
        # recall is 1 when no Latin-script tokens remain in the target.
        recall = 1 - sum(target_is_latin) / sum(source_is_latin)
        return 2 * recall / (recall + 1)
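# Hypothetical usage sketch for count_eng(). It is wrapped in a helper that is never
# called automatically, because the `mecab` instance is only constructed further down
# in this file; the sample sentences are made up for illustration.
def _demo_count_eng():
    example_source = "GPU 서버 로그를 확인했다"      # hypothetical source containing a Latin token
    example_target = "지피유 서버 로그를 확인했다"    # hypothetical target with the token transliterated
    print(count_eng(example_source, example_target))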
def tokenization(doc, rate):
    # Feed selected morphemes into the externally defined count() accumulator.
    for morph, tag in mecab.pos(doc):
        if "SH" in tag or "SL" in tag:
            # Keep hanja (SH) and foreign-language (SL) tokens.
            count(morph, rate)
        elif "NNB" not in tag and "J" not in tag:
            if "IC" not in tag and "S" not in tag:
                if "E" not in tag and "X" not in tag:
                    # Skip dependent nouns, particles, interjections, symbols,
                    # endings, and affixes; count everything else.
                    count(morph, rate)
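# `count` is called above but not defined in this file. A minimal hypothetical
# stand-in, assuming it accumulates a per-token weight into a global table, could
# look like this; it is only defined if no real implementation exists elsewhere.
from collections import defaultdict

if "count" not in globals():
    token_weights = defaultdict(float)  # hypothetical global accumulator

    def count(token, rate):
        # Assumed behavior: add the given rate to this token's running total.
        token_weights[token] += rate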
def tokenize(doc):
    # Same POS filtering as tokenization(), but returns the kept morphemes as a list.
    result = []
    for morph, tag in mecab.pos(doc):
        if "SH" in tag or "SL" in tag:
            result.append(morph)
        # elif morph in stop:
        #     continue
        elif "NNB" not in tag and "J" not in tag:
            if "IC" not in tag and "S" not in tag:
                if "E" not in tag and "X" not in tag:
                    result.append(morph)
    return result
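# Hypothetical usage sketch for tokenize(); wrapped in a helper so nothing runs
# before the `mecab` instance is constructed below. The sample sentence is made up.
def _demo_tokenize():
    sample = "이 문장은 형태소 분석 예시입니다."  # hypothetical sample sentence
    print(tokenize(sample))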
def align_particles(sentence):
    # Group MeCab morphemes into chunks aligned with the whitespace-separated words.
    words = sentence.split()
    particles = mecab.pos(sentence)
    chunks = []
    if len(particles) > 0:
        word_idx = 0
        morphemes = []
        total = []
        for i in range(len(particles)):
            morphemes.append(particles[i][0])
            total.append(particles[i])
            if i + 1 < len(particles):
                # If appending the next morpheme would overflow the current word,
                # close the current chunk and move on to the next word.
                morphemes_next = morphemes + [particles[i + 1][0]]
                if "".join(morphemes_next) not in words[word_idx]:
                    chunks.append(total)
                    word_idx += 1
                    morphemes = []
                    total = []
            else:
                # Last morpheme: close the final chunk.
                chunks.append(total)
    return words, particles, chunks
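# Hypothetical usage sketch for align_particles(), showing its three return values.
def _demo_align_particles():
    sample = "나는 학교에 갔다"  # hypothetical sample sentence
    words, particles, chunks = align_particles(sample)
    print(words)      # whitespace-separated words
    print(particles)  # (morpheme, POS tag) pairs from MeCab
    print(chunks)     # morpheme groups aligned to each word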
# `f` and `line` are assumed to have been opened/read earlier in the script.
f.close()
# sys.stdout = open("/home/ask/PycharmProjects/DistilKoBERT/result.txt", "w")
# f = open("/home/ask/PycharmProjects/DistilKoBERT/result.txt", 'w')

# Rebind the module name to a MeCab instance (requires `import mecab` above).
mecab = mecab.MeCab()

tokened_str = mecab.morphs(line)   # plain morphemes
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")

tokened_str = mecab.nouns(line)    # nouns only
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")

tokened_str = mecab.pos(line)      # (morpheme, POS tag) pairs
print(tokened_str)
# for i in tokened_str:
#     f.write('/'.join(i) + " ")
# f.write("\n")

tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
tokened_str = tokenizer.tokenize("[CLS]" + line + "[SEP]")
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")

print(tokenizer.convert_tokens_to_ids(tokened_str))
# Use a dedicated name instead of shadowing the built-in `str`.
ids_str = "[" + ', '.join(str(e) for e in tokenizer.convert_tokens_to_ids(tokened_str)) + "]"
# f.write(''.join(ids_str))
# f.close()
def q2propose(q):
    # Return POS pairs, nouns, and morphemes for a stripped query string.
    q = q.strip()
    pos = mecab.pos(q)
    nouns = mecab.nouns(q)
    morphs = mecab.morphs(q)
    return pos, nouns, morphs
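# Hypothetical usage sketch for q2propose(); the query string is made up.
def _demo_q2propose():
    question = "서울에서 가장 높은 건물은?"  # hypothetical query
    pos, nouns, morphs = q2propose(question)
    print(pos)     # (morpheme, POS tag) pairs
    print(nouns)   # nouns only
    print(morphs)  # all morphemes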