Example #1
import hgtk  # hangul-toolkit, used below for hgtk.checker.is_latin1
# `mecab` is assumed to be an initialized tagger, e.g.:
#   from konlpy.tag import Mecab; mecab = Mecab()

def count_eng(source, target):
    # Score how completely Latin-alphabet (e.g. English) tokens in the
    # source were translated away in the target.
    print('Source: ', source, '\n')
    print('Target: ', target, '\n')
    source_morphs = [morph for morph, _ in mecab.pos(source)]
    source_latin = [hgtk.checker.is_latin1(m) for m in source_morphs]
    target_morphs = [morph for morph, _ in mecab.pos(target)]
    target_latin = [hgtk.checker.is_latin1(m) for m in target_morphs]
    if sum(source_latin) == 0:
        # No Latin tokens in the source: nothing to translate, perfect score.
        return 1
    else:
        # Fraction of source Latin tokens that no longer appear in the target,
        # folded into an F1-style score (harmonic mean of recall and 1).
        recall = 1 - sum(target_latin) / sum(source_latin)
        return 2 * recall / (recall + 1)
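A usage sketch for count_eng, assuming the mecab/hgtk setup noted above; the sentence pair is illustrative only:

score = count_eng("이 model 은 좋다", "이 모델 은 좋다")
# 'model' is the only Latin token in the source and is gone from the
# target, so recall is 1 and the score is 2*1/(1+1) = 1.0.
print(score)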
Example #2
def tokenization(doc, rate):
    # Feed content morphemes to the external `count` helper, weighted by `rate`.
    for morph, tag in mecab.pos(doc):
        if "SH" in tag or "SL" in tag:
            # SH = Chinese characters, SL = foreign (Latin) words: always counted.
            count(morph, rate)
        elif ("NNB" not in tag and "J" not in tag      # no dependent nouns, particles,
              and "IC" not in tag and "S" not in tag   # interjections, symbols,
              and "E" not in tag and "X" not in tag):  # endings, or affixes
            count(morph, rate)
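`count` is external to this snippet; a minimal sketch of what it might look like, assuming it accumulates a rate-weighted frequency per token in a module-level dict (the `freq` name and the weighting are assumptions, not part of the original):

freq = {}

def count(token, rate):
    # Accumulate a rate-weighted occurrence count for `token`.
    freq[token] = freq.get(token, 0) + rate

tokenization("나는 오늘 학교에 갔다", 0.5)
print(freq)  # roughly {'나': 0.5, '오늘': 0.5, '학교': 0.5}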
Example #3
def tokenize(doc):
    # Keep Chinese-character (SH) and foreign-word (SL) tokens plus content
    # morphemes; drop particles, endings, affixes, symbols, dependent nouns,
    # and interjections (Mecab-ko POS tags).
    result = []
    for morph, tag in mecab.pos(doc):
        if "SH" in tag or "SL" in tag:
            result.append(morph)
        # elif morph in stop:   # optional stopword filter, disabled here
        #     continue
        elif ("NNB" not in tag and "J" not in tag
              and "IC" not in tag and "S" not in tag
              and "E" not in tag and "X" not in tag):
            result.append(morph)
    return result
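A quick usage sketch, assuming an initialized `mecab` tagger as above; the sentence and output are illustrative:

print(tokenize("나는 오늘 NLP 공부를 했다"))
# roughly ['나', '오늘', 'NLP', '공부']: particles and endings are dropped,
# while the Latin token 'NLP' is kept via the SL branch.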
Example #4
def align_particles(sentence):
    # Align Mecab morphemes to the whitespace-separated words of `sentence`:
    # greedily group consecutive morphemes into chunks, one chunk per word.
    words = sentence.split()
    particles = mecab.pos(sentence)
    chunks = []
    if len(particles) > 0:
        word_idx = 0
        morphemes = []  # surface forms accumulated for the current word
        total = []      # (morpheme, tag) pairs accumulated for the current word
        for i in range(len(particles)):
            morphemes.append(particles[i][0])
            total.append(particles[i])
            if i + 1 < len(particles):
                # If appending the next morpheme would overflow the current
                # word, close this chunk and advance to the next word.
                if "".join(morphemes + [particles[i + 1][0]]) not in words[word_idx]:
                    chunks.append(total)
                    word_idx += 1
                    morphemes = []
                    total = []
            else:
                # Last morpheme: flush the final chunk.
                chunks.append(total)
    return words, particles, chunks
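A usage sketch for align_particles under the same `mecab` assumption; the exact tags depend on the dictionary, so the output shown is approximate:

words, particles, chunks = align_particles("학교에 갔다")
print(words)   # ['학교에', '갔다']
print(chunks)  # roughly [[('학교', 'NNG'), ('에', 'JKB')], [('갔', 'VV+EP'), ('다', 'EF')]]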
Example #5
import mecab  # python-mecab-ko
from tokenization_kobert import KoBertTokenizer  # ships with monologg/DistilKoBERT

# `line` is assumed to hold a Korean sentence read from the input file `f`
# earlier in the script (that part was elided here).
f.close()

# sys.stdout = open("/home/ask/PycharmProjects/DistilKoBERT/result.txt", "w")
# f = open("/home/ask/PycharmProjects/DistilKoBERT/result.txt", 'w')

mecab = mecab.MeCab()  # rebinds the module name to a tagger instance

# Plain morpheme segmentation.
tokened_str = mecab.morphs(line)
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")

# Nouns only.
tokened_str = mecab.nouns(line)
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")

# (morpheme, POS tag) pairs.
tokened_str = mecab.pos(line)
print(tokened_str)
# for i in tokened_str:
#     f.write('/'.join(i) + "  ")
# f.write("\n")

# Subword tokenization with the KoBERT tokenizer.
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
tokened_str = tokenizer.tokenize("[CLS]" + line + "[SEP]")
print(tokened_str)
# f.write(' / '.join(tokened_str) + "\n")

print(tokenizer.convert_tokens_to_ids(tokened_str))
# Use a name other than `str` so the built-in is not shadowed.
ids_str = "[" + ', '.join(str(e) for e in tokenizer.convert_tokens_to_ids(tokened_str)) + "]"
# f.write(''.join(ids_str))
# f.close()
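The id list built above is what a model consumes; a minimal follow-up sketch, assuming the DistilKoBERT checkpoint published under the same monologg namespace (the model name and output indexing follow the DistilKoBERT README and are not part of the original):

import torch
from transformers import DistilBertModel

model = DistilBertModel.from_pretrained('monologg/distilkobert')
ids = tokenizer.convert_tokens_to_ids(tokened_str)
outputs = model(torch.tensor([ids]))  # batch of one sentence
print(outputs[0].shape)               # (1, seq_len, hidden_size)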
Example #6
def q2propose(q):
    # Return all three Mecab views of the query: (morpheme, tag) pairs,
    # nouns only, and plain morphemes.
    q = q.strip()  # strip once instead of three times
    return mecab.pos(q), mecab.nouns(q), mecab.morphs(q)
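A usage sketch for q2propose, again assuming an initialized `mecab` tagger; the query and outputs are illustrative:

pos, nouns, morphs = q2propose(" 좋은 식당 추천해줘 ")
print(nouns)   # roughly ['식당', '추천']
print(morphs)  # roughly ['좋', '은', '식당', '추천', '해', '줘']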