# Shared imports used by the examples below (soynlp, MeCab, hgtk, SentencePiece).
import math
import re

import hgtk
import mecab
import sentencepiece as spm
from soynlp.normalizer import emoticon_normalize
from soynlp.tokenizer import LTokenizer
from soynlp.word import WordExtractor


def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
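    # Word score = cohesion_forward * exp(right_branching_entropy): characters that stick
    # together cohesively and are followed by many different particles/endings score high
    # (the same scoring rule described in the later examples).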
    scores = {
        key: scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)
        for key in scores
    }
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 10 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue

            if is_all_nng(mcab.pos(word)):
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue

            if len(soy_tokens) > 1:
                continue

            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))

            # Count occurrences of the morpheme-split form (space-joined tokens) in the
            # raw corpus; re.escape keeps the tokens from being read as a regex pattern.
            words = re.findall(re.escape(' '.join(tokens)), sentences)
            if len(words) < (cnt * 0.05):
                # If the split form occurs less than 5% as often as the unsplit word,
                # treat the morpheme analysis as an error and add the word to the MeCab
                # user dictionary as a proper noun (NNP); T/F marks whether the last
                # syllable has a final consonant (jongseong).
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.write(dic_line + '\n')
                f3.write("{}\t{}\t{}\t{}\t{}\n".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)))
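
A hypothetical call of the function above; the (count, word) list and the file names are
assumptions, not part of the original example:

word_counts = [(1523, '자연어처리'), (87, '토크나이저')]  # (frequency, word) pairs
check_morphs(word_counts, 'corpus.txt', 'userdic.csv', 'check_morphs.log')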
Example #2
def soy_tokenize(model_fname, input_sentence):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # (1) the characters of the word co-occur frequently and cohesively, and
    # (2) many different particles, endings, or other words appear to its right,
    #     so the word's right branching entropy is high.
    scores = {
        key: (scores[key].cohesion_forward *
              math.exp(scores[key].right_branching_entropy))
        for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    tokens = tokenizer.tokenize(input_sentence)
    tokenized_sent = ' '.join(tokens)

    return tokenized_sent
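
A minimal usage sketch; the model path mirrors the 'soyword.model' used elsewhere in
these examples, and the sample sentence is an assumption:

print(soy_tokenize('soyword.model', '자연어 처리는 재미있다'))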
Example #3
def soy_tokenize(corpus_fname, model_fname, output_fname):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0
                                   )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()

    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # (1) the characters of the word co-occur frequently and cohesively, and
    # (2) many different particles, endings, or other words appear to its right,
    #     so the word's right branching entropy is high.
    scores = {
        key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            tokens = tokenizer.tokenize(normalized_sent)
            tokenized_sent = ' '.join(tokens)
            f2.write(tokenized_sent + '\n')
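
A hypothetical call of the corpus-level tokenizer; the corpus and output file names are
assumptions:

soy_tokenize('corpus.txt', 'soyword.model', 'corpus_soynlp_tokenized.txt')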
Example #4
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {
        key: scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)
        for key in scores
    }
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 100 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            if 'ㄱ' <= jong <= 'ㅎ':
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
            else:
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
            f2.write(dic_line + '\n')
            f3.write("{}\t{}\t{}\n".format(word, ' '.join(tokens), cnt))
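
Neither check_morphs variant shows how the (count, word) list is produced; a minimal
sketch of one way to build it from a whitespace-tokenized corpus (the helper name and
the corpus format are assumptions):

from collections import Counter

def build_word_counts(corpus_fname):
    counter = Counter()
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for line in f:
            counter.update(line.strip().split())
    # check_morphs expects (frequency, word) pairs.
    return [(cnt, word) for word, cnt in counter.most_common()]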
Example #5
# NOTE: this snippet starts mid-function in the original. The signature below is an
# assumption based on how concat_text_with_pos is applied to already POS-tagged titles,
# and progress_apply requires tqdm.pandas() to have been called beforehand.
def concat_text_with_pos(pos):
    # Join each (word, tag) pair as "word/tag", separated by spaces.
    temp = []
    for p in pos:
        temp.append(p[0] + "/" + p[1])

    s = ' '.join(temp)
    return s
etri_processed_data["title"] = etri_processed_data["title"].progress_apply(concat_text_with_pos)


word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)
soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model'
word_extractor.load(soynlp_model_fname)
scores = word_extractor.word_scores()
scores = {
    key: scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)
    for key in scores
}
soyToken = LTokenizer(scores=scores)
# soyToken.tokenize(soynlp_processed_data["title"].values[0])
soynlp_processed_data["title"] = soynlp_processed_data["title"].progress_apply(lambda x: " ".join(soyToken.tokenize(x)))

token = spm.SentencePieceProcessor()
token.Load("./backend/textengines/data/tokenizer_model/sentencepice.model")
spm_processed_data["title"] = spm_processed_data["title"].progress_apply(lambda x: " ".join(token.EncodeAsPieces(x)))
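
The SentencePiece model loaded above is trained elsewhere; a minimal training sketch,
where the input corpus path, vocab size, and model type are assumptions:

spm.SentencePieceTrainer.Train(
    '--input=./backend/textengines/data/corpus.txt '
    '--model_prefix=./backend/textengines/data/tokenizer_model/sentencepice '
    '--vocab_size=32000 --model_type=bpe'
)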
#############################################################################

td = etri_processed_data.copy()

ratio_train = 0.8
ratio_val = 0.1
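
A sketch of how these ratios could drive the split (using the remaining 10% as a test
set and a fixed shuffle seed are assumptions):

td = td.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle rows
n_train = int(len(td) * ratio_train)
n_val = int(len(td) * ratio_val)
train_df = td.iloc[:n_train]
val_df = td.iloc[n_train:n_train + n_val]
test_df = td.iloc[n_train + n_val:]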