import math
import re

import hgtk
import mecab
from soynlp.tokenizer import LTokenizer
from soynlp.word import WordExtractor


def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()
    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2, \
            open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item
            if cnt < 10 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue

            # is_all_nng(): helper assumed to be defined elsewhere in this codebase;
            # returns True when every POS tag of the word is NNG (general noun).
            if is_all_nng(mcab.pos(word)):
                # print("nouns only : {}".format(word))
                # print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue

            if len(soy_tokens) > 1:
                continue

            # print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))

            words = re.findall(' '.join(tokens), sentences)
            # If the morpheme-split form occurs less than 5% as often as the unsplit word,
            # treat the split as a morpheme segmentation error.
            if len(words) < (cnt * 0.05):
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.writelines(dic_line + '\n')
                f3.writelines("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n')
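# A minimal sketch (not part of the original function) of the jongseong check used to
# fill the T/F field of the NNP dictionary line above: hgtk.letter.decompose() splits
# one Hangul syllable into (initial, medial, final consonant), and an empty final
# consonant means the syllable ends in a vowel. The example words are hypothetical.
import hgtk


def has_jongseong(word):
    # True if the last syllable of `word` carries a final consonant (jongseong)
    cho, jung, jong = hgtk.letter.decompose(word[-1])
    return 'ㄱ' <= jong <= 'ㅎ'


print(has_jongseong('김밥'))  # True  -> the 'T' variant of dic_line
print(has_jongseong('커피'))  # False -> the 'F' variant of dic_line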
import math

from soynlp.tokenizer import LTokenizer
from soynlp.word import WordExtractor


def soy_tokenize(model_fname, input_sentence):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A substring scores highly when
    # (1) its characters are tightly connected and frequently appear together (high cohesion), and
    # (2) a variety of particles, endings, or other words appear to its right,
    #     i.e. its right branching entropy is high.
    scores = {
        key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
        for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    tokens = tokenizer.tokenize(input_sentence)
    tokenized_sent = ' '.join(tokens)
    return tokenized_sent
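# Hypothetical usage of soy_tokenize() above. The model path and the input sentence
# are placeholders; 'soyword.model' must be a WordExtractor model saved beforehand.
tokenized = soy_tokenize('soyword.model', '자연어처리를 공부하고 있다')
print(tokenized)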
import math

from soynlp.normalizer import emoticon_normalize
from soynlp.tokenizer import LTokenizer
from soynlp.word import WordExtractor


def soy_tokenize(corpus_fname, model_fname, output_fname):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A substring scores highly when
    # (1) its characters are tightly connected and frequently appear together (high cohesion), and
    # (2) a variety of particles, endings, or other words appear to its right,
    #     i.e. its right branching entropy is high.
    scores = {
        key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
        for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            tokens = tokenizer.tokenize(normalized_sent)
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
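# A minimal sketch (an assumption, not from the original snippets) of how the
# 'soyword.model' file that the functions above load could be produced: train soynlp's
# WordExtractor on raw sentences and save it. The corpus path is a placeholder.
from soynlp.word import WordExtractor


def train_soy_model(corpus_fname, model_fname):
    sentences = [sent.strip() for sent in open(corpus_fname, 'r', encoding='utf-8')]
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(sentences)
    word_extractor.save(model_fname)


# train_soy_model('corpus.txt', 'soyword.model')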
import math

import hgtk
import mecab
from soynlp.tokenizer import LTokenizer
from soynlp.word import WordExtractor


def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()
    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2, \
            open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item
            if cnt < 100 or len(word) == 1:
                continue
            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue
            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            if 'ㄱ' <= jong <= 'ㅎ':
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
            else:
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
            f2.writelines(dic_line + '\n')
            f3.writelines("{}\t{}\t{}".format(word, ' '.join(tokens), cnt) + '\n')
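# Hypothetical sketch of how the (count, word) pairs in `lst` could be built before
# calling check_morphs(): count whitespace-separated tokens in the corpus. The file
# names are placeholders, not from the original code.
from collections import Counter


def build_word_counts(corpus_fname):
    counter = Counter()
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for line in f:
            counter.update(line.strip().split())
    # check_morphs() expects (count, word) tuples
    return [(cnt, word) for word, cnt in counter.most_common()]


# lst = build_word_counts('corpus.txt')
# check_morphs(lst, 'corpus.txt', 'user_dic.csv', 'check_morphs.log')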
import math

import sentencepiece as spm
from soynlp.tokenizer import LTokenizer
from soynlp.word import WordExtractor
from tqdm import tqdm

# progress_apply on DataFrames requires tqdm's pandas integration
tqdm.pandas()


def concat_text_with_pos(pos):
    # join each (surface, tag) pair as "surface/tag"
    temp = []
    for p in pos:
        temp.append(p[0] + "/" + p[1])
    s = ' '.join(temp)
    return s


etri_processed_data["title"] = etri_processed_data["title"].progress_apply(concat_text_with_pos)

word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)
soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model'
word_extractor.load(soynlp_model_fname)
scores = word_extractor.word_scores()
scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
          for key in scores.keys()}
soyToken = LTokenizer(scores=scores)
# soyToken.tokenize(soynlp_processed_data["title"].values[0])
soynlp_processed_data["title"] = soynlp_processed_data["title"].progress_apply(lambda x: " ".join(soyToken.tokenize(x)))

token = spm.SentencePieceProcessor()
token.Load("./backend/textengines/data/tokenizer_model/sentencepice.model")
spm_processed_data["title"] = spm_processed_data["title"].progress_apply(lambda x: " ".join(token.EncodeAsPieces(x)))

#############################################################################

td = etri_processed_data.copy()
ratio_train = 0.8
ratio_val = 0.1
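# The snippet above stops after defining the split ratios; a minimal sketch of one way
# the 80/10/10 split over the copied dataframe `td` could continue (an assumption, not
# the original code), using a shuffled index.
import numpy as np

n = len(td)
idx = np.random.permutation(n)
n_train = int(n * ratio_train)
n_val = int(n * ratio_val)

train_df = td.iloc[idx[:n_train]]
val_df = td.iloc[idx[n_train:n_train + n_val]]
test_df = td.iloc[idx[n_train + n_val:]]  # remaining ~10%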