class SoyNLPTokenizer(BaseTokenizer):
    """Tokenize text using the MaxScoreTokenizer of SoyNLP"""

    def __init__(self):
        self.tokenizer = None
        self.scores = dict()
        self.word_extractor = WordExtractor(min_count=100,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)

    def fit(self, sentences):
        self.word_extractor.train(sentences)
        scores = self.word_extractor.extract()
        # MaxScoreTokenizer expects a {word: score} mapping
        scores = {word: (score.cohesion_forward + score.cohesion_backward) *
                        (score.left_branching_entropy + score.right_branching_entropy)
                  for word, score in scores.items()}
        self.scores = scores
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def state_dict(self):
        return {'scores': self.scores}

    def load_state_dict(self, state_dict):
        self.scores = state_dict['scores']
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def tokenize(self, sentence):
        tokenized_sentence = self.tokenizer.tokenize(sentence)
        return tokenized_sentence
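# A minimal, hypothetical usage sketch of SoyNLPTokenizer: fit on raw sentences, export the
# learned scores, and restore them in a fresh instance. The sentences below are made up; a
# realistically sized corpus is needed for min_count=100 to produce any scores.
sentences = ['예시 문장 하나입니다', '예시 문장 둘입니다']
tok = SoyNLPTokenizer()
tok.fit(sentences)
state = tok.state_dict()            # {'scores': {...}}
restored = SoyNLPTokenizer()
restored.load_state_dict(state)
print(restored.tokenize('예시 문장'))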
def build_tokenizer():
    """
    Train the soynlp tokenizer that will be used to tokenize Korean input sentences.
    """
    print('Now building soynlp tokenizer . . .')

    data_dir = Path().cwd() / 'data'
    train_txt = os.path.join(data_dir, 'train.txt')

    with open(train_txt, encoding='utf-8') as f:
        lines = f.readlines()

    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(lines)
    word_scores = word_extractor.extract()

    cohesion_scores = {word: score.cohesion_forward for word, score in word_scores.items()}

    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
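# A minimal sketch of how the pickled cohesion scores might be consumed later, mirroring the
# LTokenizer usage in the other snippets here; the pickle path is assumed to match build_tokenizer().
import pickle
from soynlp.tokenizer import LTokenizer

with open('pickles/tokenizer.pickle', 'rb') as pickle_in:
    cohesion_scores = pickle.load(pickle_in)

tokenizer = LTokenizer(scores=cohesion_scores)
print(tokenizer.tokenize('한국어 입력 문장 예시'))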
def data_tokenize(news_title, tdm_vocab):
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)

    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_score)

    cluster_data = []
    bert_null_list = []

    for title in news_title:
        title = test(title)
        sent = tokenizer.tokenize(title, flatten=False)
        sentence = []
        for i in sent:
            if i[0] in tdm_vocab:
                sentence.append(i[0])
        cluster_data.append(sentence)

    return cluster_data
def getTokenizer(self, contents):
    corpus = SentiCorpus(contents, iter_sent=True)
    word_extractor = WordExtractor(corpus)
    word_extractor.train(corpus)
    words_scores = word_extractor.extract()
    scores = {w: s.cohesion_forward for w, s in words_scores.items()}
    return LTokenizer(scores=scores)
def build_tokenizer():
    """
    Train the soynlp tokenizer that will be used to tokenize Korean input sentences,
    using the whole corpus.
    """
    print('Now building soy-nlp tokenizer . . .')

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train_soynlp.csv')

    df = pd.read_csv(train_file, encoding='utf-8')

    # skip rows whose 'korean' column is not text
    kor_lines = [row.korean for _, row in df.iterrows() if type(row.korean) == str]

    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(kor_lines)
    word_scores = word_extractor.extract()

    cohesion_scores = {word: score.cohesion_forward for word, score in word_scores.items()}

    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
def pmi_test(corpus_path):
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split(),
        dynamic_weight=False,
        verbose=True)

    pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

    for pair, pmi_value in sorted(pmi_dok.items(), key=lambda x: -x[1])[100:110]:
        pair_ = (idx2vocab[pair[0]], idx2vocab[pair[1]])
        print('pmi {} = {:.3f}'.format(pair_, pmi_value))

    print('computed PMI')
def soynlp_tokenizer(corpus):
    from soynlp.tokenizer import LTokenizer
    from soynlp.word import WordExtractor
    from soynlp.noun import LRNounExtractor_v2

    # word extractor
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(corpus)
    words = word_extractor.extract()
    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(corpus)  # list of str like
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    combined_scores = {noun: score + cohesion_score.get(noun, 0)
                       for noun, score in noun_scores.items()}
    combined_scores.update({subword: cohesion
                            for subword, cohesion in cohesion_score.items()
                            if subword not in combined_scores})

    tokenizer = LTokenizer(scores=combined_scores)

    return tokenizer
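# Hypothetical usage of soynlp_tokenizer(): the corpus file name below is an assumption,
# and any list of raw Korean sentences works as input.
with open('corpus.txt', encoding='utf-8') as f:
    sentences = [line.strip() for line in f]

tokenizer = soynlp_tokenizer(sentences)
print(tokenizer.tokenize('예시 문장입니다'))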
def _extracte(self) -> None:
    self.extractor = WordExtractor()
    self.extractor.train(self.corpus)
    self.words = self.extractor.extract()
    self.cohesion_score = {word: score.cohesion_forward
                           for word, score in self.words.items()}
    self.tokenizer = LTokenizer(scores=self.cohesion_score)
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item
            if cnt < 10 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue

            if is_all_nng(mcab.pos(word)):
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue

            if len(soy_tokens) > 1:
                continue

            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))

            words = re.findall(' '.join(tokens), sentences)
            if len(words) < (cnt * 0.05):
                # If the morpheme-split form occurs less than 5% as often as the unsplit word,
                # treat the split as a morpheme-segmentation error.
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.writelines(dic_line + '\n')
                f3.writelines("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n')
def data_tokenize(news_title):
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_score)

    return tokenizer
def word_extractor_test(corpus_path):
    print('WordExtractor test')
    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor

    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    word_scores = word_extractor.extract()

    print('top 20 left frequency * forward cohesion words')
    topwords = sorted(word_scores,
                      key=lambda x: -word_scores[x].cohesion_forward * word_scores[x].leftside_frequency)[:20]
    for word in topwords:
        print('word = {}, cohesion = {}'.format(word, word_scores[word].cohesion_forward))
    print('word extractor test has been done\n\n')
def __init__(self, model_path: str = None):
    self.word_extractor = WordExtractor(min_frequency=5,
                                        min_cohesion_forward=0.05,
                                        min_right_branching_entropy=0.0)
    self.unk = 0
    self.pad = 1
    self.sos = 2
    self.eos = 3

    if model_path:
        with open(model_path, 'rb') as readFile:
            self.cohesion_score = dill.load(readFile)
    else:
        self.cohesion_score = {}

    self.tokenizer = LTokenizer(scores=self.cohesion_score)
    self.tok_to_id, self.id_to_tok = self._build_dict()
def word_extract(datas):
    we = WordExtractor(
        min_frequency=10,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    we.train(datas)
    words = we.extract()

    print('단어 (빈도수, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(), key=lambda x: word_score(x[1]), reverse=True)[:10]:
        print('%s (%d, %.3f, %.3f)' % (
            word,
            score.leftside_frequency,
            score.cohesion_forward,
            score.right_branching_entropy
        ))
    return
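# word_score() is not defined in this snippet. A plausible stand-in, assuming the same scoring
# used in the other snippets on this page (cohesion_forward * exp(right_branching_entropy)):
import math

def word_score(score):
    return score.cohesion_forward * math.exp(score.right_branching_entropy)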
def _get_tokenizer(self, df):
    """
    Generate a tokenizer by extracting words.
    Args:
        df: data corpus of one language
    Returns:
        tokenizer
    """
    word_extractor = WordExtractor()
    word_extractor.train(df)
    words = word_extractor.extract()
    print(f'length of words is {len(words)}')
    cohesion_scores = {word: score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_scores)
    return tokenizer
def main(args):
    # Find patterns and extract words from a given set of documents
    sentences = DoublespaceLineCorpus(args.corpus_fname, iter_sent=True)
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)

    # word extractor
    word_extractor.train(sentences)
    words = word_extractor.extract()
    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

    print('Word (Freq, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(), key=lambda x: word_score(x[1]), reverse=True)[:30]:
        print('%s (%d, %.3f, %.3f)' % (word,
                                       score.leftside_frequency,
                                       score.cohesion_forward,
                                       score.right_branching_entropy))

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(args.corpus_fname)  # list of str like
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    # combined score
    combined_scores = {noun: score + cohesion_score.get(noun, 0)
                       for noun, score in noun_scores.items()}
    combined_scores.update({subword: cohesion
                            for subword, cohesion in cohesion_score.items()
                            if subword not in combined_scores})

    # MaxScore tokenizer
    tokenizer = MaxScoreTokenizer(scores=combined_scores)

    # save tokenizer
    with open(args.tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL)
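# A short sketch of restoring the pickled MaxScoreTokenizer afterwards; 'tokenizer.pickle'
# is an assumed stand-in for args.tokenizer_path above.
import pickle

with open('tokenizer.pickle', 'rb') as f:
    tokenizer = pickle.load(f)

print(tokenizer.tokenize('예시 뉴스 제목'))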
def pmi_test(corpus_path):
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split(),
        dynamic_weight=False,
        verbose=True)

    x_pmi, x, y = pmi(x, min_pmi=0, alpha=0.0001)

    rows, cols = x_pmi.nonzero()
    data = x_pmi.data

    print('row shape = {}'.format(rows.shape))
    print('col shape = {}'.format(cols.shape))
    print('data shape = {}'.format(data.shape))

    for indpt in data.argsort()[-150:-100]:
        i = rows[indpt]
        j = cols[indpt]
        pair = (idx2vocab[i], idx2vocab[j])
        value = data[indpt]
        print('pmi {} = {:.3f}'.format(pair, value))

    print('computed pmi')
def soynlp_tokenizer(self):
    def word_score(score):
        return (score.cohesion_forward * math.exp(score.right_branching_entropy))

    if self.mode == 'serve':
        with open(self.data_path, 'r') as file:
            word_score_dict = json.load(file)
    elif self.mode == 'train':
        word_extractor = WordExtractor()
        word_extractor.train(self.train_corpus)
        words = word_extractor.extract()
        word_score_dict = {word: word_score(score) for word, score in words.items()}
        with open('./models/word_dict.json', 'w') as file:
            json.dump(word_score_dict, file)
    else:
        pass

    tokenizer = MaxScoreTokenizer(scores=word_score_dict)

    return tokenizer
def soy_tokenize(model_fname, input_sentence):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A substring gets a high score when
    # (1) its characters appear together frequently and cohesively, and
    # (2) many different josa, endings, or other words follow it, i.e. its
    #     right-side branching entropy is high.
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    tokenizer = LTokenizer(scores=scores)
    tokens = tokenizer.tokenize(input_sentence)
    tokenized_sent = ' '.join(tokens)
    return tokenized_sent
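# Hypothetical call of soy_tokenize(): 'soyword.model' stands in for a model file previously
# produced by WordExtractor.save(), as in the compute_soy_word_score snippet below.
print(soy_tokenize('soyword.model', '자연어처리는 재미있다'))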
def compute_soy_word_score(corpus_fname, model_fname):
    sentences = [sent.strip() for sent in open(corpus_fname, 'r').readlines()]
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(sentences)
    word_extractor.save(model_fname)
def __init__(self, pre_trained=True, analyzer='Hannanum'):
    self.pre_trained = pre_trained

    if analyzer == 'Hannanum':
        self.analyzer = tag.Hannanum()
    elif analyzer == 'Kkma':
        self.analyzer = tag.Kkma()
    elif analyzer == 'Komoran':
        self.analyzer = tag.Komoran()
    elif analyzer == 'Mecab':
        self.analyzer = tag.Mecab()
    elif analyzer == 'Okt':
        self.analyzer = tag.Okt()
    else:
        if pre_trained == False:
            pass
        else:
            print('Enter a valid KoNLPy analyzer name.\n\tavailable: Hannanum, Kkma, Komoran, Mecab, Okt')

    self.WordExtractor = WordExtractor(min_frequency=0)
    self.noun_extractor = LRNounExtractor(verbose=False)
    self.word_score = {}
def soy_tokenize(corpus_fname, model_fname, output_fname):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A substring gets a high score when
    # (1) its characters appear together frequently and cohesively, and
    # (2) many different josa, endings, or other words follow it, i.e. its
    #     right-side branching entropy is high.
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            tokens = tokenizer.tokenize(normalized_sent)
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
def build_tokenizer():
    """
    Train the soynlp tokenizer that will tokenize incoming Korean sentences.
    """
    print('Now building soy-nlp tokenizer . . .')

    # Point to the directory holding the training data and load the file.
    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'corpus.csv')
    df = pd.read_csv(train_file, encoding='utf-8')

    # Analyze only the rows whose 'korean' column is actual text.
    kor_lines = [row.korean for _, row in df.iterrows() if type(row.korean) == str]

    # soynlp's WordExtractor produces word scores such as branching entropy, accessor
    # variety, and cohesion score; each locates token boundaries in a different way.
    # Only the cohesion score (how strongly the characters of a word co-occur) is kept here.
    # The exact formulas and code are described at
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(kor_lines)
    word_scores = word_extractor.extract()

    cohesion_scores = {word: score.cohesion_forward for word, score in word_scores.items()}

    # Save the scores with pickle.
    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item
            if cnt < 100 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            if 'ㄱ' <= jong <= 'ㅎ':
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
            else:
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
            f2.writelines(dic_line + '\n')
            f3.writelines("{}\t{}\t{}".format(word, ' '.join(tokens), cnt) + '\n')
def Makegraph_Wordcloud_Soynlp(target):
    try:
        if flag_login == 0 or flag_login == None or flag_login == '':
            Login()
        #elif flag_prepro == 0:
            #messagebox.showwarning('주의', '데이터 전처리 후 실행해주세요.')
            #return
        else:
            data_wordcloud_soynlp = pd.DataFrame(data_origin[target], columns=['contents'])
            data_wordcloud_soynlp['contents'] = data_origin[target].apply(
                lambda x: re.sub('[^가-힣]', ' ', x))

            word_extractor = WordExtractor(
                min_frequency=10,  # make this adaptive (e.g. proportional to the size of data_origin)
                min_cohesion_forward=0.05,
                min_right_branching_entropy=0.0)
            word_extractor.train(data_wordcloud_soynlp['contents'].values)
            words = word_extractor.extract()

            cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

            # force-join words (is this the right place?)
            cohesion_score['숙소제공'] = 1
            cohesion_score['교통비지급'] = 1
            cohesion_score['인센티브'] = 1
            cohesion_score['초과근무시간확대'] = 1
            cohesion_score['복지포인트'] = 1
            cohesion_score['인사우대'] = 1
            cohesion_score['근평가점'] = 1
            cohesion_score['주거이전수당'] = 1

            tokenizer = LTokenizer(scores=cohesion_score)
            data_wordcloud_soynlp['tokenizer'] = data_wordcloud_soynlp['contents'].apply(
                lambda x: tokenizer.tokenize(x, remove_r=True))

            words = list()
            for i in data_wordcloud_soynlp['tokenizer'].values:
                for j in i:
                    words.append(j)

            count_soynlp = Counter(words)
            words_dict_soynlp = dict(count_soynlp.most_common(100))  # keep the top-n most frequent words

            csv_stopwords = pd.read_csv('stopwords.csv', encoding='cp949', skiprows=0)  # change to "with open" later
            stopwords = list()
            for i in csv_stopwords.values:
                for j in i:
                    stopwords.append(j)

            for word in stopwords:
                words_dict_soynlp.pop(word, None)

            wordcloud = WordCloud(
                font_path='NanumGothic.ttf',
                width=500,
                height=500,
                background_color='white').generate_from_frequencies(words_dict_soynlp)

            plt.clf()
            plt.figure(figsize=(20, 20))
            plt.imshow(wordcloud)
            plt.axis('off')
            #plt.show()
            plt.savefig(resultdir + filename_dateflag + target + ' - wordcloud_soynlp.png', dpi=100)

            '''
            # frequency bar chart (temporary)
            plt.clf()
            plt.style.use('ggplot')
            plt.figure(figsize=(len(list(words_dict_soynlp.keys())[:20]) * 0.6, 10))  # make grid size adaptive
            plt.title('상위 10개 빈출단어')
            plt.bar(list(words_dict_soynlp.keys())[:20], list(words_dict_soynlp.values())[:20])
            plt.xticks(rotation=45, ha='right')  # rotate x-axis labels
            plt.savefig(resultdir + filename_dateflag + target + ' - wordfrequency.png', dpi=200)
            '''

            messagebox.showinfo(
                '작업', '워드클라우드(Soynlp) 생성이 완료되었습니다.\n\nresult폴더에 결과물이 저장되었습니다.')

    except Exception as e:
        Log(desc=e)
        messagebox.showerror('경고', str(e) + ' 열을 찾을 수 없습니다.')
raw_data = []
for sent in news_title:
    preprocess = test(sent)
    data.append(preprocess)
    raw_data.append(sent)

# -------------------- load the tokenizer --------------------
import numpy as np
from soynlp.word import WordExtractor
from soynlp.utils import DoublespaceLineCorpus
from soynlp.tokenizer import LTokenizer

word_extractor = WordExtractor(
    min_frequency=100,  # example
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0)
word_extractor.train(news_title)
words = word_extractor.extract()

cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
tokenizer = LTokenizer(scores=cohesion_score)

# -------------------- preprocess data for word2vec --------------------
cluster_data = []
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer
from gensim.test.utils import common_texts, get_tmpfile

if __name__ == '__main__':
    # Load Data
    file_name = 'wikiPlotText_m.txt'

    # tokenizer for Korean based on cohesion scores
    corpus = DoublespaceLineCorpus(file_name, iter_sent=True)
    word_extractor = WordExtractor(corpus)
    word_extractor.train(corpus)
    words_scores = word_extractor.extract()
    scores = {w: s.cohesion_forward for w, s in words_scores.items()}
    tokenizer = LTokenizer(scores=scores)

    # {'games': games, 'corpus': corpus, 'titles': titles}
    games_data = fh.getGamesData(fh.getStoragePath() + file_name, tokenizer=tokenizer)
    tokenized_contents = games_data['corpus_words']

    # Vectorizing
    # sg=1 (skip-gram), 0 (CBOW)
    model_path = 'models/word2vec_ko.model'
    if os.path.isfile(model_path):
        word2vec_model = Word2Vec.load(model_path)
    else:
        path = get_tmpfile(model_path)
f.write("ㅋ이 들어간 chat 중 ㅋ의 길이의 최빈값(상위3개): " + str(Counter(np_single).most_common()[:3]) + '\n')  # check the top 3
n, bins, patches = plt.hist(np_single, bins=sentence_cnt)  # histogram of how often 'ㅋ' appears in sentences that contain 'ㅋ'
plt.savefig(result_path + "/" + file_num + ".png")
f.close()

raw_time, raw_chat = read_data(file_name)
laugh_check(raw_chat)

'''
Unsupervised methods that discover words from corpus statistics:
1. Accessor Variety
2. Branching Entropy
3. Cohesion score
'''
word_extractor = WordExtractor(
    min_frequency=20,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)
# Cohesion score is used here
word_extractor.train(raw_chat)
words = word_extractor.extract()

'''
print("word extraction 길이: ", len(words), " \n결과: ")
print(words)
#words_score = {word : score.cohesion_forward for word, score in words.items()}
#tokenizer = LTokenizer(scores=words_score)
'''
class SoyTokenizer:
    def __init__(self, model_path: str = None):
        self.word_extractor = WordExtractor(min_frequency=5,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)
        self.unk = 0
        self.pad = 1
        self.sos = 2
        self.eos = 3

        if model_path:
            with open(model_path, 'rb') as readFile:
                self.cohesion_score = dill.load(readFile)
        else:
            self.cohesion_score = {}

        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def tokenize(self, sent: str):
        return self.tokenizer.tokenize(sent)

    def text_to_id(self, sent: str):
        toks = self.tokenize(sent)
        outp = []
        for s in toks:
            try:
                outp.append(self.tok_to_id[s])
            except KeyError:
                outp.append(self.unk)
        return outp

    def id_to_text(self, idxs: list):
        return [self.id_to_tok[i] for i in idxs]

    def train(self, sentences, add_whitespace: bool = False):
        sentences = self.preprocess(sentences)
        self.word_extractor.train(sentences)
        words = self.word_extractor.extract()
        self.cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

        # add whitespace tokens
        if add_whitespace:
            whitetokens = []
            for s in sentences:
                whitetokens += s.split(' ')
            whitetokens = list(set(whitetokens))

            for t in whitetokens:
                self.cohesion_score.update({t: 1.0})

        # rebuild the tokenizer and vocabulary with the freshly extracted scores
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def save_model(self, model_path: str, model_prefix: str):
        with open(os.path.join(model_path, model_prefix + '.model'), 'wb') as saveFile:
            dill.dump(self.cohesion_score, saveFile)

    def _build_dict(self):
        tok_to_id = {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3}
        id_to_tok = {0: '<unk>', 1: '<pad>', 2: '<sos>', 3: '<eos>'}
        for i, key in enumerate(self.cohesion_score.keys()):
            tok_to_id[key] = i + 4
            id_to_tok[i + 4] = key
        return tok_to_id, id_to_tok

    def preprocess(self, sents: list):
        n_str_pattern = re.compile(pattern='[\\d\\-?/_!\\.,]')
        doublespacing = re.compile(pattern='\\s\\s+')

        sents = [n_str_pattern.sub(repl=' ', string=w) for w in sents]
        sents = [doublespacing.sub(repl=' ', string=w).strip() for w in sents]
        sents = [u.lower() for u in sents]
        return sents

    def __len__(self):
        return len(self.cohesion_score)
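# Hypothetical usage of SoyTokenizer; the sentences and save path are examples only.
tk = SoyTokenizer()
tk.train(['예시 문장 하나', '예시 문장 둘'])
print(tk.tokenize('예시 문장'))
print(tk.text_to_id('예시 문장'))
tk.save_model('.', 'soy_cohesion')                     # writes ./soy_cohesion.model
tk2 = SoyTokenizer(model_path='./soy_cohesion.model')  # reload the saved scores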
temp = dict()
temp['noun'] = word.lower()
temp['score'] = score
temp['freq'] = freq
nouns_list.append(temp)

df_nouns = pd.DataFrame(nouns_list)
df_nouns = df_nouns.sort_values(by=['score'], ascending=False)
nouns_candidates_list = df_nouns.loc[df.score > NOUNS_THRESHOLD].noun.tolist()
print('nouns_candidates_list : {}\n'.format(len(nouns_candidates_list)))

print(''' words extractor ''')
word_extractor = WordExtractor(min_frequency=100,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)
word_extractor.train(corpus)
words = word_extractor.extract()
words = {k: v for k, v in words.items() if len(k) > 1}

words_list = list()
for k, v in words.items():
    temp = dict()
    cohesion = v.cohesion_forward
    branching_entropy = v.left_branching_entropy
    left_freq = v.leftside_frequency
    right_freq = v.rightside_frequency
    score = cohesion * branching_entropy
    temp['word'] = k.lower()
from soynlp.noun import NewsNounExtractor
from soynlp import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

corpus_path = "text/news/articles.txt"
#corpus_path = "text/news/input5-1.txt"
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

#for n_sent, sent in enumerate(corpus):
#    print('sent %d: %s %s\n' % (n_sent, sent, ''))

we = WordExtractor()
we.train(corpus)
scores = we.word_scores()
print(scores.keys())

'''
sentences = DoublespaceLineCorpus(corpus_path, iter_sent=False)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(sentences)

n = nouns.keys()
lists = ""
for a in n:
    lists += a
    lists += " "
print(lists)
'''

#top = sorted(nouns.items(), key=lambda x: -x[1].frequency)[:1]
#print(top)