def tokenizer_test():
    """Smoke tests for soynlp's RegexTokenizer, LTokenizer and MaxScoreTokenizer."""
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    def verify(actual, expected, template):
        # Surface the tokens actually produced when they deviate from the reference.
        if actual != expected:
            raise ValueError(template.format(actual))

    regex_tokenizer = RegexTokenizer()
    verify(regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!'),
           ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!'],
           "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}")

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    verify(ltokenizer.tokenize('데이터는 데이터센터의 데이데이'),
           ['데이터', '는', '데이터', '센터의', '데이', '데이'],
           "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}")
    # A small tolerance lets the longer candidate '데이터센터' win over '데이터'.
    verify(ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05),
           ['데이터', '는', '데이터센터', '의', '데이', '데이'],
           "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}")

    maxscore_tokenizer = MaxScoreTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    verify(maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이'),
           ['데이터', '는', '데이터', '센터의', '데이', '데이'],
           "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}")

    print('all tokenizer tests have been successed\n\n')
def data_tokenize(news_title, tdm_vocab):
    """Tokenize news titles with a cohesion-scored LTokenizer, keeping only
    tokens present in `tdm_vocab`.

    Parameters
    ----------
    news_title : iterable of str
        Corpus of titles; also used to train the WordExtractor.
    tdm_vocab : container of str
        Vocabulary filter applied to the L-parts of each tokenization.

    Returns
    -------
    list of list of str
        One filtered token list per input title.
    """
    word_extractor = WordExtractor(
        min_frequency=100, # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(news_title)
    words = word_extractor.extract()
    # Score each extracted word by its forward cohesion only.
    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    tokenizer = LTokenizer(scores=cohesion_score)
    cluster_data = []
    bert_null_list = []  # NOTE(review): never used in this function
    for title in news_title:
        title = test(title)  # presumably a cleaning helper defined elsewhere — TODO confirm
        # flatten=False yields [L-part, R-part] pairs; we keep only the L-part (i[0]).
        sent = tokenizer.tokenize(title, flatten=False)
        sentence = []
        for i in sent:
            if i[0] in tdm_vocab:
                sentence.append(i[0])
        cluster_data.append(sentence)
    return cluster_data
def prediction(text):
    """Greedy-decode an English reply for the Korean input `text`.

    Loads the cohesion-score tokenizer and the kor/eng torchtext fields from
    pickles, encodes the input with a Transformer, decodes token-by-token
    until '<eos>', and returns the detokenized reply string.

    Fixes vs. original: pickle files are closed via context managers, and the
    dead `eos_index = 34` assignment was removed.
    """
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields (context managers close the pickles)
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    with open('pickles/kor.pickle', 'rb') as pickle_kor:
        kor = pickle.load(pickle_kor)
    with open('pickles/eng.pickle', 'rb') as pickle_eng:
        eng = pickle.load(pickle_eng)
    eos_idx = eng.vocab.stoi['<eos>']

    # select model and load trained model
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(text)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source = torch.LongTensor(indexed).unsqueeze(0).to(params.device)  # [1, source_len]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)       # [1, max_len]

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    # greedy decoding: feed the argmax token back in until '<eos>' is produced
    for i in range(0, params.max_len):
        if next_symbol == eos_idx:
            break
        target[0][i] = next_symbol
        decoder_output, _ = model.decoder(target, source, encoder_output)  # [1, target length, output dim]
        prob = decoder_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    # NOTE(review): slicing by eos_idx (a *vocabulary index*) looks suspicious —
    # the position of '<eos>' in `target` was probably intended. Left unchanged.
    print(eos_idx)
    target = target[0][:eos_idx].unsqueeze(0)

    # translation_tensor = [target length] filled with word indices
    target, attention_map = model(source, target)
    target = target.squeeze(0).max(dim=-1)[1]

    # index 3 is assumed to be a special token filtered from the reply — TODO confirm
    reply_token = [eng.vocab.itos[token] for token in target if token != 3]
    print(reply_token)
    reply = ' '.join(reply_token)
    return reply
class LTokenizerKorean(SpecialTokenizer):
    """Root-oriented (L-part) Korean tokenizer backed by soynlp's LTokenizer."""

    def __init__(self, scores=None):
        from soynlp.tokenizer import LTokenizer
        # `scores` may carry caller preferences; when omitted, LTokenizer
        # falls back to its own cohesion-based scoring.
        self.inst = LTokenizer(scores=scores)
        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        return self.inst.tokenize(args[0])
class LTokenizerKorean(SpecialTokenizer):
    """Thin callable wrapper around soynlp's LTokenizer."""

    def __init__(self, scores=None):
        from soynlp.tokenizer import LTokenizer
        # Without explicit scores, LTokenizer computes cohesion scores itself.
        self.inst = LTokenizer(scores=scores)
        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        sentence = args[0]
        tokens = self.inst.tokenize(sentence)
        return tokens
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    """Find words whose MeCab morpheme split looks wrong and emit MeCab
    user-dictionary entries (NNP) for them.

    Parameters
    ----------
    lst : iterable of (count, word) pairs to inspect
    corpus_fname : path of the raw corpus (read whole for frequency checks)
    output_fname : path for generated MeCab dictionary lines
    log_fname : path for a tab-separated audit log
    """
    mcab = mecab.MeCab()
    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # Combined score: forward cohesion * exp(right branching entropy).
    scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2, \
            open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()
        for item in lst:
            cnt, word = item
            # Skip rare words and single characters.
            if cnt < 10 or len(word) == 1:
                continue
            tokens = mcab.morphs(word)
            # MeCab keeps the word whole: nothing to check.
            if len(tokens) == 1:
                continue
            soy_tokens = soy_tokenizer.tokenize(word)
            # Both tokenizers agree on the split: assume it is fine.
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue
            if is_all_nng(mcab.pos(word)):
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue
            # Only consider words the soy tokenizer keeps whole.
            if len(soy_tokens) > 1:
                continue
            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
            words = re.findall(' '.join(tokens), sentences)
            # If the split form appears less than 5% as often as the unsplit
            # word, treat the MeCab split as an analysis error.
            if len(words) < (cnt * 0.05):
                # Decompose the final syllable to decide the T/F (jongseong) flag.
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.writelines(dic_line + '\n')
                f3.writelines("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n')
def predict(config):
    """Translate the Korean sentence in `config.input` to English and print it.

    Fixes vs. original: pickle files are closed via context managers, and the
    local variable shadowing the `input` builtin was renamed.
    """
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields (context managers close the pickles)
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    with open('pickles/kor.pickle', 'rb') as pickle_kor:
        kor = pickle.load(pickle_kor)
    with open('pickles/eng.pickle', 'rb') as pickle_eng:
        eng = pickle.load(pickle_eng)

    # select model and load trained model
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    cleaned = clean_text(config.input)

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(cleaned)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source = torch.LongTensor(indexed).unsqueeze(0).to(
        params.device)  # [1, source length]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    # greedy decoding, one target position at a time
    for i in range(0, params.max_len):
        target[0][i] = next_symbol
        dec_output = model.decoder(target, source, encoder_output)
        # dec_output = [1, target length, output dim]
        prob = dec_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    # translation_tensor = [target length] filled with word indices
    target = model(source, target)
    target = torch.argmax(target.squeeze(0), -1)
    # drop the leading '<sos>' position
    translation = [eng.vocab.itos[token] for token in target][1:]
    translation = ' '.join(translation)

    print(f'kor> {config.input}')
    print(f'eng> {translation.capitalize()}')
def tokenizer_test():
    """Exercise the three soynlp tokenizers against known-good outputs."""
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    scores = {'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38}
    regex_tokenizer = RegexTokenizer()
    ltokenizer = LTokenizer(scores)
    maxscore_tokenizer = MaxScoreTokenizer(scores)

    # (produced tokens, reference tokens, failure-message template)
    cases = [
        (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!'),
         ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!'],
         "regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}"),
        (ltokenizer.tokenize('데이터는 데이터센터의 데이데이'),
         ['데이터', '는', '데이터', '센터의', '데이', '데이'],
         "ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}"),
        (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05),
         ['데이터', '는', '데이터센터', '의', '데이', '데이'],
         "ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}"),
        (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이'),
         ['데이터', '는', '데이터', '센터의', '데이', '데이'],
         "maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}"),
    ]
    for produced, reference, template in cases:
        if produced != reference:
            raise ValueError(template.format(produced))

    print('all tokenizer tests have been successed\n')
def soy_tokenize(model_fname, input_sentence):
    """Tokenize a sentence with an LTokenizer scored from a pre-trained
    soynlp WordExtractor model; return the space-joined tokens."""
    extractor = WordExtractor(min_frequency=100,
                              min_cohesion_forward=0.05,
                              min_right_branching_entropy=0.0)
    extractor.load(model_fname)
    raw_scores = extractor.word_scores()

    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # Score = forward cohesion * exp(right branching entropy):
    # (1) the characters of the word co-occur tightly together, and
    # (2) many different particles/endings/words follow it on the right.
    word_scores = {}
    for word, score in raw_scores.items():
        word_scores[word] = (score.cohesion_forward
                             * math.exp(score.right_branching_entropy))

    tokens = LTokenizer(scores=word_scores).tokenize(input_sentence)
    return ' '.join(tokens)
def predict_sequential(config):
    """Predict the sentiment of `config.input` with an RNN/LSTM model and print it.

    Fixes vs. original: pickle files are closed via context managers, and only
    the selected model is constructed (the original eagerly built both).
    """
    # load tokenizer and torchtext Field (context managers close the pickles)
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    with open('pickles/text.pickle', 'rb') as pickle_vocab:
        text = pickle.load(pickle_vocab)
    pad_idx = text.vocab.stoi[text.pad_token]

    # map the model name to its class; instantiate only the selected one
    model_type = {
        'vanilla_rnn': RNN,
        'bidirectional_lstm': BidirectionalLSTM,
    }

    # select model and load trained model
    model = model_type[config.model](config, pad_idx)
    model.load_state_dict(torch.load(config.save_model))
    model.eval()

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(config.input)
    indexed = [text.vocab.stoi[token] for token in tokenized]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)  # [input length]
    tensor = tensor.unsqueeze(1)  # [input length, 1] for adding batch dimension
    length_tensor = torch.LongTensor(length)

    prediction = torch.sigmoid(model(tensor, length_tensor))
    label = torch.round(prediction)

    label = 'Positive' if label == 1 else 'Negative'

    sentiment_percent = prediction.item()
    print(f'[in] >> {config.input}')
    print(f'[out] >> {sentiment_percent*100:.2f} % : {label}')
def predict_cnn(config):
    """Predict the sentiment of `config.input` with the CNN model and print it.

    Fixes vs. original: pickle files are closed via context managers.
    """
    # load tokenizer and torchtext Field (context managers close the pickles)
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    with open('pickles/text.pickle', 'rb') as pickle_vocab:
        text = pickle.load(pickle_vocab)

    model = CNN(config)
    model.load_state_dict(torch.load(config.save_model))
    model.to(device)
    model.eval()

    tokenized = tokenizer.tokenize(config.input)
    min_len = config.filter_sizes[-1]

    # if user's input sentence is shorter than the largest filter size,
    # add pad tokens to input sentence
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))

    indexed = [text.vocab.stoi[token] for token in tokenized]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)

    prediction = torch.sigmoid(model(tensor, length_tensor))
    label = torch.round(prediction)

    label = 'Positive' if label == 1 else 'Negative'

    sentiment_percent = prediction.item()
    print(f'[in] >> {config.input}')
    print(f'[out] >> {sentiment_percent*100:.2f} % : {label}')
def content_to_token(text_file_name):
    """Read a text file, clean each line, tokenize with the module-level
    LTokenizer scores, and write the tokens to a sibling .csv file.

    Fixes vs. original: removed three no-op `re.sub(..., lines[0])` calls whose
    results were discarded, and the CSV output file is now closed via a
    context manager.
    """
    print("opening file " + text_file_name)
    with open(text_file_name, 'r', encoding="utf-8") as f:
        lines = f.read().splitlines()

    # strip brackets/quotes and every remaining non-word character
    text = []
    for line in lines:
        line = re.sub(r"[\[\]<>~]", ' ', line)
        line = re.sub(r"['~]", ' ', line)
        line = re.sub(r'"', ' ', line)
        line = re.sub(r'\W', ' ', line)
        text.append(line)

    # scores_dictionary, word_dict, exception_list are module-level globals
    ltokenizer = LTokenizer(scores=scores_dictionary)

    print("making list of words")
    words = []
    for sent in text:
        conclude_sent = []
        # flatten=False keeps [L-part, R-part] pairs; we use only the L-part.
        pre_list = ltokenizer.tokenize(sent, flatten=False)
        for LR_list in pre_list:
            word = LR_list[0]
            if word in word_dict:
                word = word_dict[word]  # normalize via the synonym mapping
            if word not in exception_list:
                conclude_sent.append(word)
        words.append(conclude_sent)

    token_file_name = text_file_name[:-4] + '.csv'
    with open(token_file_name, 'w', newline="") as out:
        wr = csv.writer(out)
        for word in words:
            wr.writerow(word)
def pad_sentence(dataframe, min_len):
    """
    Pad every document so it is at least as long as the largest CNN filter.

    To use CNN, all inputs need at least the length of the largest filter
    size; shorter documents are padded in place with '<pad>' tokens.

    Args:
        dataframe: (DataFrame) dataframe used to train and validate the model
        min_len: (integer) the largest CNN filter size, used as the minimum
            token count per document

    Returns:
        DataFrame: the same dataframe with its 'document' column padded

    Fixes vs. original: the tokenizer pickle is closed via a context manager,
    and the empty Returns section was filled in.
    """
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    for i, row in dataframe.iterrows():
        tokenized = tokenizer.tokenize(row.document)
        if len(tokenized) < min_len:
            tokenized += ['<pad>'] * (min_len - len(tokenized))
        padded_sent = ' '.join(tokenized)
        dataframe.at[i, 'document'] = padded_sent

    return dataframe
def soy_tokenize(corpus_fname, model_fname, output_fname):
    """Tokenize every line of `corpus_fname` with a score-based LTokenizer and
    write the space-joined tokens to `output_fname`, one line per input line.

    Parameters
    ----------
    corpus_fname : path of the input corpus (UTF-8, one sentence per line)
    model_fname : path of a pre-trained soynlp WordExtractor model
    output_fname : path of the tokenized output file
    """
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0
                                   )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # Score = forward cohesion * exp(right branching entropy):
    # (1) the characters of the word co-occur tightly together, and
    # (2) many different particles/endings/words follow it on the right.
    scores = {
        key: (scores[key].cohesion_forward *
              math.exp(scores[key].right_branching_entropy))
        for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            # collapse long emoticon repetitions (e.g. ㅋㅋㅋㅋㅋ) to 3 repeats
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            tokens = tokenizer.tokenize(normalized_sent)
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
class SoyTokenizer:
    """Unsupervised tokenizer: soynlp LTokenizer over cohesion scores, plus a
    small token<->id vocabulary with <unk>/<pad>/<sos>/<eos> specials.

    Fix vs. original: train() now rebuilds self.tokenizer after recomputing
    the cohesion scores — previously the tokenizer kept the stale scores it
    was constructed with.
    """

    def __init__(self, model_path: str = None):
        self.word_extractor = WordExtractor(min_frequency=5,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)
        # special-token ids
        self.unk = 0
        self.pad = 1
        self.sos = 2
        self.eos = 3
        if model_path:
            # a saved model is just the dill-pickled cohesion-score dict
            with open(model_path, 'rb') as readFile:
                self.cohesion_score = dill.load(readFile)
        else:
            self.cohesion_score = {}
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def tokenize(self, sent: str):
        """Tokenize one sentence into a list of tokens."""
        return self.tokenizer.tokenize(sent)

    def text_to_id(self, sent: str):
        """Tokenize a sentence and map each token to its id (<unk> if absent)."""
        toks = self.tokenize(sent)
        outp = []
        for s in toks:
            try:
                outp.append(self.tok_to_id[s])
            except KeyError:
                outp.append(self.unk)
        return outp

    def id_to_text(self, idxs: list):
        """Map a list of ids back to their tokens."""
        return [self.id_to_tok[i] for i in idxs]

    def train(self, sentences, add_whitespace: bool = False):
        """Train the word extractor on `sentences` and refresh the
        cohesion scores, the tokenizer, and the vocabulary."""
        sentences = self.preprocess(sentences)
        self.word_extractor.train(sentences)
        words = self.word_extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in words.items()
        }
        # add whitespace tokens
        if add_whitespace:
            whitetokens = []
            for s in sentences:
                whitetokens += s.split(' ')
            whitetokens = list(set(whitetokens))
            for t in whitetokens:
                self.cohesion_score.update({t: 1.0})
        # BUG FIX: rebuild the tokenizer so it uses the new scores
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def save_model(self, model_path: str, model_prefix: str):
        """Persist the cohesion-score dict as `<model_path>/<model_prefix>.model`."""
        with open(os.path.join(model_path, model_prefix + '.model'),
                  'wb') as saveFile:
            dill.dump(self.cohesion_score, saveFile)

    def _build_dict(self):
        """Build token<->id maps; ids 0-3 are reserved for the specials."""
        tok_to_id = {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3}
        id_to_tok = {0: '<unk>', 1: '<pad>', 2: '<sos>', 3: '<eos>'}
        for i, key in enumerate(self.cohesion_score.keys()):
            tok_to_id[key] = i + 4
            id_to_tok[i + 4] = key
        return tok_to_id, id_to_tok

    def preprocess(self, sents: list):
        """Lowercase, strip digits/simple punctuation, collapse whitespace."""
        n_str_pattern = re.compile(pattern='[\\d\\-?/_!\\.,]')
        doublespacing = re.compile(pattern='\\s\\s+')
        sents = [n_str_pattern.sub(repl=' ', string=w) for w in sents]
        sents = [doublespacing.sub(repl=' ', string=w).strip() for w in sents]
        sents = [u.lower() for u in sents]
        return sents

    def __len__(self):
        # vocabulary size excluding the four special tokens
        return len(self.cohesion_score)
for noun, score in noun_scores.items() } print("update combining scores") combined_scores.update({ subword: cohesion for subword, cohesion in cohesion_scores.items() if not (subword in combined_scores) }) ## tokenizing #%% print("making tokenizer object") ltokenizer = LTokenizer(scores=combined_scores) #%% print("making list of words") words = [ltokenizer.tokenize(sent) for sent in text] word_list = [] f = open('token_words.csv', 'w', newline="") wr = csv.writer(f) for word in words: wr.writerow(word) word_list.append(word) f.close() with open('token_word_list.pickle', 'wb') as fw: pickle.dump(word_list, fw) print("dumping complete")
trans_raw.append(chat2) else: trans_raw.append(chat) return trans_raw raw_time, raw_chat = read_data("399807785.csv") raw_chat = laugh_trans(raw_chat) word_extractor.train(raw_chat) test_words = word_extractor.extract() test_score = {word:score.cohesion_forward for word, score in test_words.items()} tokenizer = LTokenizer(scores=test_score) test_list = [] cnt = 0 for sent in raw_chat: test_list.append([tokenizer.tokenize(sent)]) cnt += 1 test_tokens = [token for data in test_list for token in data[0]] test_text = nltk.Text(test_tokens) selected_tokens= [t[0] for t in test_text.vocab().most_common(500)] def term_frequency(data): return [data.count(word) for word in selected_tokens] test_x = [term_frequency(d) for d in test_list] X_test = np.asarray(test_x).astype('float32') cnt = 0 print(len(model.predict_classes(X_test))) #13374 for i in range(len(X_test)): if model.predict_classes(X_test)[i] == [0]:
class KoreanTokenizer:
    '''
    A class to tokenize a Korean sentence.

    Attributes
    ----------
    **kwargs
        | Keyword arguments for WordExtractor object (see soynlp.word.WordExtractor)

    Methods
    -------
    train | Trains KoreanTokenizer on a corpus
    tokenize | Tokenizes the input sentence and returns its tokens
    '''
    # NOTE: these imports bind class attributes; method bodies cannot see them
    # as bare names, so methods resolve them via KoreanTokenizer.<name>.
    from soynlp.word import WordExtractor
    from soynlp.utils import check_corpus
    from soynlp.utils import DoublespaceLineCorpus
    from soynlp.tokenizer import LTokenizer

    def __init__(self, **kwargs):
        if 'sents' in kwargs.keys():
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored.")
        # BUG FIX: resolve WordExtractor through the class attribute — the
        # class-level import is not visible as a bare name in method scope.
        self.WordExtractor = KoreanTokenizer.WordExtractor(**kwargs)
        # BUG FIX: initialize so tokenize() before train() warns instead of
        # raising AttributeError.
        self.word_score = {}

    def train(self, text, **kwargs):
        '''
        A method to train the KoreanTokenizer object.

        Attributes
        ----------
        text : iterable or DoublespaceLineCorpus
            | A input text in any iterable type (e.g. list)
            | or a DoublespaceLineCorpus object (see soynlp.utils.DoublespaceLineCorpus)
        **kwargs
            | Keyword arguments for WordExtractor.train() method (see soynlp.word.WordExtractor.train)
        '''
        if 'sents' in kwargs.keys():
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored; WordExtractor is trained on 'text' argument only.")

        self.WordExtractor.train(text, **kwargs)
        self.words = self.WordExtractor.extract()

        from math import exp  # BUG FIX: `exp` was a bare (possibly undefined) global

        def calculate_word_score(word, score):
            # word score = cohesion * exp(right branching entropy)
            cohesion = score.cohesion_forward
            branching_entropy = score.right_branching_entropy
            word_score = cohesion * exp(branching_entropy)
            return word_score

        # BUG FIX: the original iterated over an undefined local `words`
        # instead of `self.words`.
        self.word_score = {word: calculate_word_score(word, score)
                           for word, score in self.words.items()}

    def tokenize(self, text, **kwargs):
        '''
        A method to tokenize the input text

        Attributes
        ----------
        text : str
            | An input text in str type
        **kwargs
            | Keyword arguments for LTokenizer.tokenize() method (see soynlp.tokenizer.LTokenizer.tokenize)
        '''
        if 'sentence' in kwargs.keys():
            del kwargs['sentence']
            print("WARNING: 'sentence' argument is ignored; word_tokenizer tokenizes 'text' argument only.")

        if not self.word_score:
            print('KoreanTokenizer should be trained first, before tokenizing.')
            return

        self.tokenizer = KoreanTokenizer.LTokenizer(scores=self.word_score)
        result = self.tokenizer.tokenize(text, **kwargs)
        return result
'있다', '수', '에', '이', '한다', '있습니다', '것으로', '있는', '것', '할', '및', 'the', 'http', 'https', 'sunday', 'joins', 'co', 'and', 'kr', '고', '것이다', '한', 'is', 'www', 'for', 'a', 'of', 'in', 'on', '중', '더', '대', '통해' ] ## noun score # 명사만한 noun_extractor = LRNounExtractor_v2(verbose=True, extract_compound=False) # 복합어 추출 X nouns = noun_extractor.train_extract(text) # list of str like noun_scores = {noun: score.score for noun, score in nouns.items()} print("extracting noun") ltokenizer = LTokenizer(scores=noun_scores) print("making list of words") words = [] for sent in text: conclude_sent = [] #flatten을 False로 주어서 [L명사, R조사]형태로 분류하게 만듦. pre_list = ltokenizer.tokenize(sent, flatten=False) for LR_list in pre_list: if LR_list[0] not in exception_list: conclude_sent.append(LR_list[0]) words.append(conclude_sent) f = open('noun_token_words_4.csv', 'w', newline="") wr = csv.writer(f) for word in words: wr.writerow(word) f.close()
for noun in nouns: nounScore[noun] = nouns[noun].score dictionary[noun] = index index += 1 del index #여기서 빼낸 명사랑, 벡터화~키워드 추출에서 사용하는 단어의 수가 다른듯한데 nounData = list(dictionary.keys()) ''' writefp = open("C:\\users\\user\Desktop\SWProject\SWproject_getKeywords\WordScore_pebble.txt", "w", encoding = "utf-8") writefp.write(str(nounScore)) writefp.close() ''' tokenizer = LTokenizer(scores=nounScore) tokenized_text = tokenizer.tokenize(text) vectorizer = BaseVectorizer(tokenizer=tokenizer, min_tf=0, max_tf=10000, min_df=0, max_df=1.0, stopwords=None, lowercase=True, verbose=True) sents.iter_sent = False #x = vectorizer.fit_transform(sents) import tensorflow as tf vectorPath = "./vectorizedPebble.mtx"
word_extractor.train(news_title) words = word_extractor.extract() cohesion_score = { word: score.cohesion_forward for word, score in words.items() } tokenizer = LTokenizer(scores=cohesion_score) # # --------------------------word2vec 데이터 전처리-------------------- cluster_data = [] for k, title in enumerate(news_title): title = test(title) sent = tokenizer.tokenize(title, flatten=False) sentence = [] # sent -> ['단어', ''] for i in sent: sentence.append(i[0]) cluster_data.append(sentence) # --------------------------Word2Vec embedding-------------------- from gensim.models import Word2Vec model = Word2Vec(cluster_data, size=100, window=3, min_count=5,
def predict(config):
    """Translate the Korean sentence in `config.input` to English with the
    selected seq2seq variant and print the result (plus an attention map for
    the attention model).

    Fixes vs. original: pickle files are closed via context managers, and the
    local variable shadowing the `input` builtin was renamed.
    """
    params_dict = {
        'seq2seq': Params('configs/params.json'),
        'seq2seq_gru': Params('configs/params_gru.json'),
        'seq2seq_attention': Params('configs/params_attention.json'),
    }
    params = params_dict[config.model]

    # load tokenizer and torchtext Fields (context managers close the pickles)
    with open('pickles/tokenizer.pickle', 'rb') as pickle_tokenizer:
        cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    with open('pickles/kor.pickle', 'rb') as pickle_kor:
        kor = pickle.load(pickle_kor)
    with open('pickles/eng.pickle', 'rb') as pickle_eng:
        eng = pickle.load(pickle_eng)

    model_type = {
        'seq2seq': Seq2Seq,
        'seq2seq_gru': Seq2SeqGRU,
        'seq2seq_attention': Seq2SeqAttention,
    }

    # select model and load trained model
    model = model_type[config.model](params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    cleaned = clean_text(config.input)

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(cleaned)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source_length = torch.LongTensor([len(indexed)]).to(params.device)

    tensor = torch.LongTensor(indexed).unsqueeze(1).to(
        params.device)  # [source length, 1]: unsqueeze to add batch size

    if config.model == 'seq2seq_attention':
        translation_tensor_logits, attention = model(tensor, source_length,
                                                     None, 0)
        # translation_tensor_logits = [target length, 1, output dim]
        translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
        # translation_tensor = [target length] filled with word indices
        translation = [eng.vocab.itos[token] for token in translation_tensor][1:]
        attention = attention[1:]
        display_attention(tokenized, translation, attention)
    else:
        translation_tensor_logits = model(tensor, source_length, None, 0)
        translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
        translation = [eng.vocab.itos[token] for token in translation_tensor][1:]

    translation = ' '.join(translation)

    print(f'kor> {config.input}')
    print(f'eng> {translation.capitalize()}')
class Embedding:
    """FastText-based embedding pipeline over a dataframe with TITLE and
    TEXTCONTENT columns: tokenizes with a soynlp LTokenizer, trains or loads
    a FastText model, and converts tokens to vectors / vocabulary indices.
    """

    # on-disk locations for the FastText model and the pickled tokenizer
    # NOTE(review): one path uses '/' and the other '\\' — presumably
    # Windows-oriented; confirm before deploying elsewhere.
    MODEL_SAVED_DIR = "saved_model/fasttext.model"
    TOKENIZER_SAVED_DIR = "saved_model\\tokenizer.pkl"

    def __init__(self, dataset: pd.DataFrame, word_train: bool):
        # `word_train` False: reuse saved FastText + tokenizer;
        # True: extract scores, tokenize, save tokenizer, train FastText.
        self.dataset = dataset
        self.corpus = dataset["TITLE"] + dataset["TEXTCONTENT"]
        if word_train == False:
            self.fasttext = FastText.load(self.MODEL_SAVED_DIR)
            self._load_tokenizer()
            self._tokenize()
        else:
            self._extracte()
            self._tokenize()
            self._save_tokenizer()
            self._train()
        # ids 0-3 are reserved for specials; vocabulary words start at 4
        self.idx_word_dict = dict(
            zip(np.arange(4, len(self.fasttext.wv.vectors) + 4),
                self.fasttext.wv.index2word))
        self.idx_word_dict[0] = '<PAD>'
        self.idx_word_dict[1] = '<STA>'
        self.idx_word_dict[2] = '<EOS>'
        self.idx_word_dict[3] = '<UNK>'

    def _extracte(self) -> None:
        # Train a WordExtractor and build an LTokenizer from cohesion scores.
        self.extractor = WordExtractor()
        self.extractor.train(self.corpus)
        self.words = self.extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in self.words.items()
        }
        self.tokenizer = LTokenizer(scores=self.cohesion_score)

    def _tokenize(self) -> pd.DataFrame:
        # Replace raw strings with token lists, in the corpus and both columns.
        self.corpus = self.corpus.apply(
            lambda text: self.tokenizer.tokenize(text))
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            lambda text: self.tokenizer.tokenize(text))
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            lambda text: self.tokenizer.tokenize(text))

    def _save_tokenizer(self) -> None:
        # Persist the tokenizer so later runs can skip extraction.
        with open(self.TOKENIZER_SAVED_DIR, "wb") as f:
            pickle.dump(self.tokenizer, f, pickle.HIGHEST_PROTOCOL)

    def _load_tokenizer(self) -> None:
        with open(self.TOKENIZER_SAVED_DIR, "rb") as f:
            self.tokenizer = pickle.load(f)

    def _train(self) -> None:
        # Train FastText (100-dim vectors) on the tokenized corpus and save it.
        self.fasttext = FastText(sentences=self.corpus,
                                 size=100,
                                 window=5,
                                 min_count=1,
                                 iter=100)
        self.fasttext.save(self.MODEL_SAVED_DIR)

    def dataset_to_embedding(self) -> pd.DataFrame:
        # Fix lengths (titles to 10 tokens, bodies to 32), then map titles to
        # index arrays (TITLE_IDX) and both columns to vector arrays.
        self.dataset["TITLE_IDX"] = self.dataset["TITLE"].apply(
            self._sentence_length_fix, args=[10])
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            self._sentence_length_fix, args=[10])
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            self._sentence_length_fix, args=[32])
        for index, value in self.dataset["TITLE_IDX"].iteritems():
            assert len(value) == 10
        for index, value in self.dataset["TITLE"].iteritems():
            assert len(value) == 10
        for index, value in self.dataset["TEXTCONTENT"].iteritems():
            assert len(value) == 32
        self.dataset["TITLE_IDX"] = self.dataset["TITLE_IDX"].apply(
            lambda tokenized: np.array(
                [self._word_to_idx(token) for token in tokenized]))
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            lambda tokenized: np.array(
                [self._word_to_vec(token) for token in tokenized]))
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            lambda tokenized: np.array(
                [self._word_to_vec(token) for token in tokenized]))
        return self.dataset

    def embedding_to_sentence(self, target: list or np.array) -> list:
        # Inverse of _word_to_vec over a sequence of vectors.
        return [self._vec_to_word(vector) for vector in target]

    def _sentence_length_fix(self, sentence: list or np.array,
                             length: int) -> list or np.array:
        # Pad with '<PAD>' (mutates the list in place) or truncate to `length`.
        sentence_length = len(sentence)
        if sentence_length < length:
            while len(sentence) < length:
                sentence.append('<PAD>')
        elif sentence_length > length:
            sentence = sentence[:length]
        return sentence

    def _vec_to_word(self, vector) -> str:
        # Special tokens are encoded as the first four rows of a 100-dim
        # identity matrix; anything else is looked up by vector similarity.
        if np.array_equal(vector, np.eye(100, dtype=np.float32)[0]):
            return '<PAD>'
        elif np.array_equal(vector, np.eye(100, dtype=np.float32)[1]):
            return '<STA>'
        elif np.array_equal(vector, np.eye(100, dtype=np.float32)[2]):
            return '<EOS>'
        elif np.array_equal(vector, np.eye(100, dtype=np.float32)[3]):
            return '<UNK>'
        return self.fasttext.wv.most_similar(positive=[vector], topn=1)[0][0]

    def _word_to_vec(self, word) -> np.array:
        # Unknown/out-of-vocabulary words fall back to the <UNK> one-hot row.
        try:
            if word == '<PAD>':
                return np.eye(100, dtype=np.float32)[0]
            elif word == '<STA>':
                return np.eye(100, dtype=np.float32)[1]
            elif word == '<EOS>':
                return np.eye(100, dtype=np.float32)[2]
            elif word == '<UNK>':
                return np.eye(100, dtype=np.float32)[3]
            return self.fasttext.wv.word_vec(word)
        except:
            return np.eye(100, dtype=np.float32)[3]

    def _word_to_idx(self, word) -> int:
        # Reverse lookup in idx_word_dict; 3 (<UNK>) when the word is absent.
        try:
            return list(self.idx_word_dict.keys())[list(
                self.idx_word_dict.values()).index(word)]
        except:
            return 3

    def _idx_to_word(self, idx) -> str:
        return self.idx_word_dict[idx]
class KoreanTokenizer:
    '''
    A class to tokenize a Korean sentence.

    Attributes
    ----------
    pre_trained : bool
        | If True, one of pre-trained Korean analyzer, provided by KoNLPy, will be used (default : True)
        | If False, unsupervised KoreanTokenizer is initialized, based on soynlp L-Tokenizer. Argument 'anaylzer' is ignored.
    analyzer : str
        | Type of KoNLPy analyzer (default : Hannanum)
        | Available analyzers are: Hannanum, Kkma, Komoran, Mecab, Okt
        | Note: Mecab needs to be installed separately before being used.

    Methods
    -------
    train | Trains KoreanTokenizer on a corpus, only when 'pre_trained' argument is False.
    tokenize | Tokenizes the input sentence and returns its tokens.
    extract_noun | Extracts nouns from the input sentence.
    '''

    def __init__(self, pre_trained=True, analyzer='Hannanum'):
        self.pre_trained = pre_trained
        # Map the analyzer name to the corresponding KoNLPy tagger.
        if analyzer == 'Hannanum':
            self.analyzer = tag.Hannanum()
        elif analyzer == 'Kkma':
            self.analyzer = tag.Kkma()
        elif analyzer == 'Komoran':
            self.analyzer = tag.Komoran()
        elif analyzer == 'Mecab':
            self.analyzer = tag.Mecab()
        elif analyzer == 'Okt':
            self.analyzer = tag.Okt()
        else:
            # Unknown analyzer name is only an error in pre-trained mode;
            # unsupervised mode ignores the argument entirely.
            if pre_trained == False:
                pass
            else:
                print('Enter a valid KoNLPy analyzer name.\n\tavailable: Hannanum, Kkma, Komoran, Mecab, Okt')

        # Unsupervised components (used when pre_trained is False).
        self.WordExtractor = WordExtractor(min_frequency=0)
        self.noun_extractor = LRNounExtractor(verbose=False)
        self.word_score = {}  # filled by train(); empty means "not trained yet"

    def train(self, text):
        '''
        A method to train the KoreanTokenizer on a corpus.
        If KoreanTokenizer.pre_trained == False, this method does nothing.

        Attributes
        ----------
        text : str
            | An input text in str type
        '''
        if self.pre_trained == True:
            print('A pre-trained KoreanTokenizer is being used. No need to train it.')
            return
        else:
            self.WordExtractor.train(text)
            self.words = self.WordExtractor.extract()

            def calculate_word_score(word, score):
                # word score = cohesion * exp(right branching entropy)
                # NOTE(review): `exp` must be importable at module scope
                # (e.g. `from math import exp`) — confirm.
                cohesion = score.cohesion_forward
                branching_entropy = score.right_branching_entropy
                word_score = cohesion * exp(branching_entropy)
                return word_score

            self.word_score = {word:calculate_word_score(word, score) for word, score in self.words.items()}

    def tokenize(self, text):
        '''
        A method to tokenize input text.

        Attriubutes
        -----------
        text : str
            | An input text to be tokenized

        Output
        ------
        tokens : list
            | List of tokens (in str) that consist of the input text
        '''
        if self.pre_trained == True:
            return self.analyzer.morphs(text)
        else:
            if not self.word_score:
                print('An unsupervised KoreanTokenizer should be trained first, before tokenizing.')
                return

            self.tokenizer = LTokenizer(scores=self.word_score)
            result = self.tokenizer.tokenize(text)
            return result

    def extract_noun(self, text):
        '''
        A method to extract nouns from input text

        Attributes
        ----------
        text : str
            | An input text from which nouns will be extracted

        Output
        ------
        nouns : list
            | List of noun tokens (in str) in the input text
        '''
        # NOTE(review): only the pre-trained branch is visible here; the
        # unsupervised branch appears to be truncated in this copy.
        if self.pre_trained == True:
            return self.analyzer.nouns(text)
class Tokenizer:
    """Korean tokenizer built on soynlp's LTokenizer.

    Words are scored with a noun score boosted by a cohesion score (both
    loaded from pre-computed pickles); documents are preprocessed
    (HTML entities, URLs, emails, hashtags, mentions, emoji) and tokenized
    into (token, 'L'/'R') tuples.
    """

    def __init__(self):
        # Load cohesion scores pre-computed by soynlp's WordExtractor.
        with open('utils/words.p', 'rb') as rf:
            words = pickle.load(rf)
        cohesion_score = {
            word: score.cohesion_forward
            for word, score in words.items()
        }
        # Keep only positive scores, ordered high-to-low.
        cohesion_score = {
            k: v
            for k, v in sorted(cohesion_score.items(), key=lambda item: item[1], reverse=True)
            if v > 0
        }
        # Load noun scores pre-computed by soynlp's noun extractor.
        with open('utils/nouns.p', 'rb') as rf:
            nouns = pickle.load(rf)
        noun_score = {noun: score.score for noun, score in nouns.items()}
        # Boost each noun by its cohesion so noun-like L-parts win ties.
        noun_cohesion_score = {
            noun: score + cohesion_score.get(noun, 0)
            for noun, score in noun_score.items()
        }
        self._noun_cohesion_score = {
            k: v
            for k, v in sorted(noun_cohesion_score.items(), key=lambda item: item[1], reverse=True)
            if v > 0
        }
        self._soy = LTokenizer(scores=self._noun_cohesion_score)
        self._is_flatten = False   # keep (L, R) pair structure (no_flatten)
        self._is_remove_r = False  # keep R parts (no_remove)
        self._emo = get_emoji_regexp()  # pre-compiled emoji regex

    def _preprocess(self, doc: str) -> str:
        """Preprocessing: normalize text and tag URLs/images/emails/hashtags/mentions."""
        doc = str(doc).lower().strip()  # force str, lowercase, strip
        # Drop every unicode char except digits/letters/whitespace/emoji and
        # ordinary ASCII punctuation.
        doc = re.sub(rf'([^{self._emo.pattern}{string.punctuation}\s\w]+)', ' ', doc)
        doc = re.sub(r'\s', ' ', doc)  # collapse whitespace characters to plain spaces
        # BUGFIX: these HTML-entity substitutions had been decoded into their
        # literal characters (e.g. re.sub('<', '<', doc)), making them no-ops —
        # and re.sub('"', '""', doc) doubled every quote. Restored the intended
        # entity unescaping indicated by the original "misc HTML symbols" comments.
        doc = re.sub('&nbsp;', ' ', doc)   # non-breaking space
        doc = re.sub('&lt;', '<', doc)     # other HTML entities
        doc = re.sub('&gt;', '>', doc)
        doc = re.sub('&amp;', '&', doc)
        doc = re.sub('&quot;', '"', doc)
        # NOTE(review): unescaped '.' in the URL/email patterns matches any
        # character — kept as-is, confirm intent before tightening.
        doc = re.sub(r'(http\S+[^가-힣])|([a-zA-Z]+.\S+.\S+[^가-힣])', r' [URL] ', doc)  # URL tag
        doc = re.sub(r'(\[image#0\d\])', r' [IMAGE] ', doc)  # image tag
        doc = re.sub(r'([0-9a-zA-Z_]|[^\s^\w])+(@)[a-zA-Z]+.[a-zA-Z)]+', r' [EMAIL] ', doc)  # email tag
        doc = re.sub(r'#(\w+)', r' [HASHTAG] ', doc)  # hashtag
        doc = re.sub(r'@(\w+)', r' [MENTION] ', doc)  # mention
        # Round-trip through demojize/emojize to normalize emoji spacing.
        doc = emojize(demojize(doc, delimiters=(' :', ': '))).strip()
        return doc

    def _postprocess(self, doc: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Post-processing: normalize character repeats and split mixed
        word/non-word subtokens, tagging each piece 'L' or 'R'."""
        processed_doc = []
        for l_part, r_part in doc:
            ## l_part
            l_part = repeat_normalize(l_part, num_repeats=3)
            # Split a token made of exactly one word-run + one non-word-run.
            sub_l_part = re.findall(r"[\w]+|[\W]+", l_part)
            if len(sub_l_part) == 2:
                processed_doc += [(sub, 'L') for sub in sub_l_part]
            else:
                processed_doc.append((l_part, 'L'))
            ## r_part
            if r_part != '':
                r_part = repeat_normalize(r_part, num_repeats=3)
                sub_r_part = re.findall(r"[\w]+|[\W]+", r_part)
                if len(sub_r_part) == 2:
                    processed_doc += [(sub, 'R') for sub in sub_r_part]
                else:
                    processed_doc.append((r_part, 'R'))
        return processed_doc

    def tokenize(self, doc: str, media_type: str = None) -> List[Tuple[str, str]]:
        """Tokenize a document.

        Uses the noun-cohesion scores with soynlp's LTokenizer.

        doc : raw document string
        media_type : currently unused
        """
        doc = self._soy.tokenize(
            self._preprocess(doc),
            flatten=self._is_flatten,
            remove_r=self._is_remove_r)  # returns list of (L, R) tuples
        doc = self._postprocess(doc)
        return doc
return s etri_processed_data["title"] = etri_processed_data["title"].progress_apply(concat_text_with_pos) word_extractor = WordExtractor( min_frequency=100, min_cohesion_forward=0.05, min_right_branching_entropy=0.0 ) soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model' word_extractor.load(soynlp_model_fname) scores = word_extractor.word_scores() scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()} soyToken = LTokenizer(scores=scores) # soyToken.tokenize(soynlp_processed_data["title"].values[0]) soynlp_processed_data["title"] = soynlp_processed_data["title"].progress_apply(lambda x: " ".join(soyToken.tokenize(x))) token = spm.SentencePieceProcessor() token.Load("./backend/textengines/data/tokenizer_model/sentencepice.model") spm_processed_data["title"] = spm_processed_data["title"].progress_apply(lambda x: " ".join(token.EncodeAsPieces(x))) ############################################################################# td = etri_processed_data.copy() ratio_train = 0.8 ratio_val = 0.1 ratio_test = 0.1 # Produces test split. x_remaining, x_test, y_remaining, y_test = train_test_split( td["title"],
from soynlp.tokenizer import LTokenizer scores = { '날씨': 0.5, '맑다': 0.5, '흐리다': 0.5, '흐림': 0.45, '오늘': 0.4, '내일': 0.4, '대체로': 0.2, '것': 0.01 } tokenizer = LTokenizer(scores=scores) sent = '오늘의 날씨는 대체로 맑고, 내일의 날씨는 흐릴 것이다.' print(tokenizer.tokenize(sent, flatten=False)) print(tokenizer.tokenize(sent))
word_extractor = WordExtractor(min_frequency=150, min_cohesion_forward=0.05, min_right_branching_entropy=0.0) word_extractor.train(x_train) train_words = word_extractor.extract() train_score = { word: score.cohesion_forward for word, score in train_words.items() } tokenizer = LTokenizer(scores=train_score) train_list = [] cnt = 0 for sent in x_train: train_list.append([tokenizer.tokenize(sent), y_train[cnt]]) cnt += 1 word_extractor.train(x_test) test_words = word_extractor.extract() test_score = { word: score.cohesion_forward for word, score in test_words.items() } tokenizer = LTokenizer(scores=test_score) test_list = [] cnt = 0 for sent in x_test: test_list.append([tokenizer.tokenize(sent)]) cnt += 1
def Makegraph_Wordcloud_Soynlp(target):
    # Builds a word cloud for column `target` of the global `data_origin`:
    # trains a soynlp WordExtractor on the column text, tokenizes with
    # LTokenizer, counts token frequencies, removes stopwords, and saves the
    # rendered cloud as a PNG under `resultdir`.
    # Relies on globals: flag_login, data_origin, resultdir, filename_dateflag.
    try:
        if flag_login == 0 or flag_login == None or flag_login == '':
            Login()
        #elif flag_prepro == 0:
            #messagebox.showwarning('주의', '데이터 전처리 후 실행해주세요.')
            #return
        else:
            # Keep only Hangul characters; everything else becomes a space.
            data_wordcloud_soynlp = pd.DataFrame(data_origin[target], columns=['contents'])
            data_wordcloud_soynlp['contents'] = data_origin[target].apply(
                lambda x: re.sub('[^가-힣]', ' ', x))

            word_extractor = WordExtractor(
                min_frequency=10,  # TODO: make adaptive (e.g. proportional to data_origin size)
                min_cohesion_forward=0.05,
                min_right_branching_entropy=0.0)
            word_extractor.train(data_wordcloud_soynlp['contents'].values)
            words = word_extractor.extract()

            cohesion_score = {
                word: score.cohesion_forward
                for word, score in words.items()
            }

            # force : is this the right place?
            # force join words — pin domain-specific compounds so the
            # tokenizer never splits them (score 1 = maximal cohesion).
            cohesion_score['숙소제공'] = 1
            cohesion_score['교통비지급'] = 1
            cohesion_score['인센티브'] = 1
            cohesion_score['초과근무시간확대'] = 1
            cohesion_score['복지포인트'] = 1
            cohesion_score['인사우대'] = 1
            cohesion_score['근평가점'] = 1
            cohesion_score['주거이전수당'] = 1

            tokenizer = LTokenizer(scores=cohesion_score)
            # remove_r=True keeps only the L parts (content-bearing tokens).
            data_wordcloud_soynlp['tokenizer'] = data_wordcloud_soynlp[
                'contents'].apply(
                    lambda x: tokenizer.tokenize(x, remove_r=True))

            # Flatten the per-row token lists into one list.
            words = list()
            for i in data_wordcloud_soynlp['tokenizer'].values:
                for j in i:
                    words.append(j)

            count_soynlp = Counter(words)
            words_dict_soynlp = dict(count_soynlp.most_common(100))  # top-n by frequency

            # Remove stopwords read from CSV. TODO: switch to `with open`.
            csv_stopwords = pd.read_csv('stopwords.csv', encoding='cp949', skiprows=0)
            stopwords = list()
            for i in csv_stopwords.values:
                for j in i:
                    stopwords.append(j)
            for word in stopwords:
                words_dict_soynlp.pop(word, None)

            wordcloud = WordCloud(
                font_path='NanumGothic.ttf',
                width=500,
                height=500,
                background_color='white').generate_from_frequencies(
                    words_dict_soynlp)

            plt.clf()
            plt.figure(figsize=(20, 20))
            plt.imshow(wordcloud)
            plt.axis('off')
            #plt.show()
            plt.savefig(resultdir + filename_dateflag + target +
                        ' - wordcloud_soynlp.png', dpi=100)

            '''
            # 빈도그래프(temp)
            plt.clf()
            plt.style.use('ggplot')
            plt.figure(figsize = (len(list(words_dict_soynlp.keys())[:20])*0.6, 10)) # grid size 가변화
            plt.title('상위 10개 빈출단어')
            plt.bar(list(words_dict_soynlp.keys())[:20], list(words_dict_soynlp.values())[:20])
            plt.xticks(rotation = 45, ha = 'right') # x축 라벨 회전
            plt.savefig(resultdir + filename_dateflag + target + ' - wordfrequency.png', dpi = 200)
            '''

            messagebox.showinfo(
                '작업', '워드클라우드(Soynlp) 생성이 완료되었습니다.\n\nresult폴더에 결과물이 저장되었습니다.')
    except Exception as e:
        # NOTE(review): every failure is reported as "column not found",
        # which can mislead for unrelated errors — confirm before changing.
        Log(desc=e)
        messagebox.showerror('경고', str(e) + ' 열을 찾을 수 없습니다.')
word_extractor = WordExtractor(min_frequency=100, min_cohesion_forward=0.05, min_right_branching_entropy=0.0) word_extractor.train(train_data) train_words = word_extractor.extract() train_score = { word: score.cohesion_forward for word, score in train_words.items() } tokenizer = LTokenizer(scores=train_score) train_list = [] cnt = 0 for sent in train_data: train_list.append([tokenizer.tokenize(sent), train_label[cnt]]) cnt += 1 word_extractor.train(test_data) test_words = word_extractor.extract() test_score = { word: score.cohesion_forward for word, score in test_words.items() } tokenizer = LTokenizer(scores=test_score) test_list = [] cnt = 0 for sent in test_data: test_list.append([tokenizer.tokenize(sent), test_label[cnt]]) cnt += 1