def __init__(self):
    # load noun cohesion score
    with open('utils/words.p', 'rb') as rf:
        words = pickle.load(rf)
    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
    cohesion_score = {
        k: v for k, v in sorted(cohesion_score.items(), key=lambda item: item[1], reverse=True)
        if v > 0
    }

    with open('utils/nouns.p', 'rb') as rf:
        nouns = pickle.load(rf)
    noun_score = {noun: score.score for noun, score in nouns.items()}
    noun_cohesion_score = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_score.items()
    }
    self._noun_cohesion_score = {
        k: v for k, v in sorted(noun_cohesion_score.items(), key=lambda item: item[1], reverse=True)
        if v > 0
    }

    self._soy = LTokenizer(scores=self._noun_cohesion_score)
    self._is_flatten = False  # no_flatten
    self._is_remove_r = False  # no_remove
    self._emo = get_emoji_regexp()  # re compiled
def tokenizer_test():
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    regex_tokenizer = RegexTokenizer()
    if not (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')
            == ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']):
        raise ValueError("regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}".format(
            regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')))

    ltokenizer = LTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)
            == ['데이터', '는', '데이터센터', '의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)))

    maxscore_tokenizer = MaxScoreTokenizer({'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
    if not (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    print('all tokenizer tests passed\n\n')
def tokenize(self, text):
    '''
    A method to tokenize input text.

    Attributes
    ----------
    text : str
        | An input text to be tokenized

    Output
    ------
    tokens : list
        | List of tokens (str) that constitute the input text
    '''
    if self.pre_trained == True:
        return self.analyzer.morphs(text)
    else:
        if not self.word_score:
            print('An unsupervised KoreanTokenizer should be trained first, before tokenizing.')
            return

        self.tokenizer = LTokenizer(scores=self.word_score)
        result = self.tokenizer.tokenize(text)
        return result
def tokenize(self, text, **kwargs):
    '''
    A method to tokenize the input text.

    Attributes
    ----------
    text : str
        | An input text in str type
    **kwargs
        | Keyword arguments for the LTokenizer.tokenize() method
          (see soynlp.tokenizer.LTokenizer.tokenize)
    '''
    if 'sentence' in kwargs.keys():
        del kwargs['sentence']
        print("WARNING: 'sentence' argument is ignored; word_tokenizer tokenizes 'text' argument only.")

    if not self.word_score:
        print('KoreanTokenizer should be trained first, before tokenizing.')
        return

    self.tokenizer = LTokenizer(scores=self.word_score)
    result = self.tokenizer.tokenize(text, **kwargs)
    return result
def data_tokenize(news_title, tdm_vocab):
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_score)

    cluster_data = []
    bert_null_list = []

    for title in news_title:
        title = test(title)
        sent = tokenizer.tokenize(title, flatten=False)
        sentence = []
        for i in sent:
            if i[0] in tdm_vocab:
                sentence.append(i[0])
        cluster_data.append(sentence)

    return cluster_data
def prediction(text):
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_kor = open('pickles/kor.pickle', 'rb')
    kor = pickle.load(pickle_kor)
    pickle_eng = open('pickles/eng.pickle', 'rb')
    eng = pickle.load(pickle_eng)
    eos_idx = eng.vocab.stoi['<eos>']

    # select model and load trained model
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(text)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source = torch.LongTensor(indexed).unsqueeze(0).to(params.device)  # [1, source_len]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)       # [1, max_len]

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    for i in range(0, params.max_len):
        if next_symbol == eos_idx:
            break
        target[0][i] = next_symbol
        decoder_output, _ = model.decoder(target, source, encoder_output)  # [1, target length, output dim]
        prob = decoder_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    #eos_idx = torch.where(target[0] == eos_idx)[0][0]
    #eos_idx = eos_idx.item()
    eos_index = 34
    print(eos_idx)
    target = target[0][:eos_idx].unsqueeze(0)

    # translation_tensor = [target length] filled with word indices
    target, attention_map = model(source, target)
    target = target.squeeze(0).max(dim=-1)[1]

    reply_token = [eng.vocab.itos[token] for token in target if token != 3]
    print(reply_token)
    #translation = translated_token[:translated_token.index('<eos>')]
    #translation = ''.join(translation)

    reply = ' '.join(reply_token)
    #print(reply)
    #display_attention(tokenized, reply_token, attention_map[4].squeeze(0)[:-1])
    return reply
def _extracte(self) -> None:
    self.extractor = WordExtractor()
    self.extractor.train(self.corpus)
    self.words = self.extractor.extract()
    self.cohesion_score = {
        word: score.cohesion_forward for word, score in self.words.items()
    }
    self.tokenizer = LTokenizer(scores=self.cohesion_score)
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 10 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue

            if is_all_nng(mcab.pos(word)):
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue

            if len(soy_tokens) > 1:
                continue

            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))

            words = re.findall(' '.join(tokens), sentences)
            if len(words) < (cnt * 0.05):
                # if the frequency of the morpheme-split form is less than 5% of the frequency
                # of the unsplit word, treat the split as a morpheme-segmentation error
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.writelines(dic_line + '\n')
                f3.writelines("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n')
def predict(config):
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_kor = open('pickles/kor.pickle', 'rb')
    kor = pickle.load(pickle_kor)

    pickle_eng = open('pickles/eng.pickle', 'rb')
    eng = pickle.load(pickle_eng)

    # select model and load trained model
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    input = clean_text(config.input)

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(input)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source = torch.LongTensor(indexed).unsqueeze(0).to(params.device)  # [1, source length]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    for i in range(0, params.max_len):
        target[0][i] = next_symbol
        dec_output = model.decoder(target, source, encoder_output)
        # dec_output = [1, target length, output dim]
        prob = dec_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    # translation_tensor = [target length] filled with word indices
    target = model(source, target)
    target = torch.argmax(target.squeeze(0), -1)
    # target = target.squeeze(0).max(dim=-1, keepdim=False)

    translation = [eng.vocab.itos[token] for token in target][1:]
    translation = ' '.join(translation)

    print(f'kor> {config.input}')
    print(f'eng> {translation.capitalize()}')
def __init__(self, model_path: str = None):
    self.word_extractor = WordExtractor(min_frequency=5,
                                        min_cohesion_forward=0.05,
                                        min_right_branching_entropy=0.0)
    self.unk = 0
    self.pad = 1
    self.sos = 2
    self.eos = 3

    if model_path:
        with open(model_path, 'rb') as readFile:
            self.cohesion_score = dill.load(readFile)
    else:
        self.cohesion_score = {}

    self.tokenizer = LTokenizer(scores=self.cohesion_score)
    self.tok_to_id, self.id_to_tok = self._build_dict()
def soynlp_tokenizer(corpus):
    from soynlp.tokenizer import LTokenizer
    from soynlp.word import WordExtractor
    from soynlp.noun import LRNounExtractor_v2

    # word extractor
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(corpus)
    words = word_extractor.extract()
    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(corpus)  # list of str like
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    combined_scores = {noun: score + cohesion_score.get(noun, 0)
                       for noun, score in noun_scores.items()}
    combined_scores.update({subword: cohesion for subword, cohesion in cohesion_score.items()
                            if not (subword in combined_scores)})

    tokenizer = LTokenizer(scores=combined_scores)
    return tokenizer
def pmi_test(corpus_path):
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split(),
        dynamic_weight=False,
        verbose=True)

    pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

    for pair, pmi_value in sorted(pmi_dok.items(), key=lambda x: -x[1])[100:110]:
        pair_ = (idx2vocab[pair[0]], idx2vocab[pair[1]])
        print('pmi {} = {:.3f}'.format(pair_, pmi_value))

    print('computed PMI')
def getTokenizer(self, contents):
    corpus = SentiCorpus(contents, iter_sent=True)
    word_extractor = WordExtractor(corpus)
    word_extractor.train(corpus)
    words_scores = word_extractor.extract()
    scores = {w: s.cohesion_forward for w, s in words_scores.items()}
    return LTokenizer(scores=scores)
class LTokenizerKorean(SpecialTokenizer):  # stem(L-part)-centered tokenizer
    def __init__(self, scores=None):
        from soynlp.tokenizer import LTokenizer
        # 'scores' can be passed in as a preference; if omitted, cohesion scores are computed and used instead
        self.inst = LTokenizer(scores=scores)
        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        tokens = self.inst.tokenize(args[0])
        return tokens
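# Usage sketch for the wrapper above (illustrative, not from the original source): the
# score values below are made up, so the exact split depends on the scores you pass in.
if __name__ == '__main__':
    demo_tokenizer = LTokenizerKorean(scores={'데이터': 0.5, '데이터센터': 0.6})
    print(demo_tokenizer('데이터센터의 데이터'))  # expected: ['데이터센터', '의', '데이터']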
def select_tokenizer(model):
    if model == Okt:
        tokenizer = Okt()
        tokenized = tr['document'].apply(tokenizer.morphs).tolist()
    if model == LTokenizer:
        tokenizer = LTokenizer()
        # LTokenizer has no morphs() method; use tokenize() instead
        tokenized = tr['document'].apply(tokenizer.tokenize).tolist()
    return tokenized
class LTokenizerKorean(SpecialTokenizer):
    def __init__(self, scores=None):
        from soynlp.tokenizer import LTokenizer
        self.inst = LTokenizer(scores=scores)
        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        tokens = self.inst.tokenize(args[0])
        return tokens
def soy_tokenize(model_fname, input_sentence):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()

    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A word scores highly when (1) its characters are tightly connected and frequently appear together, and
    # (2) various particles, endings, or other words appear to its right, i.e. its right branching entropy is high.
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    tokenizer = LTokenizer(scores=scores)
    tokens = tokenizer.tokenize(input_sentence)
    tokenized_sent = ' '.join(tokens)

    return tokenized_sent
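# Worked illustration of the score used above (the numbers are made up): a word with
# cohesion_forward = 0.4 and right_branching_entropy = 1.5 gets 0.4 * exp(1.5) ≈ 1.79,
# so words followed by many different characters are boosted relative to words that
# are always followed by the same character.
import math
assert abs(0.4 * math.exp(1.5) - 1.79) < 0.01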
def predict_cnn(config):
    # load tokenizer and torchtext Field
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_vocab = open('pickles/text.pickle', 'rb')
    text = pickle.load(pickle_vocab)

    model = CNN(config)
    model.load_state_dict(torch.load(config.save_model))
    model.to(device)
    model.eval()

    tokenized = tokenizer.tokenize(config.input)
    min_len = config.filter_sizes[-1]

    # if the user's input sentence is shorter than the largest filter size, add pad tokens to the input sentence
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))

    indexed = [text.vocab.stoi[token] for token in tokenized]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)

    prediction = torch.sigmoid(model(tensor, length_tensor))
    label = torch.round(prediction)

    if label == 1:
        label = 'Positive'
    else:
        label = 'Negative'

    sentiment_percent = prediction.item()
    print(f'[in] >> {config.input}')
    print(f'[out] >> {sentiment_percent*100:.2f} % : {label}')
def predict_sequential(config):
    # load tokenizer and torchtext Field
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_vocab = open('pickles/text.pickle', 'rb')
    text = pickle.load(pickle_vocab)
    pad_idx = text.vocab.stoi[text.pad_token]

    model_type = {
        'vanilla_rnn': RNN(config, pad_idx),
        'bidirectional_lstm': BidirectionalLSTM(config, pad_idx),
    }

    # select model and load trained model
    model = model_type[config.model]
    model.load_state_dict(torch.load(config.save_model))
    model.eval()

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(config.input)
    indexed = [text.vocab.stoi[token] for token in tokenized]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)  # [input length]
    tensor = tensor.unsqueeze(1)  # [input length, 1] for adding batch dimension
    length_tensor = torch.LongTensor(length)

    prediction = torch.sigmoid(model(tensor, length_tensor))
    label = torch.round(prediction)

    if label == 1:
        label = 'Positive'
    else:
        label = 'Negative'

    sentiment_percent = prediction.item()
    print(f'[in] >> {config.input}')
    print(f'[out] >> {sentiment_percent*100:.2f} % : {label}')
def soy_tokenizer(ext_type='noun'):
    # load the pre-trained extractor results
    with open(r'.\Model\Extractor\nouns.bin', 'rb') as f:
        nouns = pickle.load(f)
    with open(r'.\Model\Extractor\words.bin', 'rb') as f:
        words = pickle.load(f)

    noun_scores = {noun: score.score for noun, score in nouns.items()}
    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
    combined_scores = {noun: score + cohesion_score.get(noun, 0) for noun, score in noun_scores.items()}
    combined_scores.update(
        {subword: cohesion for subword, cohesion in cohesion_score.items() if not (subword in combined_scores)}
    )

    if ext_type == 'noun':
        return LTokenizer(scores=noun_scores)
    elif ext_type == 'word':
        return LTokenizer(scores=cohesion_score)
    elif ext_type == 'comb':
        return LTokenizer(scores=combined_scores)
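# Usage sketch for soy_tokenizer() above (assumes the pickled nouns.bin/words.bin files
# exist at the paths used in the function); the input sentence is illustrative only.
if __name__ == '__main__':
    comb_tokenizer = soy_tokenizer(ext_type='comb')
    print(comb_tokenizer.tokenize('데이터센터의 데이터'))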
def content_to_token(text_file_name):
    print("opening file " + text_file_name)
    with open(text_file_name, 'r', encoding="utf-8") as f:
        lines = f.read().splitlines()

    text = []
    for line in lines:
        line = re.sub(r"[\[\]<>~]", ' ', line)
        line = re.sub(r"['~]", ' ', line)
        line = re.sub(r'"', ' ', line)
        line = re.sub('\\W', ' ', line)
        text.append(line)

    ltokenizer = LTokenizer(scores=scores_dictionary)

    print("making list of words")
    words = []
    for sent in text:
        conclude_sent = []
        # flatten=False makes the tokenizer return [L part (noun), R part (particle)] pairs
        pre_list = ltokenizer.tokenize(sent, flatten=False)
        for LR_list in pre_list:
            word = LR_list[0]
            if word in word_dict:
                word = word_dict[word]
            if word not in exception_list:
                conclude_sent.append(word)
        words.append(conclude_sent)

    token_file_name = text_file_name[:-4] + '.csv'
    f = open(token_file_name, 'w', newline="")
    wr = csv.writer(f)
    for word in words:
        wr.writerow(word)
    f.close()
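# Small illustration of flatten=False (the score is made up, separate from the
# scores_dictionary above): each eojeol comes back as an (L, R) pair instead of a flat
# token list, which is why the loop above keeps only LR_list[0], the L part.
if __name__ == '__main__':
    demo = LTokenizer(scores={'단어': 0.5})
    print(demo.tokenize('단어들을 본다', flatten=False))
    # e.g. [('단어', '들을'), ('본다', '')] (pair type may vary by soynlp version)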
def pad_sentence(dataframe, min_len):
    """
    To use a CNN, every input has to be at least as long as the largest filter size.
    If an input is shorter than the largest CNN filter size, pad it using this method.
    Args:
        dataframe: (DataFrame) dataframe used to train and validate the model
        min_len: (integer) the largest CNN filter size, used as the minimum input length

    Returns:
    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    for i, row in dataframe.iterrows():
        tokenized = tokenizer.tokenize(row.document)
        if len(tokenized) < min_len:
            tokenized += ['<pad>'] * (min_len - len(tokenized))
        padded_sent = ' '.join(tokenized)
        dataframe.at[i, 'document'] = padded_sent

    return dataframe
def soy_tokenize(corpus_fname, model_fname, output_fname):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()

    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A word scores highly when (1) its characters are tightly connected and frequently appear together, and
    # (2) various particles, endings, or other words appear to its right, i.e. its right branching entropy is high.
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            tokens = tokenizer.tokenize(normalized_sent)
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
def build_vocab(config):
    """
    Build a vocabulary using the scores obtained above.
    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)
    # Among the available tokenizers, LTokenizer segments words by cohesion score.
    # A Korean eojeol is viewed as an L part (noun/verb/adjective/adverb) plus an R part (particles etc.),
    # and the scores are attached to the semantically central L part.

    # A Field tokenizes words and converts them into tensors.
    # See https://torchtext.readthedocs.io/en/latest/data.html for the various Field parameters.
    kor = ttd.Field(tokenize=tokenizer.tokenize,
                    lower=True,
                    batch_first=True)

    # English is tokenized with spacy; every sequence starts with <sos> and ends with <eos>.
    eng = ttd.Field(tokenize='spacy',
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True,
                    batch_first=True)

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train.csv')
    train_data = pd.read_csv(train_file, encoding='utf-8')
    train_data = convert_to_dataset(train_data, kor, eng)

    print(f'Build vocabulary using torchtext . . .')

    # Build the Korean vocabulary from Korean tokens and the English vocabulary from English tokens.
    kor.build_vocab(train_data, max_size=config.kor_vocab)
    eng.build_vocab(train_data, max_size=config.eng_vocab)

    # print the number of unique tokens
    print(f'Unique tokens in Korean vocabulary: {len(kor.vocab)}')
    print(f'Unique tokens in English vocabulary: {len(eng.vocab)}')

    # print the most frequently used Korean/English words
    print(f'Most commonly used Korean words are as follows:')
    print(kor.vocab.freqs.most_common(20))
    print(f'Most commonly used English words are as follows:')
    print(eng.vocab.freqs.most_common(20))

    # pickle the resulting Korean/English vocab objects
    with open('pickles/kor.pickle', 'wb') as kor_file:
        pickle.dump(kor, kor_file)

    with open('pickles/eng.pickle', 'wb') as eng_file:
        pickle.dump(eng, eng_file)
def data_tokenize(news_title):
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {word: score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_score)

    return tokenizer
def build_vocab(config):
    """
    Build the vocabulary used to convert input sentences into word indices, using the soynlp and spacy tokenizers
    Args:
        config: configuration containing various options

    Returns:
    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    # include lengths of the source sentences to use pack pad sequence
    kor = ttd.Field(tokenize=tokenizer.tokenize,
                    lower=True,
                    include_lengths=True)

    eng = ttd.Field(tokenize='spacy',
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True)

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train.csv')
    train_data = pd.read_csv(train_file, encoding='utf-8')
    train_data = convert_to_dataset(train_data, kor, eng)

    print(f'Build vocabulary using torchtext . . .')

    kor.build_vocab(train_data, max_size=config.kor_vocab)
    eng.build_vocab(train_data, max_size=config.eng_vocab)

    print(f'Unique tokens in Korean vocabulary: {len(kor.vocab)}')
    print(f'Unique tokens in English vocabulary: {len(eng.vocab)}')

    print(f'Most commonly used Korean words are as follows:')
    print(kor.vocab.freqs.most_common(20))

    print(f'Most commonly used English words are as follows:')
    print(eng.vocab.freqs.most_common(20))

    with open('pickles/kor.pickle', 'wb') as kor_file:
        pickle.dump(kor, kor_file)

    with open('pickles/eng.pickle', 'wb') as eng_file:
        pickle.dump(eng, eng_file)
def _get_tokenizer(self, df):
    """
    Generate a tokenizer by extracting words
    Args:
        df: data corpus of one language

    Returns:
        tokenizer
    """
    word_extractor = WordExtractor()
    word_extractor.train(df)
    words = word_extractor.extract()
    print(f'length of words is {len(words)}')

    cohesion_scores = {word: score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_scores)

    return tokenizer
def pmi_test(corpus_path):
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x: x.split(),
        dynamic_weight=False,
        verbose=True)

    x_pmi, x, y = pmi(x, min_pmi=0, alpha=0.0001)

    rows, cols = x_pmi.nonzero()
    data = x_pmi.data

    print('row shape = {}'.format(rows.shape))
    print('col shape = {}'.format(cols.shape))
    print('data shape = {}'.format(data.shape))

    for indpt in data.argsort()[-150:-100]:
        i = rows[indpt]
        j = cols[indpt]
        pair = (idx2vocab[i], idx2vocab[j])
        value = data[indpt]
        print('pmi {} = {:.3f}'.format(pair, value))

    print('computed pmi')
def build_vocab(config):
    """
    Build vocab used to convert Korean input sentences into word indices using the soynlp tokenizer
    Args:
        config: configuration object containing various options

    Returns:
    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    # To use packed padded sequences, tell the model how long the actual sequences are by 'include_lengths=True'
    text = ttd.Field(tokenize=tokenizer.tokenize, include_lengths=True)
    label = ttd.LabelField(dtype=torch.float)

    data_dir = Path().cwd() / 'data'
    train_txt = os.path.join(data_dir, 'train.txt')
    train_data = pd.read_csv(train_txt, sep='\t')
    train_data, valid_data = train_test_split(train_data, test_size=0.3, random_state=32)
    train_data = convert_to_dataset(train_data, text, label)

    print(f'Building vocabulary using torchtext . . .')
    text.build_vocab(train_data, max_size=config.vocab_size)
    label.build_vocab(train_data)

    print(f'Unique tokens in TEXT vocabulary: {len(text.vocab)}')
    print(f'Unique tokens in LABEL vocabulary: {len(label.vocab)}')

    print(f'Most commonly used words are as follows:')
    print(text.vocab.freqs.most_common(20))

    file_text = open('pickles/text.pickle', 'wb')
    pickle.dump(text, file_text)

    file_label = open('pickles/label.pickle', 'wb')
    pickle.dump(label, file_label)
def train_extractor(begin_d=None, end_d=None, sections: list = None, base_dir='./out', tokenizer=None):
    _, sentences, corpus_class = make_corpus(begin_d=begin_d, end_d=end_d,
                                             sections=sections, base_dir=base_dir)
    # nouns = get_noun_words(begin_d='20201101', end_d='20201130')

    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)  # list of str like
    noun_score = dict([(key, val.score) for key, val in nouns.items()])

    if tokenizer is None:
        tokenize = lambda x: x.strip().split()
    elif tokenizer == 'max_score_tokenizer':
        tokenize = MaxScoreTokenizer(noun_score)
    elif tokenizer == 'ltokenizer':
        tokenize = LTokenizer(noun_score)
    else:
        raise NotImplementedError

    if sections is not None and len(sections) >= 1:
        min_tf = 10
        min_df = 2
    else:
        min_tf = 20
        min_df = 2

    keyword_extractor = CorpusbasedKeywordExtractor(
        min_tf=min_tf,
        min_df=min_df,
        # tokenize=lambda x: x.strip().split(),
        tokenize=tokenize,
        verbose=True)
    # docs: list of str like
    keyword_extractor.train(sentences)

    return keyword_extractor, nouns, corpus_class
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
              for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 100 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            if 'ㄱ' <= jong <= 'ㅎ':
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
            else:
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
            f2.writelines(dic_line + '\n')
            f3.writelines("{}\t{}\t{}".format(word, ' '.join(tokens), cnt) + '\n')