import sentencepiece as spm


def tokenize_sample_test():
    from konlpy.tag import Kkma, Okt, Hannanum, Komoran

    # Sample sequences: Korean Q&A text mixed with embedded code, markup and URLs.
    seqs = [
        '웹 게시글 페이징 하려고 알아본건데. 알려주실 수 있을까요? ``` <게시글 검색 쿼리> String sql = "SELECT * from ( "; sql +=" SELECT ',
        ' '.join(''''' <script type=""text/$$$""> (function () { window.item_list = {{item_list|safe}}; })(); $(""#search"").autocomplete({ source: window.item_list }); </script> '''.split()),
        '[1]: https://res.cloudinary.com/eightcruz/image/upload/v1503145822/aniod28b6vzmc0jpebze.png"',
    ]

    for i, seq in enumerate(seqs):
        print(f'# test {i}')

        # KoNLPy morphological analyzers
        tokenizer = Okt()
        tokens = tokenizer.morphs(seq)
        print('Okt:', len(tokens), tokens)

        tokenizer = Kkma()
        tokens = tokenizer.morphs(seq)
        print('Kkma:', len(tokens), tokens)

        tokenizer = Hannanum()
        tokens = tokenizer.morphs(seq)
        print('Hannanum:', len(tokens), tokens)

        # tokenizer = Komoran()
        # tokens = tokenizer.morphs(seq)
        # print('Komoran:', len(tokens), tokens)

        # SentencePiece models trained with different vocabulary sizes
        for prefix, label in [('vocab_2500', 'SP_2.5k'),
                              ('vocab_5000', 'SP_5k'),
                              ('vocab_10000', 'SP_10k'),
                              ('vocab_20000', 'SP_20k')]:
            tokenizer = spm.SentencePieceProcessor()
            tokenizer.Load('{}.model'.format(prefix))
            tokens = tokenizer.EncodeAsPieces(seq)
            print('{}:'.format(label), len(tokens), tokens)

        print('')
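

# NOTE: hypothetical sketch, not the original implementation. The test above and the
# Tokenizer class below assume `vocab_*.model` / `vocab_*.vocab` files and a
# `make_tokenizer` helper exist; a helper like this could produce them with the
# SentencePiece trainer. The corpus file name and training options are assumptions.
def make_tokenizer(input_file, vocab_size=5000):
    """Train a SentencePiece model named vocab_<size>.{model,vocab} from a raw text corpus."""
    spm.SentencePieceTrainer.Train(
        '--input={} --model_prefix=vocab_{} --vocab_size={} '
        '--user_defined_symbols=[PAD],[CLS],[SEP],[MASK]'.format(
            input_file, vocab_size, vocab_size)
    )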
class Tokenizer:
    def __init__(self, tokenizer_name, prefix=None):
        self.tokenizer = None
        self._load_tokenizer(tokenizer_name, prefix)
        if not isinstance(self.tokenizer, spm.SentencePieceProcessor):
            # Morpheme-based tokenizer: load the fixed vocabulary and the
            # word lists used during pre-processing.
            with open('vocabulary.txt', 'r', encoding='utf-8') as f:
                self.vocabulary = f.read().splitlines()
            self.except_words = ['http', '![']
            self.replace_words = [
                'javascript', 'java', 'android', 'org', 'python'
            ]

    def _load_tokenizer(self, tokenizer_name, prefix='vocab_5000'):
        if tokenizer_name == 'sentencepiece':
            if prefix is None:
                prefix = 'vocab_5000'
            # Train the SentencePiece model for this prefix
            # (vocab size is taken from the prefix, e.g. 'vocab_5000' -> 5000).
            vocab_size = int(prefix.split('_')[-1])
            make_tokenizer('train.txt', vocab_size=vocab_size)
            self.tokenizer = spm.SentencePieceProcessor()
            self.tokenizer.Load('{}.model'.format(prefix))
        else:
            from konlpy.tag import Okt
            self.tokenizer = Okt()

    def _preprocessing(self, x):
        f_x = []
        for w in x.split():
            w = w.lower()
            for ew in self.except_words:
                if ew in w:
                    # Drop words containing excluded patterns (URLs, image markup).
                    break
            else:
                for rw in self.replace_words:
                    if rw in w:
                        # Collapse words containing a known keyword to the keyword itself.
                        f_x.append(rw)
                        break
                else:
                    f_x.append(w)
        return ' '.join(f_x)

    def tokenize(self, x):
        if isinstance(self.tokenizer, spm.SentencePieceProcessor):
            x = self.tokenizer.EncodeAsPieces(x)
        else:
            x = self._preprocessing(x)
            x = self.tokenizer.morphs(x)
            # Map out-of-vocabulary morphemes to the [UNK] token.
            for i, w in enumerate(x):
                if w not in self.vocabulary:
                    x[i] = '[UNK]'
        return x

    def tokens_to_ids(self, tokens):
        if not isinstance(tokens, list):
            tokens = tokens.split()
        ids = list(map(self.token_to_id, tokens))
        return ids

    def token_to_id(self, token):
        if isinstance(self.tokenizer, spm.SentencePieceProcessor):
            token_id = self.tokenizer.PieceToId(token)
        else:
            if token in self.vocabulary:
                token_id = self.vocabulary.index(token)
            else:
                token_id = self.vocabulary.index('[UNK]')
        return token_id

    def get_tokens(self, vocab_prefix=None, for_masking=True):
        if isinstance(self.tokenizer, spm.SentencePieceProcessor):
            with open('{}.vocab'.format(vocab_prefix), 'r', encoding='utf-8') as f:
                tokens = [doc.strip().split("\t")[0] for doc in f]
        else:
            tokens = self.vocabulary[:]
        if for_masking:
            # Special tokens are never used as masking candidates.
            for special in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']:
                tokens.remove(special)
        return tokens
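

# Minimal usage sketch (assumptions: 'train.txt', 'vocabulary.txt' and the
# 'vocab_5000.model' files exist with the names used above; the sample sentence
# is taken from tokenize_sample_test).
if __name__ == '__main__':
    sample = '웹 게시글 페이징 하려고 알아본건데. 알려주실 수 있을까요?'

    sp_tokenizer = Tokenizer('sentencepiece', prefix='vocab_5000')
    pieces = sp_tokenizer.tokenize(sample)
    print(pieces, sp_tokenizer.tokens_to_ids(pieces))

    okt_tokenizer = Tokenizer('okt')  # any name other than 'sentencepiece' falls back to Okt morphs
    morphs = okt_tokenizer.tokenize(sample)
    print(morphs, okt_tokenizer.tokens_to_ids(morphs))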