Example 1
def tokenize_sample_test():
    from konlpy.tag import Kkma, Okt, Hannanum, Komoran
    import sentencepiece as spm
    seqs = [
        '웹 게시글 페이징 하려고 알아본건데. 알려주실 수 있을까요? ``` <게시글 검색 쿼리> String sql = "SELECT * from ( "; sql +=" SELECT ',
        ' '.join('''
                            <script type="text/$$$">
                        (function () {
                            window.item_list = {{item_list|safe}};
                        })();
                        $("#search").autocomplete({
                            source: window.item_list
                        });
                    </script>
        '''.split()),
        '[1]: https://res.cloudinary.com/eightcruz/image/upload/v1503145822/aniod28b6vzmc0jpebze.png"'
    ]

    for i, seq in enumerate(seqs):
        print(f'# test {i}')
        tokenizer = Okt()
        tokens = tokenizer.morphs(seq)
        print('Okt:', len(tokens), tokens)

        tokenizer = Kkma()
        tokens = tokenizer.morphs(seq)
        print('Kkma:', len(tokens), tokens)

        tokenizer = Hannanum()
        tokens = tokenizer.morphs(seq)
        print('Hannanum:', len(tokens), tokens)

        # tokenizer = Komoran()
        # tokens = tokenizer.morphs(seq)
        # print('Komoran:', len(tokens), tokens)

        # The four SentencePiece models differ only in vocabulary size,
        # so load and compare them in a single loop.
        for label, prefix in [('SP_2.5k', 'vocab_2500'), ('SP_5k', 'vocab_5000'),
                              ('SP_10k', 'vocab_10000'), ('SP_20k', 'vocab_20000')]:
            tokenizer = spm.SentencePieceProcessor()
            tokenizer.Load('{}.model'.format(prefix))
            tokens = tokenizer.EncodeAsPieces(seq)
            print('{}:'.format(label), len(tokens), tokens)
        print('')
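The test above assumes four pre-trained SentencePiece models (vocab_2500.model, vocab_5000.model, vocab_10000.model, vocab_20000.model) sitting next to the script. A minimal, hedged sketch of how such models could be produced and the test run is shown below; the corpus file name train.txt is an assumption, not taken from the snippet.

import sentencepiece as spm

# Hypothetical setup: train the four subword models that the test loads.
# 'train.txt' is an assumed plain-text corpus, one sentence per line.
for size in (2500, 5000, 10000, 20000):
    spm.SentencePieceTrainer.Train(
        '--input=train.txt --model_prefix=vocab_{0} --vocab_size={0}'.format(size)
    )

tokenize_sample_test()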
Example 2
import sentencepiece as spm


class Tokenizer:
    def __init__(self, tokenizer_name, prefix=None):
        self.tokenizer = None
        self._load_tokenizer(tokenizer_name, prefix)

        if not isinstance(self.tokenizer, spm.SentencePieceProcessor):
            with open('vocabulary.txt', 'r', encoding='utf-8') as f:
                self.vocabulary = f.read().splitlines()

            self.except_words = ['http', '![']
            self.replace_words = [
                'javascript', 'java', 'android', 'org', 'python'
            ]

    def _load_tokenizer(self, tokenizer_name, prefix='vocab_5000'):
        if tokenizer_name == 'sentencepiece':
            prefix = prefix or 'vocab_5000'
            # make_tokenizer (defined elsewhere) is expected to train the
            # SentencePiece model '<prefix>.model'; the vocabulary size is
            # assumed to be encoded in the prefix, e.g. 'vocab_5000' -> 5000.
            make_tokenizer('train.txt', vocab_size=int(prefix.split('_')[-1]))
            self.tokenizer = spm.SentencePieceProcessor()
            self.tokenizer.Load('{}.model'.format(prefix))
        else:
            from konlpy.tag import Okt
            self.tokenizer = Okt()

    def _preprocessing(self, x):
        # Lowercase each word, drop any word that contains one of the
        # except_words, and collapse a word that contains a replace_word
        # down to that keyword alone.
        f_x = []
        for w in x.split():
            w = w.lower()
            for ew in self.except_words:
                if ew in w:
                    break  # word contains an excluded substring: skip it
            else:
                for rw in self.replace_words:
                    if rw in w:
                        f_x.append(rw)  # keep only the keyword itself
                        break
                else:
                    f_x.append(w)  # ordinary word: keep as-is
        return ' '.join(f_x)

    def tokenize(self, x):
        if isinstance(self.tokenizer, spm.SentencePieceProcessor):
            x = self.tokenizer.EncodeAsPieces(x)
        else:
            x = self._preprocessing(x)
            x = self.tokenizer.morphs(x)
            for i, w in enumerate(x):
                if w not in self.vocabulary:
                    x[i] = '[UNK]'
        return x

    def tokens_to_ids(self, tokens):
        if not isinstance(tokens, list):
            tokens = tokens.split()
        ids = list(map(self.token_to_id, tokens))
        return ids

    def token_to_id(self, token):
        if isinstance(self.tokenizer, spm.SentencePieceProcessor):
            token_id = self.tokenizer.PieceToId(token)
        else:
            if token in self.vocabulary:
                token_id = self.vocabulary.index(token)
            else:
                token_id = self.vocabulary.index('[UNK]')
        return token_id

    def get_tokens(self, vocab_prefix=None, for_masking=True):
        if isinstance(self.tokenizer, spm.SentencePieceProcessor):
            with open('{}.vocab'.format(vocab_prefix), 'r',
                      encoding='utf-8') as f:
                tokens = [doc.strip().split("\t")[0] for doc in f]
        else:
            tokens = self.vocabulary[:]

        if for_masking:
            # Special tokens must never be offered as masking candidates;
            # skip any that are not present in this vocabulary.
            for special in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']:
                if special in tokens:
                    tokens.remove(special)
        return tokens
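A brief usage sketch, assuming the vocab_5000 model files (and, for the Okt branch, a vocabulary.txt that includes the special tokens) already exist; the input sentence is borrowed from the first example and is illustrative only.

# Illustrative only: the SentencePiece model files are assumed to exist,
# and the constructor also calls make_tokenizer (defined elsewhere).
sp_tok = Tokenizer('sentencepiece', prefix='vocab_5000')
pieces = sp_tok.tokenize('웹 게시글 페이징 하려고 알아본건데')
print(pieces)
print(sp_tok.tokens_to_ids(pieces))

# Any other name falls back to the Okt morphological analyzer,
# which additionally needs 'vocabulary.txt' in the working directory.
okt_tok = Tokenizer('okt')
print(okt_tok.tokenize('웹 게시글 페이징 하려고 알아본건데'))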