import os
import tempfile

from tokenizers import CharBPETokenizer

# NUL_token, PAD_token, BOS_token and UNK_token are the module's special-token
# strings (they receive ids 0-3 during training) and are assumed to be defined
# elsewhere in the original module.


class HuggingFaceTokenizer:
    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        if self.tokenizer is not None:
            return

        # train() consumes file paths, so write the lowercased corpus to a
        # temporary file first
        tmp_file = tempfile.NamedTemporaryFile()

        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())

        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )
        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]

        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # NOTE: custom special tokens are not always stripped even when
        # skip_special_tokens=True; dropping ids <= 3 (the special tokens)
        # before decoding is a possible workaround:
        # tokens = [token for token in tokens if token > 3]
        text = self.tokenizer.decode(
            tokens,
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences
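
A minimal usage sketch for the class above, assuming the imports and
special-token constants noted at the top; the corpus, cache directory and
test sentence are illustrative only:

texts = ["the quick brown fox jumps", "over the lazy dog"]

tok = HuggingFaceTokenizer(cache_dir="./bpe-cache", max_length=16, vocab_size=400)
tok.build(texts)                  # trains a BPE model or reuses the cached one
ids = tok.encode("the lazy fox")  # lowercased token ids, cut to max_length
print(ids)
print(tok.decode(ids))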

Example 2

from tokenizers import CharBPETokenizer

def test():
    """Test trained tokenizer"""

    tokenizer = CharBPETokenizer('./thyme-tokenizer-vocab.json',
                                 './thyme-tokenizer-merges.txt')

    vocab = tokenizer.get_vocab()
    print('vocab size:', len(vocab))

    encoded = tokenizer.encode('patient dr. who diagnosed with brain abc')
    encoded.pad(15)

    print('encoded:', encoded.ids)
    print('decoded:', tokenizer.decode(encoded.ids))

    print(encoded.tokens)
    print(encoded.attention_mask)
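
The per-Encoding pad(15) above can also be requested once on the tokenizer; a
short sketch assuming the BaseTokenizer batch helpers (enable_padding and
encode_batch), with an illustrative second sentence:

tokenizer = CharBPETokenizer('./thyme-tokenizer-vocab.json',
                             './thyme-tokenizer-merges.txt')
tokenizer.enable_padding()   # pad each batch to its longest sequence
batch = tokenizer.encode_batch(['patient dr. who diagnosed with brain abc',
                                'patient seen today'])
for encoded in batch:
    print(encoded.ids, encoded.attention_mask)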

Example 3

from tokenizers import CharBPETokenizer

# PAD, BOS and EOS are the module's special-token strings and are assumed to
# be defined elsewhere in the original module.

class BPETokenizer:
    def __init__(self, text_list, vocab_size, lazy=False):
        if not lazy:
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(text_list,
                                 vocab_size=vocab_size,
                                 special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        encodes = self.tokenizer.encode(text)
        return encodes.ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
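
A round-trip sketch for the BPETokenizer wrapper above; `./corpus.txt` is a
placeholder for the training text files (this tokenizers API trains from file
paths), PAD/BOS/EOS are the module's special-token strings, and `os` is
assumed to be imported:

train_files = ['./corpus.txt']             # placeholder path
bpe = BPETokenizer(train_files, vocab_size=1000)
ids = bpe.encode('hello world')
print(ids, bpe.decode(ids))

os.makedirs('./bpe-model', exist_ok=True)  # save() expects an existing directory
bpe.save('./bpe-model', 'demo')            # writes demo-vocab.json / demo-merges.txt
bpe2 = BPETokenizer.load('./bpe-model/demo-vocab.json',
                         './bpe-model/demo-merges.txt')
print(len(bpe2))                           # vocabulary size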

Example 4

from tokenizers import CharBPETokenizer

    # `openai_files` is assumed to be a pytest fixture mapping "vocab" and
    # "merges" to a saved vocab.json / merges.txt pair.
    def test_decoding(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"],
                                     openai_files["merges"],
                                     lowercase=True)
        decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
        assert decoded == "my name is john"

# The next snippet is truncated at the top: it begins midway through a
# CharBPETokenizer train(...) call inside an enclosing function whose header
# is missing. The missing head is sketched here; `main` and `corpus_files`
# are placeholders, and vocab_size=1000 is only inferred from the
# './BPE-1000' output directory used below.
def main():
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        corpus_files,  # placeholder: list of training text-file paths (not shown in the snippet)
        vocab_size=1000,  # assumed from the output directory name
        special_tokens=[
            "<blank>",
            "<bos>",
            "<unk>",
        ],
    )

    # os.makedirs('./BPE-1000', exist_ok=True)
    tokenizer.save('./BPE-1000', '')  # writes ./BPE-1000/-vocab.json and -merges.txt

    tokenizer = CharBPETokenizer('./BPE-1000/-vocab.json',
                                 './BPE-1000/-merges.txt')
    # with open('.test.pkl', 'w') as f:
    #     pickle.dump(tokenizer, f)

    # NOTE: the HuggingFaceTokenizer class shown earlier takes a required
    # cache_dir argument (and expects a cached vocab/merges pair); this call
    # assumes a variant with default arguments.
    tokenizer = HuggingFaceTokenizer()
    print(
        tokenizer.encode(
            'might have a solution it might take a long time nobody'))

    print(
        tokenizer.decode(
            tokenizer.encode(
                'might have a solution it might take a long time nobody'), ))

    # transforms = torchaudio.transforms.MFCC(n_mfcc=40)
    # concat = ConcatFeature()
    # waveform = transforms(data)
    # print(waveform.shape)
    # waveform = concat(waveform)
    # print(waveform[:, -1])