def __init__(self, bpe_path = None):
    bpe_path = Path(bpe_path)
    assert bpe_path.exists(), f'BPE json path {str(bpe_path)} does not exist'

    tokenizer = Tokenizer.from_file(str(bpe_path))
    tokenizer.post_processor = ByteLevel(trim_offsets = True)

    self.tokenizer = tokenizer
    self.vocab_size = tokenizer.get_vocab_size()
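# Hedged companion sketch (not from the original source): the __init__ above
# expects a single-file tokenizer JSON, which can be produced by training a
# small byte-level BPE tokenizer and saving it. The file names and corpus path
# ("corpus.txt", "bpe.json") are placeholders, and the train(files=..., trainer=...)
# signature assumes a recent `tokenizers` release.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = ByteLevelPreTokenizer()
trainer = BpeTrainer(vocab_size=1000, special_tokens=["[UNK]"])
tokenizer.train(files=["corpus.txt"], trainer=trainer)
tokenizer.save("bpe.json")  # Tokenizer.from_file("bpe.json") then works in the __init__ above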
def main():
    tokenizer = BytePairEncoder().prepare_tokenizer()
    trainer = BytePairEncoder().prepare_trainer()
    tokenizer.train(files=["data/tr_penn-ud-train.txt", "data/tr_penn-ud-dev.txt"], trainer=trainer)
    print("Trained vocab size with BPE Tokenizer: {}".format(tokenizer.get_vocab_size()))

    #tokenizer.model.save('data/BPE')
    #tokenizer.model = BPE.from_file('data/BPE/vocab.json', 'data/BPE/merges.txt')

    tokenizer.post_processor = ByteLevel(trim_offsets=True)
    tokenizer.enable_truncation(max_length=512)

    encoded = tokenizer.encode('Bu bir kelime oyunudur a dostlar!')
    decoded = tokenizer.decode(encoded.ids)
    print('Decoded: ', decoded)

    # Save the vocabulary
    tokenizer.model.save('result/.')
    with open("result/vocab.json") as f:
        bpe_vocab = json.load(f)
    bpe_vocab_list = [(v, k) for k, v in bpe_vocab.items()]

    training_corpus, testing_corpus = DataLoader.load_data()
    training_encoding = tokenizer.encode(training_corpus)
    testing_encoding = tokenizer.encode(testing_corpus)
    train_bpe_subwords = training_encoding.tokens
    test_bpe_subwords = testing_encoding.tokens

    perplexity = PPEvaluator().compute_pp(2, train_bpe_subwords, test_bpe_subwords)
    print("Perplexity of BpeTokenizer: {0}".format(perplexity))
def test_processing(self, roberta_files):
    tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
    tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

    # Keeps original offsets
    output = tokenizer.encode("My name is John")
    assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
    assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

    # Trims offsets when activated
    tokenizer.post_processor = ByteLevel(trim_offsets=True)
    output = tokenizer.encode("My name is John")
    assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
    assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
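# Hedged illustration (not part of the test above) of what trim_offsets changes:
# slicing the input with the trimmed offsets drops the leading space from each
# token's span. Assumes a RoBERTa-style vocab.json / merges.txt pair is
# available locally and a tokenizers version exposing BPE.from_file.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
from tokenizers.processors import ByteLevel

sentence = "My name is John"
tokenizer = Tokenizer(BPE.from_file("vocab.json", "merges.txt"))
tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)
tokenizer.post_processor = ByteLevel(trim_offsets=True)

output = tokenizer.encode(sentence)
print([sentence[start:end] for start, end in output.offsets])
# expected: ['My', 'name', 'is', 'John'] (no leading whitespace in the spans)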
def test_instantiate(self):
    assert ByteLevel() is not None
    assert ByteLevel(trim_offsets=True) is not None
    assert isinstance(ByteLevel(), PostProcessor)
    assert isinstance(ByteLevel(), ByteLevel)
def test_manual_reload(self):
    byte_level = ByteLevel()
    state = json.loads(byte_level.__getstate__())
    reloaded = ByteLevel(**state)
    assert isinstance(reloaded, ByteLevel)
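# Hedged companion sketch: the same round-trip via pickle, assuming the
# processor supports pickling through __getstate__/__setstate__ as the
# manual-reload test above suggests.
import pickle

from tokenizers.processors import ByteLevel

byte_level = ByteLevel(trim_offsets=True)
restored = pickle.loads(pickle.dumps(byte_level))
assert isinstance(restored, ByteLevel)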