Example #1
def __init__(self, bpe_path=None):
    bpe_path = Path(bpe_path)
    assert bpe_path.exists(), f'BPE json path {str(bpe_path)} does not exist'
    tokenizer = Tokenizer.from_file(str(bpe_path))
    tokenizer.post_processor = ByteLevel(trim_offsets=True)
    self.tokenizer = tokenizer
    self.vocab_size = tokenizer.get_vocab_size()
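The snippet above only wires an existing BPE vocabulary into a tokenizer. A minimal stand-alone sketch of the same setup, assuming a hypothetical bpe.json written by a previous training run:

from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.processors import ByteLevel

bpe_path = Path('bpe.json')  # hypothetical file, not part of the original example
assert bpe_path.exists(), f'BPE json path {str(bpe_path)} does not exist'

tokenizer = Tokenizer.from_file(str(bpe_path))
tokenizer.post_processor = ByteLevel(trim_offsets=True)

ids = tokenizer.encode('hello world').ids
print(tokenizer.decode(ids))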
Example #2
import json

from tokenizers.processors import ByteLevel


def main():
    bpe = BytePairEncoder()
    tokenizer = bpe.prepare_tokenizer()
    trainer = bpe.prepare_trainer()
    tokenizer.train(files=["data/tr_penn-ud-train.txt", "data/tr_penn-ud-dev.txt"], trainer=trainer)
    print("Trained vocab size with BPE Tokenizer: {}".format(tokenizer.get_vocab_size()))
    #tokenizer.model.save('data/BPE')

    #tokenizer.model = BPE.from_file('data/BPE/vocab.json', 'data/BPE/merges.txt')
    tokenizer.post_processor = ByteLevel(trim_offsets=True)
    tokenizer.enable_truncation(max_length=512)
    
    encoded = tokenizer.encode('Bu bir kelime oyunudur a dostlar!')
    decoded = tokenizer.decode(encoded.ids)
    print('Decoded: ', decoded)
    
    # Save the vocabulary 
    tokenizer.model.save('result/.')

    with open("result/vocab.json") as f:
        bpe_vocab = json.load(f)
    
    bpe_vocab_list = [(v, k) for k, v in bpe_vocab.items()]
    
    training_corpus, testing_corpus = DataLoader.load_data()
    training_encoding = tokenizer.encode(training_corpus)
    testing_encoding = tokenizer.encode(testing_corpus)

    train_bpe_subwords = training_encoding.tokens
    test_bpe_subwords = testing_encoding.tokens
    
    perplexity = PPEvaluator().compute_pp(2, train_bpe_subwords, test_bpe_subwords)
    print("Perplexity of BpeTokenizer: {0}".format(perplexity))
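BytePairEncoder, DataLoader and PPEvaluator are project-local helpers that are not shown here. A rough sketch of what prepare_tokenizer() and prepare_trainer() might return, using only the public tokenizers API; the vocabulary size and special tokens are assumptions, not values from the original project:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer

def prepare_tokenizer():
    # Byte-level pre-tokenization pairs with the ByteLevel post-processor set in main().
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)
    return tokenizer

def prepare_trainer():
    # Illustrative settings; the real helper may use different ones.
    return BpeTrainer(vocab_size=30000, special_tokens=["[UNK]", "[PAD]"])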
Example #3
    def test_processing(self, roberta_files):
        tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

        # Keeps original offsets
        output = tokenizer.encode("My name is John")
        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

        # Trims offsets when activated
        tokenizer.post_processor = ByteLevel(trim_offsets=True)
        output = tokenizer.encode("My name is John")
        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
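The trimmed offsets can be checked directly against the input string: with trim_offsets=True each span drops the leading space that the byte-level Ġ marker represents, so the offsets slice cleanly:

text = "My name is John"
trimmed = [(0, 2), (3, 7), (8, 10), (11, 15)]
assert [text[start:end] for start, end in trimmed] == ["My", "name", "is", "John"]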
Example #4
def test_instantiate(self):
    assert ByteLevel() is not None
    assert ByteLevel(trim_offsets=True) is not None
    assert isinstance(ByteLevel(), PostProcessor)
    assert isinstance(ByteLevel(), ByteLevel)
Example #5
def test_manual_reload(self):
    byte_level = ByteLevel()
    state = json.loads(byte_level.__getstate__())
    reloaded = ByteLevel(**state)
    assert isinstance(reloaded, ByteLevel)
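Rebuilding the post-processor by hand is rarely needed; saving the whole tokenizer preserves the configured post-processor as well. A minimal sketch, assuming a tokenizer configured as in the earlier examples:

tokenizer.post_processor = ByteLevel(trim_offsets=True)
tokenizer.save("tokenizer.json")

reloaded = Tokenizer.from_file("tokenizer.json")
# The reloaded tokenizer keeps the ByteLevel post-processor, so offsets match the original.
assert reloaded.encode("My name is John").offsets == tokenizer.encode("My name is John").offsets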