Code Example #1
File: train.py Project: hemanths933/Transformers
def initialize_model():

    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)

    '''
    Create dataloaders
    '''
    train_dataset = SplitReshapeTrainDataset(config['complex_sentences_file'], config['simple_sentences_file'])
    train_data, val_data = torch.utils.data.random_split(train_dataset, [round(config["train_data_percentage"] * len(train_dataset)), round(config["val_data_percentage"] * len(train_dataset))])

    train_dataloader = DataLoader(train_data, batch_size=config["batch_size"], num_workers=config["num_of_workers"], pin_memory=True)
    val_dataloader = DataLoader(val_data, batch_size=config["batch_size"], num_workers=config["num_of_workers"], pin_memory=True)

    '''
    create tokenizer
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./data/english_tokenizer-vocab.json",
        "./data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )


    '''
    Create model
    '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'],
           vocab_size,
           vocab_size,
           config['src_pad_idx'],
           config['num_heads'],
           config['num_encoder_layers'],
           config['num_decoder_layers'],
           config['forward_expansion'],
           config['dropout'],
           config['max_len'],
           device)

    model.train()

    trainer = model.to(device)

    '''
    Create Optimizer
    '''
    loss_fun = nn.CrossEntropyLoss(ignore_index = config['src_pad_idx'])
    optimizer = optim.Adam(trainer.parameters(), lr = config["learning_rate"])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10)

    writer = SummaryWriter()

    return config, train_dataloader, val_dataloader, trainer, loss_fun, optimizer, writer, device, scheduler, tokenizer
Code Example #2
File: dataset.py Project: JavClaude/LSTM_LM
 def _fit_tokenizer(
     path_to_text_file: Union[str, List[str]],
     tokenizer: ByteLevelBPETokenizer,
     vocabulary_size: int,
 ) -> None:
     tokenizer.train(
         path_to_text_file,
         vocabulary_size,
         special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
     )
Code Example #3
def initialize_model():

    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)
    '''create tokenizers'''

    tokenizer = ByteLevelBPETokenizer(
        "data/english_tokenizer-vocab.json",
        "data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='[PAD]', length=config['max_len'])
    tokenizer.enable_truncation(max_length=config['max_len'])
    '''
    Create model
    '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'],
                             config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    checkpoint = torch.load(config['pretrained_model'], map_location=device)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    model = model.to(device)

    return config, model, tokenizer, device
Code Example #4
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 args,
                 file_path: str,
                 block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info(" Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [
                line for line in f.read().splitlines()
                if (len(line) > 0 and not line.isspace())
            ]

        tokenizer = ByteLevelBPETokenizer(
            f"{args['tokenizer_name']}/vocab.json",
            f"{args['tokenizer_name']}/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        tokenizer.enable_truncation(max_length=block_size)
        self.examples = [t.ids for t in tokenizer.encode_batch(lines)]
Code Example #5
def test_language_model_dataset_fit_tokenizer_should_call_the_train_method_of_bpe_tokenizer(
):
    # Given
    language_modeling_dataset = LanguageModelingDataset(1, 1)
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train = MagicMock()
    language_modeling_dataset.set_tokenizer(tokenizer)

    # When
    language_modeling_dataset._fit_tokenizer(FAKE_PATH_FOR_TEST, tokenizer,
                                             300)

    # Then
    tokenizer.train.assert_called_with(
        FAKE_PATH_FOR_TEST,
        300,
        special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
    )
Code Example #6
def create_norwegian_tokenizer():
    tokenizer = ByteLevelBPETokenizer(
        "./models/KariBERTa-tiny/vocab.json",
        "./models/KariBERTa-tiny/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    tokenizer.enable_padding()
    return tokenizer
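A brief usage sketch of the helper above (the sentences are illustrative); since enable_padding() is called without a fixed length, encode_batch pads every encoding to the longest sequence in the batch:

nor_tokenizer = create_norwegian_tokenizer()
encodings = nor_tokenizer.encode_batch(["Dette er en test.", "En kortere setning."])
print([len(e.ids) for e in encodings])  # both entries padded to the same length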
Code Example #7
File: pretrain.py Project: AlexanderFalk/nlpproject
    def pretrain_tokenization(self):
        paths = [str(x) for x in Path("handler/datadir/").glob("*-train.txt")]
        print(paths)
        tokenizer = ByteLevelBPETokenizer()

        tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

        tokenizer.save(".", "danbert-small")
Code Example #8
File: tokenizer_train.py Project: HolmiumTS/ppppplm
def main():
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=get_file(),
                    vocab_size=config.VOCAB_SIZE,
                    min_frequency=config.MIN_FREQUENCY,
                    special_tokens=config.SPECIAL_TOKENS)

    tokenizer.save_model(config.TOKENIZER_PATH)
Code Example #9
def create_token_masker(tokenizer: ByteLevelBPETokenizer):
    special_tokens = [
        START_TOKEN, PAD_TOKEN, STOP_TOKEN, UNKNOWN_TOKEN, MASK_TOKEN
    ]
    special_token_ids = {
        tokenizer.token_to_id(token)
        for token in special_tokens
    }

    def get_special_tokens_mask(token_ids: torch.Tensor):
        return [
            1 if token_id in special_token_ids else 0 for token_id in token_ids
        ]

    return get_special_tokens_mask
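A short usage sketch, assuming tokenizer was trained with the special tokens named above (the input sentence is illustrative):

get_mask = create_token_masker(tokenizer)
token_ids = tokenizer.encode("a short example").ids
print(get_mask(token_ids))  # 1 marks special tokens, 0 marks ordinary tokens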
Code Example #10
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./esperberto-vocab.json",
            './esperberto-merges.txt',
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        src_files = Path("./")
Code Example #11
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./roberta-lm/vocab.json",
            "./roberta-lm/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/montecristo/").glob("**/*.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
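As the comment above suggests, the same vocab.json/merges.txt pair can also be loaded through transformers instead of the raw tokenizers API; a minimal sketch, assuming the ./roberta-lm directory from this example:

from transformers import RobertaTokenizerFast

roberta_tokenizer = RobertaTokenizerFast.from_pretrained("./roberta-lm", model_max_length=512)
# RobertaTokenizerFast adds <s> and </s> on its own, mirroring the BertProcessing post-processor
input_ids = roberta_tokenizer("An example sentence.", truncation=True)["input_ids"]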
Code Example #12
    def __init__(self, file_path: str = None, tokenizer_path: str = None):
        tokenizer = ByteLevelBPETokenizer(
            tokenizer_path + "/vocab.json",
            tokenizer_path + "/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        with open(file_path, encoding="utf-8") as f:
            lines = f.readlines()
            lines = [
                line for line in lines
                if (len(line) > 0 and not line.isspace())
            ]
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
Code Example #13
File: train.py Project: noel0714/medicalAnalysis
def train_tok(txt_dir, tokenizer_dir):
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=txt_dir,
                    vocab_size=52_000,
                    min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    tokenizer.save_model(tokenizer_dir)
Code Example #14
    def __init__(self, evaluate=False):
        tokenizer = ByteLevelBPETokenizer(
            "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-vocab.json",
            "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []
        if evaluate:
            src_files = ["/home/zheng/sde/data/valid.txt"]
        else:
            src_files = ["/home/zheng/sde/data/test.txt"]

        for src_file in src_files:
            print(src_file)
            f = open(src_file, 'r')
            lines = f.readlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
Code Example #15
File: Bert_pretraining.py Project: amir-jafari/NLP
import os
from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

PATH = os.getcwd()
SAVE_MODEL = os.getcwd()

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="kant.txt",
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.save_model(SAVE_MODEL)
tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)

tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("For it is in reality vain to profess"))

config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
Code Example #16
from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import LineByLineTextDataset

# config = RobertaConfig(
#     vocab_size=52_000,
#     max_position_embeddings=514,
#     num_attention_heads=12,
#     num_hidden_layers=6,
#     type_vocab_size=1,
# )

tokenizer = ByteLevelBPETokenizer(
    "./models/nepali_BERT_tokenizer_L-vocab.json",
    "./models/nepali_BERT_tokenizer_L-merges.txt",
)
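# Note: LineByLineTextDataset and DataCollatorForLanguageModeling below expect a
# transformers PreTrainedTokenizer; the raw ByteLevelBPETokenizer created above would
# likely need to be wrapped (e.g. reloaded with RobertaTokenizerFast from the same
# vocab/merges files) before being passed in.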

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=
    "../Data/nepali_corpus/sanitized_data/sentences/master_sentence_list.txt",  #this path is from step 1
    block_size=128,
)

#Need this util for data back propagation
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

#Set up training arguments
Code Example #17
                               input_path_val,
                               output_path,
                               vocab_size=30_000,
                               min_freq=2,
                               max_len=256,
                               block_size=64,
                               mlm_probability=0.15,
                               num_attention_heads=6,
                               num_hidden_layers=3,
                               epochs=5,
                               batch_size=30,
                               val_batch_size=60,
                               eval_steps=50,
                               **kwargs):
 # instantiate tokenizer
 bpe_tokenizer = ByteLevelBPETokenizer()
 # train tokenizer
 _pretty_print("Training tokenizer")
 bpe_tokenizer.train([input_path, input_path_val],
                     vocab_size=vocab_size,
                     min_frequency=min_freq,
                     special_tokens=[
                         "<s>",
                         "<pad>",
                         "</s>",
                         "<unk>",
                         "<mask>",
                     ])
 # save tokenizer
 tok_path = os.path.join(output_path, "tokenizer")
 os.makedirs(tok_path, exist_ok=True)
Code Example #18
from tokenizers.implementations import ByteLevelBPETokenizer

from language_model.tokenization.trainer import ByteLevelBPETokenizerTrainer

task = ByteLevelBPETokenizerTrainer(
    source_folder_path="data/ukr/data/wiki_oscar_data/",
    tokenizer=ByteLevelBPETokenizer(),
    vocab_size=52000,
    min_frequency=5,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
Code Example #19
    with open('data/start.txt', 'r') as myfile:
        data = myfile.read()

    segmenter.segment('I am Batman i live in gotham')

# =============================================================================
# Huggingface tokenizer
# =============================================================================

if False:
    from tokenizers.implementations import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing
    from pathlib import Path

    tokenizer = ByteLevelBPETokenizer(
        "data/german_old.json",
        "data/german_old.txt",
    )

    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    tokenizer.enable_truncation(max_length=512)

    #print(tokenizer.encode(sen_out[0]))
Code Example #20
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import torch

tokenizer = ByteLevelBPETokenizer(
    "./EsperBERTo/vocab.json",
    "./EsperBERTo/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

from transformers_weighted_head.src.transformers import RobertaConfig

WEIGHT_HEADS = True

config = RobertaConfig(
    weight_heads=WEIGHT_HEADS,
    vocab_size=52_000,
    max_position_embeddings=514,
    hidden_size=480,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

from transformers_weighted_head.src.transformers import RobertaTokenizerFast
Code Example #21
File: tokenizer_test.py Project: kandorm/CLINE
# -*- coding:utf-8 -*-
import os
from argparse import ArgumentParser
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing, RobertaProcessing

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--token_path",
                        type=str,
                        nargs='?',
                        required=True,
                        help="")
    args = parser.parse_args()

    inputpath = args.token_path
    tokenizer = ByteLevelBPETokenizer(os.path.join(inputpath, "vocab.json"),
                                      os.path.join(inputpath, "merges.txt"),
                                      add_prefix_space=True,
                                      trim_offsets=True,
                                      lowercase=True,
                                      unicode_normalizer="nfkc")
    tokenizer._tokenizer.post_processor = RobertaProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
        trim_offsets=True,
        add_prefix_space=True)
    tokenizer.enable_truncation(max_length=512)
    tokens = tokenizer.encode("I am Julien\nI am from China.").tokens
    print([x.encode('utf-8') for x in tokens])
Code Example #22
File: train.py Project: jamiekang/sagemaker-aihub
                    default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--output-data-dir',
                    type=str,
                    default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--data-dir',
                    type=str,
                    default=os.environ['SM_CHANNEL_TRAINING'])

args = parser.parse_args()

paths = [str(x) for x in Path(args.data_dir).glob("**/*.txt")]
print("data files")
print(paths)

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Need to save it to model dir for inference
tokenizer.save(args.model_dir)

tokenizer = ByteLevelBPETokenizer(os.path.join(args.model_dir, "vocab.json"),
                                  os.path.join(args.model_dir, "merges.txt"))

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
Code Example #23
File: __main__.py Project: JavClaude/LSTM_LM
def main():
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("--path_to_train_data",
                                 type=str,
                                 required=True)
    argument_parser.add_argument("--path_to_eval_data",
                                 type=str,
                                 required=False,
                                 default=None)
    argument_parser.add_argument("--n_epochs",
                                 type=int,
                                 required=False,
                                 default=3)
    argument_parser.add_argument("--batch_size",
                                 type=int,
                                 required=False,
                                 default=32)
    argument_parser.add_argument("--bptt",
                                 type=int,
                                 required=False,
                                 default=64)
    argument_parser.add_argument("--lr",
                                 type=float,
                                 required=False,
                                 default=0.0001)
    argument_parser.add_argument("--vocabulary_size",
                                 type=int,
                                 required=False,
                                 default=20000)
    argument_parser.add_argument("--embedding_dimension",
                                 type=int,
                                 required=False,
                                 default=300)
    argument_parser.add_argument("--hidden_units_for_lstm",
                                 type=int,
                                 required=False,
                                 default=256)
    argument_parser.add_argument("--num_of_lstm_layer",
                                 type=int,
                                 required=False,
                                 default=1)
    argument_parser.add_argument("--n_decoder_blocks",
                                 type=int,
                                 required=False,
                                 default=5)

    arguments = argument_parser.parse_args()

    train_language_modeling_dataset = LanguageModelingDataset(
        arguments.batch_size, arguments.bptt)
    train_language_modeling_dataset.set_tokenizer(ByteLevelBPETokenizer())
    train_language_modeling_dataset.fit(
        arguments.path_to_train_data,
        vocabulary_size=arguments.vocabulary_size)

    train_language_modeling_dataloader = LanguageModelingDataLoader(
        arguments.bptt,
        train_language_modeling_dataset.transform(arguments.path_to_train_data,
                                                  return_target=True),
    )

    model = LSTMModel(
        arguments.vocabulary_size,
        arguments.embedding_dimension,
        arguments.hidden_units_for_lstm,
        arguments.n_decoder_blocks,
        arguments.num_of_lstm_layer,
    )

    logger = TensorboardLogger()
    trainer = Trainer(arguments.batch_size)
    trainer.set_logger(logger)

    if arguments.path_to_eval_data:
        eval_language_modeling_dataloader = LanguageModelingDataLoader(
            arguments.bptt,
            train_language_modeling_dataset.transform(
                arguments.path_to_eval_data, return_target=True),
        )

        trainer.train(
            model,
            train_language_modeling_dataloader,
            CrossEntropyLoss(),
            Adam(model.parameters(), arguments.lr),
            eval_language_modeling_dataloader,
            arguments.n_epochs,
        )

    else:
        trainer.train(
            model,
            train_language_modeling_dataloader,
            CrossEntropyLoss(),
            Adam(model.parameters(), arguments.lr),
            None,
            arguments.n_epochs,
        )

    logger.log_params(vars(arguments), trainer.losses)
    saver = Saver(logger.log_dir())
    saver.save_preprocessor_and_model(train_language_modeling_dataset, model)
Code Example #24
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./bert-tokenizer/vocab.json",
    "./bert-tokenizer/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

### Testing

tokenizer.encode('TOBB ETU NLP&IR team')

Code Example #25
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import os.path
import sys
ROOT_DIRECTORY = os.path.join(os.path.dirname(__file__), '..')
sys.path.append(ROOT_DIRECTORY)

tokenizer = ByteLevelBPETokenizer(
    os.path.join(ROOT_DIRECTORY, "models/en_cycl_tokenizer/vocab.json"),
    os.path.join(ROOT_DIRECTORY, "models/en_cycl_tokenizer/merges.txt"),
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
token_to_encode = "(() ((#$isa #$McCoyTyner-Musician #$Individual)))"
encoded_token = tokenizer.encode(token_to_encode)
print(encoded_token)
print(encoded_token.tokens)

token_to_encode = "Pair of scissors is marketed as office product."
encoded_token = tokenizer.encode(token_to_encode)
print(encoded_token)
print(encoded_token.tokens)
Code Example #26
File: transformer.py Project: rdenadai/BR-BERTo
        return example


# Check that PyTorch sees it
print("CUDA:", torch.cuda.is_available())
corpus_length = 6_993_330  # run wc -l to check the number of lines
vocab_size = 150_000

# Dataset files
# --------------------------------------------------
paths = [str(x) for x in Path("./").glob("**/corpus.txt")]

# Byte Level Tokenize
# --------------------------------------------------
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()
# Customize training
tokenizer.train(files=paths, vocab_size=vocab_size, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
# Save files to disk
tokenizer.save_model("BR_BERTo")
# Test
tokenizer = ByteLevelBPETokenizer(
    "./BR_BERTo/vocab.json",
    "./BR_BERTo/merges.txt",
)
Code Example #27
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

path = "./model"

tokenizer = ByteLevelBPETokenizer(
    "{path}/vocab.json".format(path=path),
    "{path}/merges.txt".format(path=path),
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

tokenizer.enable_truncation(max_length=512)

tokenizer.encode("Dette er første testen.")

tokens = tokenizer.encode("Dette er første testen.").tokens

print(tokens)