def test_instantiate(self):
    processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
    assert processor is not None
    assert isinstance(processor, PostProcessor)
    assert isinstance(processor, BertProcessing)
    assert isinstance(
        pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
        BertProcessing,
    )
def initialize_model():
    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    print("device", device)

    '''create tokenizers'''
    tokenizer = ByteLevelBPETokenizer(
        "data/english_tokenizer-vocab.json",
        "data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='[PAD]', length=config['max_len'])
    tokenizer.enable_truncation(max_length=config['max_len'])

    ''' Create model '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'], config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    checkpoint = torch.load(config['pretrained_model'], map_location=device)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    model = model.to(device)

    return config, model, tokenizer, device
def __init__(self, evaluate: bool = False):
    tokenizer = ByteLevelBPETokenizer(
        "./model/bbpe/vocab.json",
        "./model/bbpe/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    # or use the RobertaTokenizer from `transformers` directly.
    self.examples = []

    src_files = Path("./data/").glob("*_eval.csv") if evaluate else Path("./data/").glob("*_eval.csv")
    for src_file in src_files:
        print("🔥", src_file)
        with open(src_file, 'r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                self.examples += [x.ids for x in tokenizer.encode_batch(line)]
                if index % 10000 == 0:
                    print(src_file, index // 10000)
def __init__(self, t: PreTrainedTokenizer, args, file_path: str, block_size=512):
    assert os.path.isfile(file_path)
    logger.info("Creating features from dataset file at %s", file_path)

    # -------------------------- CHANGES START
    bert_tokenizer = os.path.join(args.tokenizer_name, "vocab.txt")
    if os.path.exists(bert_tokenizer):
        logger.info("Loading BERT tokenizer")
        from tokenizers import BertWordPieceTokenizer
        tokenizer = BertWordPieceTokenizer(
            os.path.join(args.tokenizer_name, "vocab.txt"),
            handle_chinese_chars=False,
            lowercase=False,
        )
        tokenizer.enable_truncation(512)
    else:
        from tokenizers import ByteLevelBPETokenizer
        from tokenizers.processors import BertProcessing

        logger.info("Loading RoBERTa tokenizer")
        tokenizer = ByteLevelBPETokenizer(
            os.path.join(args.tokenizer_name, "vocab.json"),
            os.path.join(args.tokenizer_name, "merges.txt"),
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

    logger.info("Reading file %s", file_path)
    with open(file_path, encoding="utf-8") as f:
        lines = [line for line in f.read().splitlines()
                 if (len(line) > 0 and not line.isspace())]

    logger.info("Running tokenization")
    self.examples = tokenizer.encode_batch(lines)
def get_tokenizer(path):
    tokenizer = ByteLevelBPETokenizer(path + 'vocab.json', path + 'merges.txt')
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    return tokenizer
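# A minimal usage sketch for get_tokenizer above. The "./tokenizer/" path and the
# sample sentence are placeholders for illustration, not from the original code;
# it assumes the vocab contains the <s> and </s> special tokens.
tok = get_tokenizer("./tokenizer/")
enc = tok.encode("hello world")
print(enc.tokens)  # e.g. ['<s>', 'hello', 'Ġworld', '</s>'] -- BertProcessing adds <s>/</s>
print(enc.ids)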
def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
    assert os.path.isfile(file_path)
    # Here, we do not cache the features, operating under the assumption
    # that we will soon use fast multithreaded tokenizers from the
    # `tokenizers` repo everywhere =)
    logger.info(" Creating features from dataset file at %s", file_path)

    with open(file_path, encoding="utf-8") as f:
        lines = [
            line for line in f.read().splitlines()
            if (len(line) > 0 and not line.isspace())
        ]

    tokenizer = ByteLevelBPETokenizer(
        f"{args['tokenizer_name']}/vocab.json",
        f"{args['tokenizer_name']}/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=block_size)

    self.examples = [t.ids for t in tokenizer.encode_batch(lines)]
def test_tokenizer(test_sentence, vocab_path, merge_path):
    r"""
    Illustrates how the individual Tokenizer works

    Args:
        test_sentence (:obj:`str`):
            Sentence for demonstration purposes
        vocab_path (:obj:`str`):
            Path where the vocabulary (most frequent tokens ranked by frequency) is saved
        merge_path (:obj:`str`):
            Path where the merges file is saved
    """
    tokenizer = ByteLevelBPETokenizer(vocab_path, merge_path)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")))
    tokenizer.enable_truncation(max_length=512)

    print("Original sentence: " + test_sentence)
    print("Encoded string: {}".format(tokenizer.encode(test_sentence).tokens))

    encoding = tokenizer.encode(test_sentence)
    decoded = tokenizer.decode(encoding.ids)
    print("Decoded string: {}".format(decoded))
def initialize_model():
    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    print("device", device)

    ''' Create dataloaders '''
    train_dataset = SplitReshapeTrainDataset(config['complex_sentences_file'],
                                             config['simple_sentences_file'])
    train_data, val_data = torch.utils.data.random_split(
        train_dataset,
        [round(config["train_data_percentage"] * len(train_dataset)),
         round(config["val_data_percentage"] * len(train_dataset))])
    train_dataloader = DataLoader(train_data, batch_size=config["batch_size"],
                                  num_workers=config["num_of_workers"], pin_memory=True)
    val_dataloader = DataLoader(val_data, batch_size=config["batch_size"],
                                num_workers=config["num_of_workers"], pin_memory=True)

    ''' create tokenizer '''
    tokenizer = ByteLevelBPETokenizer(
        "./data/english_tokenizer-vocab.json",
        "./data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    ''' Create model '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'], config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    model.train()
    trainer = model.to(device)

    ''' Create Optimizer '''
    loss_fun = nn.CrossEntropyLoss(ignore_index=config['src_pad_idx'])
    optimizer = optim.Adam(trainer.parameters(), lr=config["learning_rate"])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10)
    writer = SummaryWriter()

    return config, train_dataloader, val_dataloader, trainer, loss_fun, optimizer, writer, device, scheduler, tokenizer
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    try:
        tokenizer = WordLevel(vocab_file, unk_token=unk_token)
        tokenizer = Tokenizer(tokenizer)
    except Exception:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizer, "
            "please note they are not compatible.".format(vocab_file))

    # Create the correct normalization path
    normalizer = []

    # Include unicode normalization
    if normalization:
        normalizer += [unicode_normalizer_from_str(normalization)]

    # Include case normalization
    if lowercase:
        normalizer += [Lowercase()]

    # Strip normalizer at the end
    normalizer += [Strip(left=True, right=True)]

    if len(normalizer) > 0:
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        tokenizer.post_processor = BertProcessing(
            (eos_token, tokenizer.token_to_id(eos_token)),
            (eos_token, tokenizer.token_to_id(eos_token)))

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def from_pretrained(cls, tokenizer_name, cache_dir=None):
    tokenizer = KariBERTaTokenizer(tokenizer_name)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    tokenizer.enable_padding()
    return tokenizer
def __init__(self, cfg):
    super().__init__(cfg)
    self.scales = [str((cfg.load_size // (2**i))) for i in range(3)]
    self.scales.reverse()
    self.device_map = {
        'style': self.devices[0],
        'content': self.devices[0],
        'img': self.devices[0]
    }
    self.network_names = [
        'style_model', 'content_model', 'generator', 'discriminators'
    ]
    self.device_name_map = {
        'style_model': 'style',
        'content_model': 'content',
        'generators': 'img',
        'discriminators': 'img'
    }

    tokenizer = ByteLevelBPETokenizer(
        "vocab.json",
        "merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    self.cold = True
    self.language_model = LanguageModel(cfg, tokenizer,
                                        self.device_map['style']).to(self.device_map['style'])
    self.content_model = VAE(cfg.rnn_hidden_dim, self.device_map['style'],
                             cfg).to(self.device_map['style'])
    self.style_model = VAE(cfg.rnn_hidden_dim, self.device_map['style'],
                           cfg).to(self.device_map['style'])
    self.generator = StyleGenerator(cfg).to(self.device_map['img'])
    self.discriminator = FeatureConvolutionalDiscriminator(cfg).to(self.device_map['img'])

    self.visual_names = ['visual_dict']
    self.visual_dict = {'real': None, 'fake': None}
    self.loss_names = ['loss']
    self.visualizer = Visualizer(cfg)

    self.generator_criterion = BinaryCrossEntropyLoss(cfg).to(self.device_map['img'])
    self.consistency_criterion = ColorConsistencyLoss(cfg).to(self.device_map['img'])
    self.distribution_criterion = KLDLoss().to(self.device_map['img'])

    self.latent_scale = int(cfg.load_size // (2**6))
    self.latent_channels = int(cfg.latent_dim) // (self.latent_scale**2)
    self.channels_z = 8 * self.cfg.ngf - self.latent_channels
def test_processing(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
    assert output.ids == [1, 2, 3, 0, 6, 0]
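# Companion sketch to test_processing above: with BertProcessing, a single
# sequence is wrapped as [CLS] ... [SEP] with no trailing pair segment.
# A self-contained illustration using the same toy vocabulary (an assumption
# for demonstration, not part of the original test file):
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.processors import BertProcessing

tokenizer = Tokenizer(BPE())
tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

single = tokenizer.encode("my name")
assert single.tokens == ["[CLS]", "my", "name", "[SEP]"]
assert single.ids == [1, 2, 3, 0]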
def __init__(
    self,
    vocab_file: Optional[str] = None,
    add_special_tokens: bool = True,
    unk_token: str = "[UNK]",
    sep_token: str = "[SEP]",
    cls_token: str = "[CLS]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    if vocab_file is not None:
        tokenizer = Tokenizer(WordPiece.from_files(vocab_file, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(WordPiece.empty())

    tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if add_special_tokens and vocab_file is not None:
        sep_token_id = tokenizer.token_to_id(sep_token)
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(cls_token)
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (sep_token, sep_token_id), (cls_token, cls_token_id))

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "add_special_tokens": add_special_tokens,
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
def train_tokenizer(input_path, output_path, vocab_size=10000):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[input_path],
                    vocab_size=vocab_size,
                    special_tokens=["[PAD]", "<s>", "</s>", "<unk>"])
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.save_model(output_path)
    return tokenizer
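# Hypothetical usage of train_tokenizer above; "corpus.txt" and "out_dir" are
# placeholder paths (out_dir is assumed to exist before save_model is called).
tok = train_tokenizer("corpus.txt", "out_dir", vocab_size=10000)
enc = tok.encode("a short test sentence")
print(enc.tokens[0], enc.tokens[-1])  # expected: <s> and </s> added by BertProcessing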
def test_bert_parity(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
    original = tokenizer.encode("my name", "pair")

    tokenizer.post_processor = self.get_bert()
    template = tokenizer.encode("my name", "pair")
    assert original.ids == template.ids
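# self.get_bert() is not shown in this snippet. A plausible sketch of such a
# helper, assuming it builds a TemplateProcessing post-processor equivalent to
# BertProcessing(("[SEP]", 0), ("[CLS]", 1)) (an assumption, not the original code):
from tokenizers.processors import TemplateProcessing

def get_bert(self):
    return TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )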
def __init__(self, max_tokens=512):
    ## RoBERTa uses BPE tokenizer similar to GPT
    t = ByteLevelBPETokenizer("tokenizer/vocab.json", "tokenizer/merges.txt")
    t._tokenizer.post_processor = BertProcessing(
        ("</s>", t.token_to_id("</s>")),
        ("<s>", t.token_to_id("<s>")),
    )
    t.enable_truncation(max_tokens)
    t.enable_padding(length=max_tokens, pad_id=t.token_to_id("<pad>"))
    self.tokenizer = t
def load_sentence_piece_model():
    tokenizer = ByteLevelBPETokenizer(path_vocab, path_model)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")))
    tokenizer.enable_truncation(max_length=512)

    encoding = tokenizer.encode("배고파요")  # Korean sample sentence ("I'm hungry")
    print(encoding.tokens)
    print(encoding.special_tokens_mask)
    print(encoding.ids)
    print(encoding.normalized_str)
def create_norwegian_tokenizer():
    tokenizer = ByteLevelBPETokenizer(
        "./models/KariBERTa-tiny/vocab.json",
        "./models/KariBERTa-tiny/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    tokenizer.enable_padding()
    return tokenizer
def __init__(self, tok_dir, max_seq_len, **kwargs):
    tokenizer = CharBPETokenizer(f"{tok_dir}/vocab.json", f"{tok_dir}/merges.txt")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=max_seq_len)
    self._pretrained_tokenizer = tokenizer
    self.max_seq_len = max_seq_len
def __load_tokenizer(self):
    parent_path, _ = os.path.split(__file__)
    data_path = os.path.join(parent_path, "data")
    tokenizer_path = os.path.join(data_path, str(self.__vocab_size),
                                  str(self.__min_frequence), self.__tokenizer_name)
    self.__tokenizer = RobertaTokenizerFast.from_pretrained(
        tokenizer_path, max_len=self.max_length)
    self.__tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", self.__tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", self.__tokenizer.convert_tokens_to_ids("<s>")),
    )
def __init__(self, evaluate: bool = False):
    tokenizer = ByteLevelBPETokenizer(
        "./esperberto-vocab.json",
        "./esperberto-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    self.examples = []
    src_files = Path("./")
def load_custom_tokenizer(self, path):
    tokenizer = ByteLevelBPETokenizer(path + "-vocab.json", path + "-merges.txt")
    # Add special tokens via post-processing, as RoBERTa does
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    return PreTrainedTokenizerFast(tokenizer,
                                   pad_token="<pad>",
                                   mask_token="<mask>",
                                   unk_token="<unk>",
                                   bos_token="<s>",
                                   eos_token="</s>")
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
    tokenizer = Tokenizer(tokenizer)

    # Create the correct normalization path
    normalizer = []

    # Include unicode normalization
    if normalization:
        normalizer += [unicode_normalizer_from_str(normalization)]

    # Include case normalization
    if lowercase:
        normalizer += [Lowercase()]

    if len(normalizer) > 0:
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        tokenizer.post_processor = BertProcessing(
            (eos_token, tokenizer.token_to_id(eos_token)),
            (eos_token, tokenizer.token_to_id(eos_token)))

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def load_tokenizer(path, enable_truncation=True, enable_padding=True, max_length=512):
    tokenizer = SentencePieceBPETokenizer(os.path.join(path, "vocab.json"),
                                          os.path.join(path, "merges.txt"))
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    if enable_truncation:
        tokenizer.enable_truncation(max_length=max_length)
    if enable_padding:
        tokenizer.enable_padding(pad_token="<pad>",
                                 pad_id=tokenizer.token_to_id("<pad>"))
    return tokenizer
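# Hypothetical usage of load_tokenizer above; "./spm_tokenizer" is a placeholder
# path, and the vocab is assumed to contain <s>, </s> and <pad>.
tok = load_tokenizer("./spm_tokenizer", max_length=128)
batch = tok.encode_batch(["short sentence", "a slightly longer sentence"])
# Since enable_padding() is called without a fixed length, sequences are padded
# to the longest item in the batch; truncation still caps them at max_length.
print([len(enc.ids) for enc in batch])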
def main(args):
    data = np.load(args.data, allow_pickle=True)

    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)
    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
def __init__(self, args, file_path: str):
    tokenizer = CharBPETokenizer(
        f'{args.tokenizer_name}/vocab.json',
        f'{args.tokenizer_name}/merges.txt',
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=256)
    # or use the RobertaTokenizer from `transformers` directly.
    self.examples = []

    with open(file_path, encoding="utf-8") as f:
        lines = [line for line in f.read().splitlines()
                 if (len(line) > 0 and not line.isspace())]

    self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
def __init__(self, evaluate: bool = False):
    tokenizer = ByteLevelBPETokenizer(
        "./roberta-lm/vocab.json",
        "./roberta-lm/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    # or use the RobertaTokenizer from `transformers` directly.
    self.examples = []

    src_files = Path("./data/montecristo/").glob("**/*.txt")
    for src_file in src_files:
        print("🔥", src_file)
        lines = src_file.read_text(encoding="utf-8").splitlines()
        self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
def __init__(
    self,
    vocab_file,
    unk_token="<unk>",
    sep_token="<sep>",
    cls_token="<cls>",
    pad_token="<pad>",
    mask_token="<mask>",
    lowercase: bool = True,
):
    tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=unk_token))
    tokenizer.normalizer = Strip()
    tokenizer.pre_tokenizer = CharDelimiterSplit(" ")
    tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    # Let the tokenizer know about special tokens if they are part of the vocab
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def load_sentence_piece_model(path_vocab, path_model):
    tokenizer = ByteLevelBPETokenizer(path_vocab, path_model)
    # Note: BertProcessing takes (sep, cls); the first pair is appended to the
    # end of the sequence and the second is prepended to the start.
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("<bos>", tokenizer.token_to_id("<bos>")),
        ("<eos>", tokenizer.token_to_id("<eos>"))
    )
    tokenizer.enable_truncation(max_length=512)

    # encoding = tokenizer.encode("배고파요")
    # print(encoding.tokens)
    # print(encoding.special_tokens_mask)
    # print(encoding.ids)
    # print(encoding.normalized_str)
    #
    # decoding = tokenizer.decode([2, 1177, 276, 692, 571, 1])
    # print(decoding)

    return tokenizer
def __init__(self, file_path: str = None, tokenizer_path: str = None):
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path + "/vocab.json",
        tokenizer_path + "/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    self.examples = []
    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()
        lines = [line for line in lines if (len(line) > 0 and not line.isspace())]
    self.examples += [x.ids for x in tokenizer.encode_batch(lines)]