def create_trainer(tokenizer, model):
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="data/processed/recipes_train.txt",
        block_size=256,
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir="./artifacts",
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_gpu_train_batch_size=128,
        save_steps=100_000_000,
        save_total_limit=2,
        fp16=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    return trainer
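# A minimal usage sketch for create_trainer above (not from the original source).
# "distilroberta-base" is an assumed checkpoint; any tokenizer/model pair that is
# compatible with DataCollatorForLanguageModeling in masked-LM mode would do.
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

trainer = create_trainer(tokenizer, model)
trainer.train()
trainer.save_model("./artifacts")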
def test_trainer_eval_lm(self):
    MODEL_ID = "distilroberta-base"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=PATH_SAMPLE_TEXT,
        block_size=tokenizer.max_len_single_sentence,
    )
    self.assertEqual(len(dataset), 31)
def __init__(self, opts):
    # Command line arguments
    self.opts = opts

    # Load model and tokenizer
    config = AutoConfig.from_pretrained(opts.ckpt_file)
    self.tokenizer = AutoTokenizer.from_pretrained(opts.ckpt_file)
    self.model = AutoModelWithLMHead.from_pretrained(opts.ckpt_file, config=config)
    self.model.resize_token_embeddings(len(self.tokenizer))

    # Load training arguments
    if opts.mode == 'train' or opts.mode == 'eval':
        # Note: the original assigned attributes onto the TrainingArguments class
        # itself; build a proper instance instead and force CPU execution.
        self.training_args = TrainingArguments(
            output_dir=opts.output_dir,
            logging_dir=opts.output_dir,
            num_train_epochs=opts.num_epochs,
            learning_rate=opts.learning_rate,
            per_device_train_batch_size=opts.batch_size,
            per_device_eval_batch_size=opts.batch_size,
            no_cuda=True,
        )

    # Load dataset
    if opts.mode == 'train' or opts.mode == 'eval':
        self.dataset = LineByLineTextDataset(  # TextDataset
            tokenizer=self.tokenizer,
            file_path=opts.text_file,
            block_size=self.tokenizer.max_len)
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False)
def test_plm(self):
    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
    data_collator = DataCollatorForPermutationLanguageModeling(tokenizer)
    # ^ permutation lm

    dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((31, 112)))
    self.assertEqual(batch["perm_mask"].shape, torch.Size((31, 112, 112)))
    self.assertEqual(batch["target_mapping"].shape, torch.Size((31, 112, 112)))
    self.assertEqual(batch["labels"].shape, torch.Size((31, 112)))

    dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
    self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 512, 512)))
    self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 512, 512)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))

    example = [torch.randint(5, [5])]
    with self.assertRaises(ValueError):
        # Expect error due to odd sequence length
        data_collator(example)
def execute(self, environment_path: str) -> None:
    dataset = LineByLineTextDataset(tokenizer=self.tokenizer,
                                    file_path=self.file_path,
                                    block_size=self.block_size)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=self.tokenizer, mlm=True,
        mlm_probability=self.mlm_probability)

    training_args = TrainingArguments(
        output_dir=os.path.join(environment_path, "temp"),
        overwrite_output_dir=True,
        num_train_epochs=self.epochs,
        per_gpu_train_batch_size=self.batch_size_per_gpu,
        save_steps=self.save_steps,
        save_total_limit=self.save_total_limit,
    )
    trainer = Trainer(
        model=self.model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )

    trainer.train()
    trainer.save_model(os.path.join(environment_path, "model"))
    self.tokenizer.save_pretrained(
        os.path.join(environment_path, "tokenizer"))
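# A minimal sketch (not part of the original class) of reloading the artifacts
# written by execute(). AutoModelForMaskedLM is an assumption based on the
# mlm=True collator above; environment_path is a placeholder.
import os
from transformers import AutoModelForMaskedLM, AutoTokenizer

environment_path = "./runs/example"  # placeholder
model = AutoModelForMaskedLM.from_pretrained(os.path.join(environment_path, "model"))
tokenizer = AutoTokenizer.from_pretrained(os.path.join(environment_path, "tokenizer"))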
def get_dataset(filepath, tokenizer, block_size, line_by_line=False, overwrite_cache=False):
    '''
    Load a dataset from the specified filepath.

    :param filepath: The filepath of the dataset.
    :param tokenizer: The tokenizer to parse the dataset with.
    :param block_size: The length of a single input sequence (block).
    :param line_by_line:
        Indicates whether distinct lines of text in the dataset are to be handled as
        separate sequences (i.e. whether to add the BOS and EOS tokens to each line).
        Defaults to False.
    :param overwrite_cache:
        Overwrite the cached training and evaluation sets. Defaults to False.
    :returns: A :class:`torch.utils.data.Dataset` object.
    '''
    if line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=filepath, block_size=block_size)
    else:
        return TextDataset(tokenizer=tokenizer, file_path=filepath,
                           block_size=block_size, overwrite_cache=overwrite_cache)
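# A minimal sketch of calling get_dataset above (not from the original source);
# "corpus.txt" and the GPT-2 tokenizer are placeholder choices.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# One example per non-empty line of the file, each truncated to 128 tokens.
line_dataset = get_dataset("corpus.txt", tokenizer, block_size=128, line_by_line=True)

# Contiguous 128-token blocks over the concatenated file.
block_dataset = get_dataset("corpus.txt", tokenizer, block_size=128)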
def _dataset(file_path, ref_path=None):
    if args.line_by_line:
        if ref_path is not None:
            if not args.whole_word_mask or not args.mlm:
                raise ValueError(
                    "You need to set whole word masking and mlm to True for Chinese Whole Word Mask"
                )
            return LineByLineWithRefDataset(
                tokenizer=tokenizer,
                file_path=file_path,
                block_size=args.block_size,
                ref_path=ref_path,
            )
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
            cache_dir=cache_dir,
        )
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, max_len, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    elif args.text_dataset:
        return TextDataset(tokenizer=tokenizer, file_path=file_path,
                           block_size=args.block_size, overwrite_cache=args.overwrite_cache)
    else:
        """
        When using a common tab-separated text dataset, use nlp.data.TSVDataset.
        If you want to use another type of dataset, refer to the other classes in
        nlp.data, or set DataTrainingArguments.line_by_line or
        DataTrainingArguments.text_dataset to True.
        """
        dataset = nlp.data.TSVDataset(file_path, field_indices=[1], num_discard_samples=1)
        return Get_dataset(dataset, 0, tokenizer, max_len, True, False)
def get_train_data(epoch):
    p = Path('data/raw/oscar') / f'he_dedup-train-{(epoch % 2) + 1}.txt'
    logger.info(f'{transformer_type} training data: {p}')
    return LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=str(p),
        block_size=128,
    )
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache
        )
def finetune_model(transformers_model_name: str, corpus_file_path: str):
    config = AutoConfig.from_pretrained(
        transformers_model_name, force_download=False,
        cache_dir='../data/download_transformer_models')
    tokenizer = AutoTokenizer.from_pretrained(
        transformers_model_name, force_download=False,
        cache_dir='../data/download_transformer_models')
    # tokenizer = RobertaTokenizerFast.from_pretrained(transformers_model_name, force_download=False, cache_dir='../data/download_transformer_models')
    model = AutoModelForMaskedLM.from_pretrained(
        transformers_model_name, config=config, force_download=False,
        cache_dir='../data/download_transformer_models')

    dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=corpus_file_path, block_size=512)
    train_set, valid_set = train_test_split(dataset, test_size=0.25, random_state=32)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir="../data/finetune_transformer_models/",
        logging_dir='../saved/finetune_logging',
        logging_steps=500,
        overwrite_output_dir=True,
        weight_decay=0.01,
        adam_epsilon=1e-6,
        learning_rate=2e-5,
        num_train_epochs=5,
        per_gpu_train_batch_size=4,
        per_gpu_eval_batch_size=32,
        max_grad_norm=5.0,
        save_steps=1000,
        save_total_limit=2,
        gradient_accumulation_steps=32,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        do_predict=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_set,
        eval_dataset=valid_set,
    )
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    trainer.train()
def _dataset(file_path):
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
        )
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False, local_rank=-1):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank
        )
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            local_rank=local_rank,
        )
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, model_args: ModelArguments,
                evaluate=False, cache_dir=None):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path,
                                 block_size=args.block_size, cache_dir=cache_dir)
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, model_args: ModelArguments,
                evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        if args.mlm_sample_times > 1:
            return FullyLineByLineTextDataset(tokenizer=tokenizer, file_path=file_path,
                                              block_size=args.block_size, cache_dir=model_args.cache_dir)
        else:
            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path,
                                         block_size=args.block_size, cache_dir=model_args.cache_dir)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache
        )
def train():
    dialogues = pd.read_csv('/content/TlkPersonaChatRus/dialogues.tsv', sep='\t')

    # Strip markup tags, the speaker prefixes ("Пользователь 1/2:"), greeting words
    # and stray punctuation, then collapse repeated whitespace.
    for column in dialogues.columns:
        dialogues[column].replace(to_replace=r'<[a-zA-Z0-9_=\/ ]+>', value=' ', regex=True, inplace=True)
    dialogues['dialogue'].replace(
        to_replace=r'Пользователь [12]:|Привет|Здравствуйте|[!)?,]', value='', regex=True, inplace=True)
    dialogues['dialogue'].replace(to_replace=r'\s\s+', value=' ', regex=True, inplace=True)
    dialogues = dialogues['dialogue']
    dialogues.to_csv('./Datasets/dialogues')

    tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
    model = AutoModelWithLMHead.from_pretrained('distilgpt2')
    tokenizer.pad_token = tokenizer.eos_token

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path='/content/drive/MyDrive/semester-practice-3rd/Datasets/dialogues.txt',
        block_size=128,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    training_args = TrainingArguments(
        output_dir='/content/drive/MyDrive/semester-practice-3rd/Models/distilgpt2',
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_device_train_batch_size=8,
    )
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset)
    trainer.train()
    trainer.save_model('model/gpt2_chat')
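# A minimal sketch (not from the original source) of sampling from the model
# saved above; the prompt and generation parameters are illustrative only.
from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelWithLMHead.from_pretrained('model/gpt2_chat')

prompt = 'Как дела'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
output_ids = model.generate(input_ids, max_length=64, do_sample=True, top_p=0.9,
                            pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))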
def main():
    model_path, data_path, output_path = set_path()
    bort_tokenizer = BertTokenizer.from_pretrained(model_path)
    seed_everyone(20210409)

    dataset = LineByLineTextDataset(
        tokenizer=bort_tokenizer,
        file_path=data_path,
        block_size=42,
    )
    model = NeZhaForMaskedLM.from_pretrained(model_path)
    data_collator = DataCollatorForLanguageModeling(tokenizer=bort_tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    logging_path = os.path.join(output_path, 'log')
    model_save_path = os.path.join(output_path, 'best_model_ckpt')
    tokenizer_and_configs = os.path.join(output_path, 'tokenizer_and_configs')
    check_path(model_save_path)
    check_path(logging_path)
    check_path(tokenizer_and_configs)

    training_args = TrainingArguments(
        output_dir=output_path,
        overwrite_output_dir=True,
        num_train_epochs=60,  # 60
        learning_rate=6e-5,
        fp16_backend='auto',
        per_device_train_batch_size=128,  # 64
        save_steps=1000,  # 1000
        logging_steps=1000,
        save_total_limit=10,  # 10
        run_name='80',
        logging_dir=logging_path,
        logging_first_step=True,
        dataloader_num_workers=4,
        disable_tqdm=False,
        seed=20200409)

    nezha_bert_trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    nezha_bert_trainer.train()
    nezha_bert_trainer.save_model(model_save_path)
    bort_tokenizer.save_pretrained(tokenizer_and_configs)
def get_dataset(
    args: DataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    evaluate: bool = False,
    cache_dir: Optional[str] = None,
):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
            cache_dir=cache_dir,
        )
def test_lm_tokenizer_without_padding(self):
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    # ^ causal lm

    dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
    examples = [dataset[i] for i in range(len(dataset))]
    with self.assertRaises(ValueError):
        # Expect error due to padding token missing on gpt2:
        data_collator.collate_batch(examples)

    dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator.collate_batch(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))
def build(config):
    tokenizer = RobertaTokenizerFast.from_pretrained(
        os.path.join(config.save_directory), max_len=config.max_length
    )
    model_config = RobertaConfig(
        vocab_size=config.vocab_size,
        max_position_embeddings=config.max_length,
        num_attention_heads=config.num_attention_heads,
        num_hidden_layers=config.num_hidden_layers,
        type_vocab_size=1
    )
    model = RobertaForMaskedLM(config=model_config)
    print("the number of parameters of model: ", model.num_parameters())

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=config.files,
        block_size=32
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=config.mlm_probability
    )
    training_args = TrainingArguments(
        output_dir=os.path.join(config.save_directory),
        overwrite_output_dir=config.overwrite_output_dir,
        num_train_epochs=config.num_train_epochs,
        per_gpu_train_batch_size=config.per_gpu_train_batch_size,
        save_steps=config.save_steps,
        save_total_limit=config.save_total_limit
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=config.prediction_loss_only
    )
    return trainer
def main():
    args = parse_arguments()

    if args.input_model is None:
        model = GPT2LMHeadModel.from_pretrained("antoiloui/belgpt2")
    else:
        print('loading pre-trained model')
        model = GPT2LMHeadModel.from_pretrained(args.input_model)
    tokenizer = GPT2Tokenizer.from_pretrained("antoiloui/belgpt2")

    training_args = TrainingArguments(
        output_dir=args.output_dir + '_checkpoint',  # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=64,  # batch size per device during training
        warmup_steps=100,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs_hyca',       # directory for storing logs
        logging_steps=100,
    )

    special_tokens_dict = {
        'bos_token': '<BOS>',
        'eos_token': '<EOS>',
        'pad_token': '<PAD>'
    }
    tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=args.input_file, block_size=32)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    if args.input_model is not None:
        trainer.train(resume_from_checkpoint=args.input_model + '_checkpoint')
    else:
        trainer.train()

    model.save_pretrained(args.output_dir)
def test_lm_tokenizer_with_padding(self):
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
    data_collator = DataCollatorForLanguageModeling(tokenizer)
    # ^ masked lm

    dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator.collate_batch(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((31, 107)))
    self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((31, 107)))

    dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator.collate_batch(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
    self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((2, 512)))
def _get_dataset(
    self,
    file_path: str,
    line_by_line: bool,
    block_size: int,
    overwrite_cache: bool,
) -> Dataset:
    if line_by_line:
        return LineByLineTextDataset(
            tokenizer=self.tokenizer, file_path=file_path, block_size=block_size
        )
    else:
        return TextDataset(
            tokenizer=self.tokenizer,
            file_path=file_path,
            block_size=block_size,
            overwrite_cache=overwrite_cache,
        )
def get_dataset(args: DataTrainingArguments, model_name_or_path, tokenizer: PreTrainedTokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        if 'dialogpt' in model_name_or_path.lower():
            return LineByLinePersonaChatDataset(tokenizer=tokenizer, file_path=file_path,
                                                block_size=args.block_size)
        else:
            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path,
                                         block_size=args.block_size)
    else:
        return TextDataset(tokenizer=tokenizer, file_path=file_path,
                           block_size=args.block_size, overwrite_cache=args.overwrite_cache)
def main():
    tokenizer = BertTokenizer.from_pretrained('vocab/bert-base-chinese-vocab.txt')

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="data/dialogue_lined/multi-sents-further-pretrain/train_test_dialogues.txt",
        block_size=512,
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir="model/multi-sents-test-further-pretrained-bert",
        do_train=True,
        warmup_steps=int(100 * (len(dataset) / 32) * 0.1),
        # warmup_steps=10000,
        overwrite_output_dir=True,
        num_train_epochs=100,
        # max_steps=100000,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        save_steps=1000,
        logging_steps=10,
        weight_decay=0.01
    )
    model = BertForMaskedLM.from_pretrained('bert-base-chinese')
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    trainer.train()
    trainer.save_model('model/multi-sents-test-further-pretrained-bert')
    return
def main(args):
    # Import the custom trained tokenizer
    tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer)

    # Define the model
    config = RobertaConfig(vocab_size=32000)
    model = RobertaForMaskedLM(config=config)

    # Import the dataset
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=args.data,
        block_size=128,
    )

    # Initialize the data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    # Set all of the training arguments
    training_args = TrainingArguments(
        output_dir=args.output,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_gpu_train_batch_size=24,
        save_steps=10_000,
        save_total_limit=10,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()

    # Save the model
    trainer.save_model("./roBERTaCODE_{}_{}".format(args.language, args.size))
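# A minimal sketch (not from the original project) of querying the model saved
# above with a fill-mask pipeline. The saved directory name, tokenizer directory
# and masked sentence are illustrative placeholders.
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./roBERTaCODE_python_small",  # directory written by trainer.save_model
    tokenizer="./tokenizer",             # the directory passed as args.tokenizer
)
print(fill_mask("def add(a, b): return a <mask> b"))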
def train_mod(txt_dir, tokenizer, model_dir):
    config = RobertaConfig(
        vocab_size=3305,
        max_position_embeddings=1024,
        num_attention_heads=12,
        num_hidden_layers=6,
        output_attentions=True,
        type_vocab_size=1,
    )
    dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=txt_dir, block_size=1024)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=model_dir,
        overwrite_output_dir=True,
        num_train_epochs=1000,
        per_gpu_train_batch_size=16,
        save_steps=1000,
        save_total_limit=37,
        prediction_loss_only=True,
    )
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset)
    trainer.train()
    trainer.save_model(model_dir)
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, inline_meta: str = None,
                local_rank=-1):
    file_path = args.eval_data_file
    if args.webtext:
        return WebTextPretokenizedDataset(tokenizer=tokenizer, file_path=file_path,
                                          block_size=args.block_size, inline_meta=inline_meta,
                                          local_rank=local_rank)
    elif args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path,
                                     block_size=args.block_size, local_rank=local_rank)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            local_rank=local_rank,
        )
def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file", default=None, type=str, required=True,
                        help="Configuration file")
    args = parser.parse_args()

    config = TrainModelConfig.load(args.config_file)
    logger = config.logger()  # noqa: F841

    dataset = LineByLineTextDataset(
        tokenizer=config.tokenizer,
        file_path=config.file_path,
        block_size=config.block_size
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=config.tokenizer, mlm=True, mlm_probability=config.mlm_probability
    )
    training_args = TrainingArguments(
        output_dir=config.saving_folder,
        overwrite_output_dir=True,
        num_train_epochs=config.epochs,
        per_gpu_train_batch_size=config.batch_size_per_gpu,
        save_steps=config.save_steps,
        save_total_limit=config.save_total_limit,
    )
    trainer = Trainer(
        model=config.model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    trainer.train()
    trainer.save_model(config.saving_folder)
    config.tokenizer.save_pretrained(config.saving_folder)
for name, param in model.named_parameters():
    if freeze_layer in name:
        print(name)
        param.requires_grad = False
    else:
        pass

print('===========================')
print('The model has: ', count_parameters(model))
print('===========================')

file_path = 'multi-label_train.csv.txt'
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=128)
# dataset = load_dataset("./csv_for_ft_new.py", data_files=file_path)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

dir = str(args.resultpath) + str(args.data) + '_' + str(args.LM) + '_e20' + '_b' + str(args.batch_size)

training_args = TrainingArguments(
    do_train=True,
    do_predict=True,
    output_dir=dir,
    overwrite_output_dir=True,
    num_train_epochs=args.num_train_epochs,
    per_device_train_batch_size=args.batch_size,
    save_steps=10000,
    save_total_limit=2,