def run_trainer(
    self,
    eval_steps: int,
    max_len: int,
    model_name: str,
    num_train_epochs: int,
    learning_rate: float = 3e-3,
    distributed: bool = False,
    extra_args_str: str = None,
    predict_with_generate: bool = True,
):
    """Launch ``run_translation.py`` on the wmt_en_ro fixture and return its output dir.

    The command line always enables ``--do_train``, ``--do_eval`` and
    ``--do_predict`` and limits training/validation to 8 samples each.

    Args:
        eval_steps: Used for both ``--eval_steps`` and ``--save_steps``.
        max_len: Used for the source, target and validation-target max lengths.
        model_name: Forwarded as ``--model_name_or_path``.
        num_train_epochs: Forwarded as ``--num_train_epochs``.
        learning_rate: Forwarded as ``--learning_rate``.
        distributed: When true, run the script in a subprocess through
            ``torch.distributed.launch`` with one process per GPU reported by
            ``get_gpu_count()``; otherwise call ``main()`` in-process with a
            patched ``sys.argv``.
        extra_args_str: Optional whitespace-separated arguments appended last.
        predict_with_generate: When true, add ``--predict_with_generate``.

    Returns:
        The temporary output directory obtained from
        ``self.get_auto_remove_tmp_dir()``.
    """
    fixture_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
    output_dir = self.get_auto_remove_tmp_dir()

    cli_args = f"""
        --model_name_or_path {model_name}
        --train_file {fixture_dir}/train.json
        --validation_file {fixture_dir}/val.json
        --test_file {fixture_dir}/test.json
        --output_dir {output_dir}
        --overwrite_output_dir
        --max_train_samples 8
        --max_val_samples 8
        --max_source_length {max_len}
        --max_target_length {max_len}
        --val_max_target_length {max_len}
        --do_train
        --do_eval
        --do_predict
        --num_train_epochs {num_train_epochs}
        --per_device_train_batch_size 4
        --per_device_eval_batch_size 4
        --learning_rate {learning_rate}
        --warmup_steps 8
        --evaluation_strategy steps
        --logging_steps 0
        --eval_steps {eval_steps}
        --save_steps {eval_steps}
        --group_by_length
        --label_smoothing_factor 0.1
        --adafactor
        --target_lang ro_RO
        --source_lang en_XX
    """.split()

    # Optional pieces go after the fixed arguments, caller-supplied extras last.
    if predict_with_generate:
        cli_args.append("--predict_with_generate")
    if extra_args_str is not None:
        cli_args += extra_args_str.split()

    if not distributed:
        # Single-process path: run the script's entry point in this interpreter.
        with patch.object(sys, "argv", ["run_translation.py"] + cli_args):
            main()
        return output_dir

    # Multi-process path: spawn one worker per available GPU.
    gpu_count = get_gpu_count()
    launcher_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={gpu_count}
        {self.examples_dir_str}/pytorch/translation/run_translation.py
    """.split()
    execute_subprocess_async([sys.executable] + launcher_args + cli_args, env=self.get_env())
    return output_dir
def run_trainer(
    self,
    stage: str,
    model_name: str,
    eval_steps: int = 10,
    num_train_epochs: int = 1,
    do_train: bool = False,
    do_eval: bool = True,
    distributed: bool = True,
    extra_args_str: str = None,
    remove_args_str: str = None,
):
    """Run ``run_translation.py`` under the ``deepspeed`` launcher and return the output dir.

    Args:
        stage: Selects the config file ``ds_config_{stage}.json`` located in
            ``self.test_file_dir_str``.
        model_name: Forwarded as ``--model_name_or_path``.
        eval_steps: Forwarded as ``--eval_steps``.
        num_train_epochs: Forwarded as ``--num_train_epochs`` when training.
        do_train: Add the training arguments (100 train samples, batch size 2).
        do_eval: Add the evaluation arguments (100 val samples, batch size 2).
        distributed: Use ``get_gpu_count()`` GPUs when true, otherwise 1.
        extra_args_str: Optional whitespace-separated arguments to append.
        remove_args_str: Optional whitespace-separated tokens to drop from the
            final argument list. Matching is per token, so it only works for
            boolean flags (a flag's value would be left behind).

    Returns:
        The temporary output directory obtained from
        ``self.get_auto_remove_tmp_dir()``.

    Raises:
        AssertionError: If neither ``do_train`` nor ``do_eval`` is set.
    """
    max_len = 32
    data_dir = self.examples_dir / "test_data/wmt_en_ro"
    output_dir = self.get_auto_remove_tmp_dir()

    cli_args = f"""
        --model_name_or_path {model_name}
        --train_file {data_dir}/train.json
        --validation_file {data_dir}/val.json
        --output_dir {output_dir}
        --overwrite_output_dir
        --max_source_length {max_len}
        --max_target_length {max_len}
        --val_max_target_length {max_len}
        --warmup_steps 8
        --predict_with_generate
        --logging_steps 0
        --save_steps 0
        --eval_steps {eval_steps}
        --group_by_length
        --label_smoothing_factor 0.1
        --adafactor
        --source_lang en
        --target_lang ro
    """.split()
    # The prefix contains spaces, so it is added as a single pre-split token.
    # NOTE(review): the inner double quotes are passed literally because the
    # command is an argv list, not a shell string - confirm this is intended.
    cli_args += ["--source_prefix", '"translate English to Romanian: "']

    if do_train:
        cli_args += f"""
            --do_train
            --num_train_epochs {num_train_epochs}
            --max_train_samples 100
            --per_device_train_batch_size 2
            --learning_rate 3e-3
        """.split()

    if do_eval:
        cli_args += """
            --do_eval
            --max_val_samples 100
            --per_device_eval_batch_size 2
        """.split()

    assert do_train or do_eval, "need at least do_train or do_eval for the test to run"

    if extra_args_str is not None:
        cli_args += extra_args_str.split()

    # currently only works for bool args
    if remove_args_str is not None:
        unwanted = remove_args_str.split()
        cli_args = [token for token in cli_args if token not in unwanted]

    deepspeed_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
    script = [f"{self.examples_dir_str}/seq2seq/run_translation.py"]
    gpu_count = get_gpu_count() if distributed else 1
    launcher = f"deepspeed --num_gpus {gpu_count}".split()

    cmd = launcher + script + cli_args + deepspeed_args
    # keep for quick debug
    # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] + cmd)); die
    execute_subprocess_async(cmd, env=self.get_env())
    return output_dir
def run_trainer(
    self,
    eval_steps: int,
    max_len: int,
    model_name: str,
    num_train_epochs: int,
    distributed: bool = False,
    extra_args_str: str = None,
):
    """Launch ``finetune_trainer.py`` on the wmt_en_ro test data and return its output dir.

    The command line always enables ``--do_train``, ``--do_eval`` and
    ``--do_predict`` and limits training/validation to 8 examples each.

    Args:
        eval_steps: Used for both ``--save_steps`` and ``--eval_steps``.
        max_len: Length limit used for the source, target and
            validation-target max lengths.
        model_name: Forwarded as ``--model_name_or_path``.
        num_train_epochs: Forwarded as ``--num_train_epochs``.
        distributed: When true, run the script in a subprocess through
            ``torch.distributed.launch`` with one process per GPU reported by
            ``get_gpu_count()``; otherwise call ``main()`` in-process with a
            patched ``sys.argv``.
        extra_args_str: Optional whitespace-separated arguments appended last.

    Returns:
        The temporary output directory obtained from
        ``self.get_auto_remove_tmp_dir()``.
    """
    data_dir = self.examples_dir / "seq2seq/test_data/wmt_en_ro"
    output_dir = self.get_auto_remove_tmp_dir()

    args = f"""
        --model_name_or_path {model_name}
        --data_dir {data_dir}
        --output_dir {output_dir}
        --overwrite_output_dir
        --n_train 8
        --n_val 8
        --max_source_length {max_len}
        --max_target_length {max_len}
        --val_max_target_length {max_len}
        --do_train
        --do_eval
        --do_predict
        --num_train_epochs {num_train_epochs}
        --per_device_train_batch_size 4
        --per_device_eval_batch_size 4
        --learning_rate 3e-3
        --warmup_steps 8
        --evaluation_strategy steps
        --predict_with_generate
        --logging_steps 0
        --save_steps {eval_steps}
        --eval_steps {eval_steps}
        --sortish_sampler
        --label_smoothing 0.1
        --adafactor
        --task translation
        --tgt_lang ro_RO
        --src_lang en_XX
    """.split()
    # --eval_beams 2

    if extra_args_str is not None:
        args.extend(extra_args_str.split())

    if distributed:
        # Multi-process path: spawn one worker per available GPU.
        n_gpu = get_gpu_count()
        distributed_args = f"""
            -m torch.distributed.launch
            --nproc_per_node={n_gpu}
            {self.test_file_dir}/finetune_trainer.py
        """.split()
        cmd = [sys.executable] + distributed_args + args
        execute_subprocess_async(cmd, env=self.get_env())
    else:
        # Single-process path: run the script's entry point in this interpreter.
        testargs = ["finetune_trainer.py"] + args
        with patch.object(sys, "argv", testargs):
            main()

    return output_dir