Example no. 1
    def test_clm(self, stage):
        # this test exercises model.resize_token_embeddings() which requires param gathering outside
        # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`

        data_dir = self.tests_dir / "fixtures"
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"""
            --model_name_or_path {GPT2_TINY}
            --train_file {data_dir}/sample_text.txt
            --validation_file {data_dir}/sample_text.txt
            --output_dir {output_dir}
            --overwrite_output_dir
            --do_train
            --do_eval
            --max_train_samples 16
            --max_eval_samples 16
            --per_device_train_batch_size 2
            --per_device_eval_batch_size 2
            --num_train_epochs 1
            --warmup_steps 8
            --block_size 64
            --fp16
            --report_to none
            """.split()

        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
        script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"]
        launcher = get_launcher(distributed=True)

        cmd = launcher + script + args + ds_args
        # keep for quick debug
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
        execute_subprocess_async(cmd, env=self.get_env())
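
A hedged aside on the behavior this test exercises: under ZeRO-3 the embedding matrix is partitioned across ranks, so resizing it outside of forward() must gather the shards first. A minimal sketch using deepspeed.zero.GatheredParameters (a no-op when the parameter is not partitioned; the checkpoint name is assumed to match the GPT2_TINY fixture used above):

    import deepspeed
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
    # Gather the (possibly partitioned) embedding weight before mutating it
    # outside of forward(); modifier_rank=0 lets rank 0 apply the change.
    with deepspeed.zero.GatheredParameters(model.get_input_embeddings().weight, modifier_rank=0):
        model.resize_token_embeddings(model.config.vocab_size + 2)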
Example no. 2
    def test_clm_from_config_zero3(self):
        # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called

        data_dir = self.tests_dir / "fixtures"
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"""
            --model_type gpt2
            --tokenizer_name sshleifer/tiny-gpt2
            --train_file {data_dir}/sample_text.txt
            --validation_file {data_dir}/sample_text.txt
            --output_dir {output_dir}
            --overwrite_output_dir
            --do_train
            --max_train_samples 4
            --per_device_train_batch_size 2
            --num_train_epochs 1
            --warmup_steps 8
            --block_size 8
            --fp16
            --report_to none
            """.split()

        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3.json".split(
        )
        script = [
            f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"
        ]
        launcher = self.get_launcher(distributed=True)

        cmd = launcher + script + args + ds_args
        # keep for quick debug
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
        with CaptureStderr() as cs:
            execute_subprocess_async(cmd, env=self.get_env())
        assert "Detected DeepSpeed ZeRO-3" in cs.err
Example no. 3
    def run_trainer(
        self,
        eval_steps: int,
        max_len: str,
        model_name: str,
        num_train_epochs: int,
        distributed: bool = True,
        extra_args_str: str = None,
        remove_args_str: str = None,
    ):
        data_dir = self.examples_dir / "test_data/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_val_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --val_max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate 3e-3
            --warmup_steps 8
            --predict_with_generate
            --logging_steps 0
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --adafactor
            --task translation
            --target_lang ro_RO
            --source_lang en_XX
        """.split()

        if extra_args_str is not None:
            args.extend(extra_args_str.split())

        if remove_args_str is not None:
            remove_args = remove_args_str.split()
            args = [x for x in args if x not in remove_args]

        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config.json".split(
        )
        script = [f"{self.examples_dir_str}/seq2seq/run_seq2seq.py"]
        num_gpus = get_gpu_count() if distributed else 1
        launcher = f"deepspeed --num_gpus {num_gpus}".split()

        cmd = launcher + script + args + ds_args
        # keep for quick debug
        # print(" ".join([f"PYTHONPATH={self.src_dir_str}"] +cmd)); die
        execute_subprocess_async(cmd, env=self.get_env())

        return output_dir
Example no. 4
    def _run_finetune(self, gpus: int, distributed_retriever: str = "pytorch"):
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)

        tmp_dir = self.get_auto_remove_tmp_dir()
        output_dir = os.path.join(tmp_dir, "output")
        data_dir = os.path.join(tmp_dir, "data")
        self._create_dummy_data(data_dir=data_dir)

        testargs = f"""
                --data_dir {data_dir} \
                --output_dir {output_dir} \
                --model_name_or_path facebook/rag-sequence-base \
                --model_type rag_sequence \
                --do_train \
                --do_predict \
                --n_val -1 \
                --val_check_interval 1.0 \
                --train_batch_size 2 \
                --eval_batch_size 1 \
                --max_source_length 25 \
                --max_target_length 25 \
                --val_max_target_length 25 \
                --test_max_target_length 25 \
                --label_smoothing 0.1 \
                --dropout 0.1 \
                --attention_dropout 0.1 \
                --weight_decay 0.001 \
                --adam_epsilon 1e-08 \
                --max_grad_norm 0.1 \
                --lr_scheduler polynomial \
                --learning_rate 3e-04 \
                --num_train_epochs 1 \
                --warmup_steps 4 \
                --gradient_accumulation_steps 1 \
                --distributed-port 8787 \
                --use_dummy_dataset 1 \
                --distributed_retriever {distributed_retriever} \
            """.split()

        if gpus > 0:
            testargs.append(f"--gpus={gpus}")
            if is_apex_available():
                testargs.append("--fp16")
        else:
            testargs.append("--gpus=0")
            testargs.append("--distributed_backend=ddp_cpu")
            testargs.append("--num_processes=2")

        cmd = [sys.executable,
               str(Path(finetune_rag.__file__).resolve())] + testargs
        execute_subprocess_async(cmd, env=self.get_env())

        metrics_save_path = os.path.join(output_dir, "metrics.json")
        with open(metrics_save_path) as f:
            result = json.load(f)
        return result
Example no. 5
    def run_trainer(self,
                    eval_steps: int,
                    max_len: str,
                    model_name: str,
                    num_train_epochs: int,
                    distributed: bool = False):
        data_dir = self.examples_dir / "seq2seq/test_data/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"""
            --model_name_or_path {model_name}
            --data_dir {data_dir}
            --output_dir {output_dir}
            --overwrite_output_dir
            --n_train 8
            --n_val 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --val_max_target_length {max_len}
            --do_train
            --do_eval
            --do_predict
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --per_device_eval_batch_size 4
            --learning_rate 3e-3
            --warmup_steps 8
            --evaluation_strategy steps
            --predict_with_generate
            --logging_steps 0
            --save_steps {str(eval_steps)}
            --eval_steps {str(eval_steps)}
            --sortish_sampler
            --label_smoothing 0.1
            --adafactor
            --task translation
            --tgt_lang ro_RO
            --src_lang en_XX
        """.split()
        # --eval_beams  2

        if distributed:
            n_gpu = get_gpu_count()
            distributed_args = f"""
                -m torch.distributed.launch
                --nproc_per_node={n_gpu}
                {self.test_file_dir}/finetune_trainer.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            execute_subprocess_async(cmd, env=self.get_env())
        else:
            testargs = ["finetune_trainer.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir
Example no. 6
    def _run_finetune(self, gpus: int, distributed_retriever: str = "pytorch"):
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)

        tmp_dir = self.get_auto_remove_tmp_dir()
        output_dir = os.path.join(tmp_dir, "output")
        data_dir = "/dccstor/dialog/sfeng/projects/transformers_dialdoc/data_v2/dd_generation_structure_two"
        # self._create_dummy_data(data_dir=data_dir)

        testargs = f"""
                --data_dir {data_dir} \
                --output_dir {output_dir} \
                --index_name custom \
                --index_path /dccstor/dialog/sfeng/projects/transformers_dialdoc/data_v2/dd_knowledge_dataset-token-dpr_new/my_knowledge_dataset_hnsw_index.faiss \
                --passages_path /dccstor/dialog/sfeng/projects/transformers_dialdoc/data_v2/dd_knowledge_dataset-token-dpr_new/my_knowledge_dataset \
                --model_name_or_path facebook/rag-token-base \
                --model_type rag_token \
                --do_train \
                --do_predict \
                --n_val -1 \
                --val_check_interval 1.0 \
                --train_batch_size 4 \
                --eval_batch_size 8 \
                --max_source_length 25 \
                --max_target_length 25 \
                --val_max_target_length 25 \
                --test_max_target_length 25 \
                --label_smoothing 0.1 \
                --dropout 0.1 \
                --attention_dropout 0.1 \
                --weight_decay 0.001 \
                --adam_epsilon 1e-08 \
                --max_grad_norm 0.1 \
                --lr_scheduler polynomial \
                --learning_rate 3e-04 \
                --num_train_epochs 1 \
                --warmup_steps 4 \
                --gradient_accumulation_steps 1 \
                --distributed-port 8787 \
                --use_dummy_dataset 1 \
                --distributed_retriever {distributed_retriever} \
            """.split()

        if gpus > 0:
            testargs.append(f"--gpus={gpus}")
            if is_apex_available():
                testargs.append("--fp16")
        else:
            testargs.append("--gpus=0")
            # testargs.append("--distributed_backend=ddp_cpu")
            testargs.append("--num_processes=1")

        cmd = [sys.executable,
               str(Path(finetune_rag.__file__).resolve())] + testargs
        execute_subprocess_async(cmd, env=self.get_env())
Example no. 7
    def test_trainer(self):
        distributed_args = f"""
            -m torch.distributed.launch
            --nproc_per_node={torch.cuda.device_count()}
            {self.test_file_dir}/test_trainer_distributed.py
        """.split()
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"--output_dir {output_dir}".split()
        cmd = [sys.executable] + distributed_args + args
        execute_subprocess_async(cmd, env=self.get_env())
Example no. 8
    def _test_distiller_cli_fork(self, updates, check_contents=True):
        default_updates = dict(
            label_smoothing=0.0,
            early_stopping_patience=-1,
            train_batch_size=1,
            eval_batch_size=2,
            max_epochs=2,
            alpha_mlm=0.2,
            alpha_ce=0.8,
            do_predict=True,
            model_name_or_path="sshleifer/tinier_bart",
            teacher=CHEAP_ARGS["model_name_or_path"],
            val_check_interval=0.5,
        )
        default_updates.update(updates)
        args_d: dict = CHEAP_ARGS.copy()
        tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
        output_dir = self.get_auto_remove_tmp_dir()
        args_d.update(data_dir=tmp_dir, output_dir=output_dir, **default_updates)

        def convert(k, v):
            if k in ["tgt_suffix", "server_ip", "server_port", "out", "n_tpu_cores"]:
                return ""
            if v is False or v is None:
                return ""
            if v is True:  # or len(str(v))==0:
                return f"--{k}"
            return f"--{k}={v}"

        cli_args = [x for x in (convert(k, v) for k, v in args_d.items()) if len(x)]
        cmd = [sys.executable, f"{self.test_file_dir}/distillation.py"] + cli_args
        execute_subprocess_async(cmd, env=self.get_env())

        contents = os.listdir(output_dir)
        contents = {os.path.basename(p) for p in contents}
        ckpt_files = [p for p in contents if p.endswith("ckpt")]
        assert len(ckpt_files) > 0

        self.assertIn("test_generations.txt", contents)
        self.assertIn("test_results.txt", contents)

        # get the following from the module, (we don't have access to `model` here)
        metrics_save_path = os.path.join(output_dir, "metrics.json")
        val_metric = "rouge2"

        metrics = load_json(metrics_save_path)
        # {'test': [{'test_avg_loss': 10.63731575012207, 'test_avg_rouge1': 0.0, 'test_avg_rouge2': 0.0, 'test_avg_rougeL': 0.0, 'test_avg_gen_time': 0.1822289228439331, 'test_avg_gen_len': 142.0, 'step_count': 1}]}
        print(metrics)
        last_step_stats = metrics["val"][-1]
        self.assertGreaterEqual(last_step_stats["val_avg_gen_time"], 0.01)
        self.assertIsInstance(last_step_stats[f"val_avg_{val_metric}"], float)
        self.assertEqual(len(metrics["test"]), 1)
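        # e.g. with the defaults above (max_epochs=2, val_check_interval=0.5):
        # int(2 * (1 / 0.5) / 2 + 1) == 3 expected validation log entries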
        desired_n_evals = int(args_d["max_epochs"] * (1 / args_d["val_check_interval"]) / 2 + 1)
        self.assertEqual(len(metrics["val"]), desired_n_evals)
Example no. 9
    def run_trainer(
        self,
        stage: str,
        model_name: str,
        eval_steps: int = 10,
        num_train_epochs: int = 1,
        distributed: bool = True,
        fp16: bool = True,
    ):
        output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False)
        args = f"""
            --model_name_or_path {model_name}
            --dataset_name patrickvonplaten/librispeech_asr_dummy
            --dataset_config_name clean
            --train_split_name validation
            --validation_split_name validation
            --output_dir {output_dir}
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 2
            --per_device_eval_batch_size 2
            --evaluation_strategy steps
            --learning_rate 5e-4
            --warmup_steps 8
            --orthography timit
            --preprocessing_num_workers 1
            --group_by_length
            --freeze_feature_extractor
            --report_to none
            --logging_steps 0
            --save_steps 0
            --eval_steps {eval_steps}
        """.split()

        if fp16:
            args.extend(["--fp16"])

        # currently ds_config_wav2vec2_zero.json requires "zero_optimization.find_unused_parameters": true,
        # hence the separate config files
        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_wav2vec2_{stage}.json".split(
        )
        script = [
            f"{self.examples_dir_str}/research_projects/wav2vec2/run_asr.py"
        ]
        launcher = self.get_launcher(distributed)

        cmd = launcher + script + args + ds_args
        # keep for quick debug
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
        execute_subprocess_async(cmd, env=self.get_env())

        return output_dir
Example no. 10
    def test_load_best_model(self, stage, dtype):
        # this test exercises --load_best_model_at_end - the key is being able to resume after some training

        data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"""
            --model_name_or_path {T5_TINY}
            --tokenizer_name {T5_TINY}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --source_lang en
            --target_lang ro
            --do_train
            --max_train_samples 3
            --do_eval
            --max_eval_samples 1
            --logging_strategy steps
            --logging_steps 1
            --evaluation_strategy steps
            --eval_steps 1
            --save_strategy steps
            --save_steps 1
            --load_best_model_at_end
            --per_device_train_batch_size 1
            --per_device_eval_batch_size 1
            --num_train_epochs 1
            --report_to none
            """.split()
        args.extend(["--source_prefix", "translate English to Romanian: "])

        args.extend([f"--{dtype}"])

        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split(
        )
        script = [
            f"{self.examples_dir_str}/pytorch/translation/run_translation.py"
        ]
        launcher = get_launcher(distributed=False)

        cmd = launcher + script + args + ds_args
        # keep for quick debug
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
        with CaptureStd() as cs:
            execute_subprocess_async(cmd, env=self.get_env())
        # enough to test it didn't fail
        self.assertIn("DeepSpeed info", cs.out)
Example no. 11
    def test_zero_to_fp32(self, stage, task):
        # testing the ability to do a run followed by recovery of full fp32 weights

        cmd, output_dir = self.get_task_cmd(task, stage)

        # 1. generate the checkpoint
        cmd += "--save_steps 1".split()
        # keep for quick debug
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] + cmd)); die
        execute_subprocess_async(cmd, env=self.get_env())

        # 2. test that the fp32 weights get reconsolidated
        chkpt_dir = f"{output_dir}/checkpoint-1"
        recovered_model_path = f"{chkpt_dir}/out.bin"
        cmd = f"{chkpt_dir}/zero_to_fp32.py {chkpt_dir} {recovered_model_path}"
        # keep for quick debug
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
        subprocess.check_call(cmd, shell=True)
        assert os.path.exists(
            recovered_model_path), f"{recovered_model_path} was not found"
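
The zero_to_fp32.py script invoked above also exists as an importable DeepSpeed helper, so the same consolidation can be done in-process (a sketch; chkpt_dir is the checkpoint directory from the run above):

    from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

    # Merge the partitioned ZeRO shards into a single fp32 state_dict in memory.
    state_dict = get_fp32_state_dict_from_zero_checkpoint(chkpt_dir)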
Example no. 12
    def test_distributed_eval(self):
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"""
            --model_name Helsinki-NLP/opus-mt-en-ro
            --save_dir {output_dir}
            --data_dir {self.test_file_dir_str}/test_data/wmt_en_ro
            --num_beams 2
            --task translation
        """.split()

        # we want this test to run even if there is only one GPU, but if there are more we use them all
        n_gpu = get_gpu_count()
        distributed_args = f"""
            -m torch.distributed.launch
            --nproc_per_node={n_gpu}
            {self.test_file_dir}/run_distributed_eval.py
        """.split()
        cmd = [sys.executable] + distributed_args + args
        execute_subprocess_async(cmd, env=self.get_env())

        metrics_save_path = os.path.join(output_dir, "test_bleu.json")
        metrics = load_json(metrics_save_path)
        # print(metrics)
        self.assertGreaterEqual(metrics["bleu"], 25)
Example no. 13
    def run_trainer(
        self,
        eval_steps: int,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        distributed: bool = False,
        extra_args_str: str = None,
        predict_with_generate: bool = True,
    ):
        data_dir = self.examples_dir / "test_data/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_val_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --val_max_target_length {max_len}
            --do_train
            --do_eval
            --do_predict
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --per_device_eval_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --evaluation_strategy steps
            --logging_steps 0
            --eval_steps {str(eval_steps)}
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --adafactor
            --target_lang ro_RO
            --source_lang en_XX
        """
        if predict_with_generate:
            args += "--predict_with_generate"

        args = args.split()

        if extra_args_str is not None:
            args.extend(extra_args_str.split())

        if distributed:
            n_gpu = get_gpu_count()
            distributed_args = f"""
                -m torch.distributed.launch
                --nproc_per_node={n_gpu}
                {self.examples_dir_str}/seq2seq/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            execute_subprocess_async(cmd, env=self.get_env())
        else:
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir
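
The non-distributed branch above runs the example script in-process by patching sys.argv instead of spawning a subprocess. A minimal self-contained demo of that pattern (demo_main is a hypothetical stand-in for the script's main()):

    import sys
    from unittest.mock import patch

    def demo_main():
        # stands in for an example script's main() that reads sys.argv
        return sys.argv[1:]

    with patch.object(sys, "argv", ["run_translation.py", "--do_train"]):
        assert demo_main() == ["--do_train"]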
Example no. 14
    def run_trainer(
        self,
        stage: str,
        model_name: str,
        eval_steps: int = 10,
        num_train_epochs: int = 1,
        do_train: bool = False,
        do_eval: bool = True,
        distributed: bool = True,
        fp16: bool = True,
        extra_args_str: str = None,
        remove_args_str: str = None,
    ):
        max_len = 32
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_source_length {max_len}
            --max_target_length {max_len}
            --val_max_target_length {max_len}
            --warmup_steps 8
            --predict_with_generate
            --logging_steps 0
            --save_steps 0
            --eval_steps {eval_steps}
            --group_by_length
            --label_smoothing_factor 0.1
            --source_lang en
            --target_lang ro
            --report_to none
        """.split()
        args.extend(["--source_prefix", '"translate English to Romanian: "'])

        if fp16:
            args.extend(["--fp16"])

        actions = 0
        if do_train:
            actions += 1
            args.extend(f"""
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --max_train_samples 16
            --per_device_train_batch_size 2
            --learning_rate 3e-3
            """.split())

        if do_eval:
            actions += 1
            args.extend("""
            --do_eval
            --max_eval_samples 16
            --per_device_eval_batch_size 2
            """.split())

        assert actions > 0, "need at least do_train or do_eval for the test to run"

        if extra_args_str is not None:
            args.extend(extra_args_str.split())

        # currently only works for bool args
        if remove_args_str is not None:
            remove_args = remove_args_str.split()
            args = [x for x in args if x not in remove_args]

        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split(
        )
        script = [
            f"{self.examples_dir_str}/pytorch/translation/run_translation.py"
        ]
        launcher = self.get_launcher(distributed)

        cmd = launcher + script + args + ds_args
        # keep for quick debug
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
        execute_subprocess_async(cmd, env=self.get_env())

        return output_dir
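
As the in-line comment notes, the remove_args_str filter only handles boolean flags: dropping a value-taking flag such as --eval_steps would leave its value orphaned in the argument list. A hypothetical value-aware variant (remove_cli_args is illustrative, not part of the test suite):

    def remove_cli_args(args, to_remove):
        out, skip_value = [], False
        for token in args:
            if skip_value and not token.startswith("--"):
                skip_value = False  # drop the orphaned value of a removed flag
                continue
            skip_value = token in to_remove
            if not skip_value:
                out.append(token)
        return out

    # remove_cli_args(["--eval_steps", "10", "--fp16"], {"--eval_steps"}) -> ["--fp16"]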
Example no. 15
    def run_trainer(
        self,
        eval_steps: int,
        max_len: int,
        model_name: str,
        num_train_epochs: int,
        learning_rate: float = 3e-3,
        distributed: bool = False,
        extra_args_str: str = None,
        predict_with_generate: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        do_predict: bool = True,
    ):
        data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args_train = f"""
            --model_name_or_path {model_name}
            --train_file {data_dir}/train.json
            --validation_file {data_dir}/val.json
            --test_file {data_dir}/test.json
            --output_dir {output_dir}
            --overwrite_output_dir
            --max_train_samples 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --do_train
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --learning_rate {learning_rate}
            --warmup_steps 8
            --logging_steps 0
            --logging_strategy no
            --save_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --adafactor
            --target_lang ro_RO
            --source_lang en_XX
        """

        args_eval = f"""
            --do_eval
            --per_device_eval_batch_size 4
            --max_eval_samples 8
            --val_max_target_length {max_len}
            --evaluation_strategy steps
            --eval_steps {str(eval_steps)}
        """

        args_predict = """
            --do_predict
        """

        args = ""
        if do_train:
            args += args_train

        if do_eval:
            args += args_eval

        if do_predict:
            args += args_predict

        if predict_with_generate:
            args += "--predict_with_generate"

        args = args.split()

        if extra_args_str is not None:
            args.extend(extra_args_str.split())

        if distributed:
            n_gpu = get_gpu_count()
            master_port = get_torch_dist_unique_port()
            distributed_args = f"""
                -m torch.distributed.launch
                --nproc_per_node={n_gpu}
                --master_port={master_port}
                {self.examples_dir_str}/pytorch/translation/run_translation.py
            """.split()
            cmd = [sys.executable] + distributed_args + args
            # keep for quick debug
            # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
            execute_subprocess_async(cmd, env=self.get_env())
        else:
            testargs = ["run_translation.py"] + args
            with patch.object(sys, "argv", testargs):
                main()

        return output_dir
Example no. 16
    def run_trainer(
        self,
        eval_steps: int,
        max_len: str,
        model_name: str,
        num_train_epochs: int,
        distributed: bool = False,
        extra_args_str: str = None,
        remove_args_str: str = None,
    ):
        data_dir = self.examples_dir / "seq2seq/test_data/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        args = f"""
            --model_name_or_path {model_name}
            --data_dir {data_dir}
            --output_dir {output_dir}
            --overwrite_output_dir
            --n_train 8
            --n_val 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --val_max_target_length {max_len}
            --do_train
            --do_eval
            --do_predict
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --per_device_eval_batch_size 4
            --learning_rate 3e-3
            --warmup_steps 8
            --evaluation_strategy steps
            --predict_with_generate
            --logging_steps 0
            --save_steps {str(eval_steps)}
            --eval_steps {str(eval_steps)}
            --group_by_length
            --label_smoothing_factor 0.1
            --adafactor
            --task translation
            --tgt_lang ro_RO
            --src_lang en_XX
        """.split()
        # --eval_beams  2

        if extra_args_str is not None:
            args.extend(extra_args_str.split())

        if remove_args_str is not None:
            remove_args = remove_args_str.split()
            args = [x for x in args if x not in remove_args]

        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config.json".split(
        )
        distributed_args = f"""
            {self.test_file_dir}/../../seq2seq/finetune_trainer.py
        """.split()
        cmd = ["deepspeed"] + distributed_args + args + ds_args
        # keep for quick debug
        # print(" ".join(cmd)); die
        execute_subprocess_async(cmd, env=self.get_env())

        return output_dir