import inspect
import os
import sys

import torch

# execute_subprocess_async is the subprocess runner provided by the repository's test utilities.
from accelerate.test_utils import execute_subprocess_async


def test_tpu(self):
    # Spawn the test file on 8 TPU cores through the xla_spawn.py launcher.
    distributed_args = f"""
        {self.test_dir}/xla_spawn.py
        --num_cores 8
        {self.test_file_path}
    """.split()
    cmd = [sys.executable] + distributed_args
    execute_subprocess_async(cmd, env=os.environ.copy())
def test_ddp_kwargs(self):
    # Relaunch this very test file under torch.distributed.launch, one process per available GPU.
    distributed_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={torch.cuda.device_count()}
        --use_env
        {inspect.getfile(self.__class__)}
    """.split()
    cmd = [sys.executable] + distributed_args
    execute_subprocess_async(cmd, env=os.environ.copy())
def test_multi_gpu(self):
    print(f"Found {torch.cuda.device_count()} devices.")
    # Run the test script under torch.distributed.launch with one process per available GPU.
    distributed_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={torch.cuda.device_count()}
        --use_env
        {self.test_file_path}
    """.split()
    cmd = [sys.executable] + distributed_args
    execute_subprocess_async(cmd, env=os.environ.copy())
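# For context: a minimal synchronous sketch of the behaviour these tests rely on from
# execute_subprocess_async, namely running the command with the given environment,
# exposing .returncode on the result, and failing loudly on a non-zero exit. The name,
# signature and error handling below are illustrative assumptions, not the library's
# actual implementation.
import subprocess


def _run_subprocess_sketch(cmd, env=None):
    # subprocess.run returns a CompletedProcess whose .returncode mirrors the child's exit status;
    # capture_output/text make stderr available for the error message on failure.
    result = subprocess.run(cmd, env=env, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Command {' '.join(cmd)} failed with code {result.returncode}:\n{result.stderr}")
    return result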
def test_command(args):
    # Locate the bundled test script relative to this file, then run it through accelerate-launch
    # with the config file under test.
    script_name = os.path.sep.join(__file__.split(os.path.sep)[:-2] + ["test_utils", "test_script.py"])
    test_args = f"""
        --config_file={args.config_file}
        {script_name}
    """.split()
    cmd = ["accelerate-launch"] + test_args
    result = execute_subprocess_async(cmd, env=os.environ.copy())
    if result.returncode == 0:
        print("Test is a success! You are ready for your distributed training!")
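# test_command expects an argparse-style namespace carrying config_file. A hypothetical,
# minimal way to wire it up as a standalone entry point (the parser below is an
# illustrative assumption, not the project's actual CLI definition):
import argparse


def main():
    parser = argparse.ArgumentParser(description="Verify the distributed setup by running the bundled test script.")
    parser.add_argument(
        "--config_file",
        default=None,
        help="Path of the config file to validate; forwarded to accelerate-launch.",
    )
    args = parser.parse_args()
    test_command(args)


if __name__ == "__main__":
    main()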