Example #1
import os
import sys

from transformers.testing_utils import execute_subprocess_async  # assumed home of this helper

def test_tpu(self):
    # Spawn the test file across 8 TPU cores via the xla_spawn.py helper script.
    distributed_args = f"""
        {self.test_dir}/xla_spawn.py
        --num_cores 8
        {self.test_file_path}
    """.split()
    cmd = [sys.executable] + distributed_args
    execute_subprocess_async(cmd, env=os.environ.copy())
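These snippets are methods on a test class that knows where the launcher script and the test file live. A minimal sketch of that scaffolding, reusing the attribute names from the example above (the class name and concrete paths are hypothetical placeholders):

import os
import unittest

class LaunchTest(unittest.TestCase):  # hypothetical class name
    def setUp(self):
        # Hypothetical locations: point these at your own checkout.
        self.test_dir = os.path.dirname(os.path.abspath(__file__))
        self.test_file_path = os.path.join(self.test_dir, "test_script.py")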
Example #2
import inspect
import os
import sys

import torch
from accelerate.test_utils import execute_subprocess_async  # assumed home of this helper

def test_ddp_kwargs(self):
    # Re-launch this very test file under torch.distributed.launch, one
    # process per GPU; --use_env passes LOCAL_RANK via the environment.
    distributed_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={torch.cuda.device_count()}
        --use_env
        {inspect.getfile(self.__class__)}
    """.split()
    cmd = [sys.executable] + distributed_args
    execute_subprocess_async(cmd, env=os.environ.copy())
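torch.distributed.launch and its --use_env flag are deprecated in recent PyTorch releases; torchrun (PyTorch 1.10+) behaves the same way, with environment-variable rank passing on by default. A sketch of the equivalent launch, with a hypothetical placeholder script name:

import os
import subprocess

import torch

# torchrun exports RANK/LOCAL_RANK/WORLD_SIZE via the environment by
# default, so no --use_env flag is needed.
cmd = [
    "torchrun",
    f"--nproc_per_node={torch.cuda.device_count()}",
    "your_test_file.py",  # hypothetical placeholder
]
subprocess.run(cmd, env=os.environ.copy(), check=True)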
Example #3
import os
import sys

import torch
from accelerate.test_utils import execute_subprocess_async  # assumed home of this helper

def test_multi_gpu(self):
    print(f"Found {torch.cuda.device_count()} devices.")
    # Launch the configured test file with one process per visible GPU.
    distributed_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={torch.cuda.device_count()}
        --use_env
        {self.test_file_path}
    """.split()
    cmd = [sys.executable] + distributed_args
    execute_subprocess_async(cmd, env=os.environ.copy())
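A launch like this only succeeds on a machine with at least two GPUs. One way to guard the test, sketched here with the standard library alone (Accelerate's test utilities also ship a require_multi_gpu decorator for the same purpose):

import unittest

import torch

# Skip rather than fail on single-GPU or CPU-only machines.
multi_gpu_only = unittest.skipUnless(
    torch.cuda.device_count() > 1, "test requires at least two GPUs"
)

Applied as @multi_gpu_only on test_multi_gpu, the test is skipped instead of crashing when fewer devices are found.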
Example #4
import os

from accelerate.test_utils import execute_subprocess_async

def test_command(args):
    # Locate the bundled test script at <package root>/test_utils/test_script.py,
    # two path components above this file.
    script_name = os.path.sep.join(
        __file__.split(os.path.sep)[:-2] + ["test_utils", "test_script.py"])

    test_args = f"""
        --config_file={args.config_file} {script_name}
    """.split()
    cmd = ["accelerate-launch"] + test_args
    result = execute_subprocess_async(cmd, env=os.environ.copy())
    if result.returncode == 0:
        print(
            "Test is a success! You are ready for your distributed training!")