def run_launcher():
    import scluster

    if args.aws:
        scluster.set_backend('aws')

    if args.nightly:
        # when running locally on MacOS, install the matching nightly wheel
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    worker = scluster.make_task(name=args.name,
                                install_script=install_script,
                                image_name=args.image)
    if not scluster.running_locally():
        worker._run_raw('killall python', ignore_errors=True)
    worker.upload(__file__)
    worker.upload('util.py')
    if args.xray:
        worker.run('export RAY_USE_XRAY=1')
    worker.run('ray stop')

    # single task acts as both head node and worker, so it gets both resources
    resources = """--resources='{"ps": 1, "worker": 1}'"""
    worker.run(f"ray start --head {resources} --redis-port=6379")
    # worker.run(f"ray start --redis-address={worker.ip}:6379 {resources}")
    worker.run(f'./{__file__} --role=driver --ip={worker.ip}:6379 '
               f'--size-mb={args.size_mb} --iters={args.iters}')
    print(worker.read('out'))
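# These launchers reference a module-level `args`; a minimal argparse sketch
# consistent with the flags used here (flag names come from the code above and
# below, but the defaults and help strings are assumptions):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--role', default='launcher', help='launcher or driver')
parser.add_argument('--name', default='ray_test', help='name of the run')
parser.add_argument('--image', default='', help='image/AMI to launch with')
parser.add_argument('--aws', action='store_true', help='run on AWS instead of locally')
parser.add_argument('--nightly', action='store_true', help='install a pinned Ray wheel instead of the PyPI release')
parser.add_argument('--xray', action='store_true', help='set RAY_USE_XRAY=1 on the workers')
parser.add_argument('--ip', default='', help='redis address of the head node (driver role only)')
parser.add_argument('--size-mb', default=100, type=int, help='payload size in MB')
parser.add_argument('--iters', default=11, type=int, help='number of iterations')
args = parser.parse_args()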
def run_launcher():
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    if args.nightly:
        # running locally on MacOS
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = ncluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            num_tasks=args.num_workers + args.num_ps)
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)
    job.upload(__file__)
    job.upload('util.py')
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    head = job.tasks[0]
    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    worker_resource = """--resources='{"worker": 1}'"""
    head.run(f"ray start --head {worker_resource} --redis-port=6379")
    for task in job.tasks[1:]:
        task.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")

    head.run(f'python {__file__} --role=driver --ip={head.ip}:6379 '
             f'--size-mb={args.size_mb} --iters={args.iters} '
             f'--num-workers={args.num_workers} --num-ps={args.num_ps}')
    print(head.read('out'))
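# For reference, a hedged sketch of what the --role=driver branch launched above
# might look like on the Ray side: connect to the head node's redis address and
# pin actors to the custom "worker" resource registered via `ray start`. The
# actor name and the body of `compute` are illustrative assumptions; only
# ray.init(redis_address=...) and @ray.remote(resources=...) come from the
# Ray 0.5-era API this script installs.
import ray

def run_driver():
    ray.init(redis_address=args.ip)

    @ray.remote(resources={'worker': 1})
    class Worker:
        def compute(self, size_mb):
            return size_mb  # placeholder for the actual benchmark payload

    workers = [Worker.remote() for _ in range(args.num_workers)]
    print(ray.get([w.compute.remote(args.size_mb) for w in workers]))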
def run_launcher():
    import scluster

    if args.aws:
        scluster.set_backend('aws')

    if args.nightly:
        # running locally on MacOS
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = scluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            num_tasks=2)
    ps, worker = job.tasks
    if not scluster.running_locally():
        ps._run_raw('killall python', ignore_errors=True)
        worker._run_raw('killall python', ignore_errors=True)

    job.upload(__file__)
    job.upload('util.py')
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""

    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(f'./{__file__} --role=driver --ip={ps.ip}:6379 '
               f'--size-mb={args.size_mb} --iters={args.iters}')
    print(worker.read('out'))
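# util.ossystem is used above to capture shell output ('uname', the conda
# prefix). A plausible implementation, assuming it mirrors os.system but
# returns stdout (the real util.py may differ):
import subprocess

def ossystem(cmd: str) -> str:
    """Runs `cmd` in a shell and returns its stdout, stripped of whitespace."""
    proc = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
    return proc.stdout.decode('utf-8').strip()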
os.environ['WANDB_MODE'] = 'dryrun'  # make all wandb.log calls no-ops
log.console("local-only wandb logging for run " + args.name)
wandb.init(project='imagenet18', name=args.name)
log.console("initializing logging to run " + args.name)
if hasattr(wandb, 'config') and wandb.config is not None:
    wandb.config['gpus'] = int(os.environ.get('WORLD_SIZE', 1))

try:
    config = util.text_unpickle(open(args.internal_config_fn).read())
except Exception as e:
    log.console(f"couldn't open wandb config file: {e}")
    config = {}
config['worker_conda'] = os.path.basename(
    util.ossystem('echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}'))
if hasattr(wandb, 'config') and wandb.config is not None:
    wandb.config.update(config)

util.log_environment()


def main():
    # os.system('sudo shutdown -c')  # cancel previous shutdown command
    log.console(args)
    tb.log('sizes/world', dist_utils.env_world_size())

    print(args.data)
    assert os.path.exists(args.data)

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/val')
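# util.text_pickle/text_unpickle round-trip the config dict through a text file
# (written by the launcher below, read back by the worker above). A minimal
# sketch assuming base64-encoded pickle; the actual encoding in util.py may differ:
import base64
import pickle

def text_pickle(obj) -> str:
    """Serializes obj into an ASCII-safe string."""
    return base64.b64encode(pickle.dumps(obj)).decode('ascii')

def text_unpickle(text: str):
    """Inverse of text_pickle."""
    return pickle.loads(base64.b64decode(text.encode('ascii')))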
def main():
    if args.image_name == "pytorch.imagenet.source.v7":
        supported_regions = ["us-west-2", "us-east-1", "us-east-2"]
        if ncluster.get_region() not in supported_regions:
            raise ValueError(
                f"required AMI {args.image_name} has only been made available "
                f"in regions {supported_regions}, but your current region "
                f"is {ncluster.get_region()} (set $AWS_DEFAULT_REGION)")

    if args.machines not in schedules:
        raise ValueError(
            f"{args.machines} machines not supported, supported machine counts "
            f"are {sorted(schedules.keys())}")

    if args.mount_imagenet:
        datadir = "/data/imagenet"
    else:
        datadir = "~/data/imagenet"
        os.environ["NCLUSTER_AWS_FAST_ROOTDISK"] = "1"  # use io2 disk on AWS

    if args.num_tasks >= 16:
        if not args.simple_ring_setup:
            raise ValueError("must use --simple_ring_setup, otherwise NCCL_RINGS "
                             "env var exceeds cmd-line limit")

    job = ncluster.make_job(
        name=args.name,
        run_name=args.run_name,
        num_tasks=args.machines,
        image_name=args.image_name,
        instance_type=args.instance_type,
        disk_size=500,
        spot=args.spot,
        skip_setup=args.skip_setup,
    )
    task0 = job.tasks[0]
    # _logdir = task0.logdir  # workaround for race condition in creating logdir

    config = {}
    for key in os.environ:
        if re.match(r"^NCLUSTER", key):
            config["env_" + key] = os.getenv(key)
    config.update(vars(args))

    CUDA_HOME = "/usr/local/cuda"
    EFA_HOME = "/opt/amazon/efa"
    MPI_HOME = EFA_HOME
    NPROC_PER_NODE = args.nproc_per_node
    if NPROC_PER_NODE > task0.num_gpus:
        raise ValueError(f"requested {NPROC_PER_NODE} processes, but only "
                         f"{task0.num_gpus} gpus present")
    NUM_GPUS = NPROC_PER_NODE * args.num_tasks

    config["NUM_GPUS"] = NUM_GPUS
    config["internal_id"] = u.get_account_number()
    config["internal_alias"] = u.get_account_name()
    config["region"] = u.get_region()
    config["zone"] = u.get_zone()
    config["launch_user"] = os.environ.get("USER", "")
    config["cmd"] = " ".join(sys.argv)
    config["launcher_conda"] = util.ossystem(
        'echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}')
    config["launcher_cmd"] = "python " + " ".join(sys.argv)
    config["logdir"] = job.logdir

    pickled_config = util.text_pickle(config)
    if args.log_all_workers:
        job.write(args.internal_config_fn, pickled_config)
    else:
        job.tasks[0].write(args.internal_config_fn, pickled_config)

    if args.mount_imagenet:
        if not u.get_zone():
            raise ValueError("Must specify zone when reusing EBS volumes")
        mount_imagenet(job)

    if not args.skip_setup:
        job.run("rm -f *.py")  # remove files baked into the imagenet18 release image
        job.run("conda init")  # missing .bashrc
        job.run(f"{{ source activate {args.conda_env} && "
                f"bash setup.sh && pip install -U protobuf ; }} && "
                f"{{ killall python || echo hi ; }}")
        if args.pytorch_nightly:
            job.run("conda install -y -c pytorch pytorch-nightly && bash setup.sh")
    else:
        job.run([
            f"source ~/.bashrc && conda activate {args.conda_env}",
            "killall python || echo hi",
        ])

    job.rsync(".")

    if args.efa:
        if "efa" not in args.image_name:
            raise ValueError("make sure we use an EFA-enabled image")
        unused_hosts_str, hosts_file_str = util.setup_mpi(
            job, skip_ssh_setup=args.skip_setup)
        if not args.skip_setup:
            task0.write(HOSTS_SLOTS_FN, hosts_file_str)

    env_params = get_nccl_params(args.machines, args.nproc_per_node)
    if args.cuda_debug:
        env_params += "CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=INFO "
    else:
        env_params += "NCCL_DEBUG=INFO "
    env_params += " OMP_NUM_THREADS=1 "
    if args.pytorch_use_spawn:
        if not args.pytorch_nightly:
            raise ValueError("--pytorch_use_spawn requires --pytorch_nightly")
        env_params += " PYTORCH_USE_SPAWN=1 "
    if "WANDB_API_KEY" in os.environ:
        env_params += f" WANDB_API_KEY={os.environ.get('WANDB_API_KEY')} "

    # Training script args
    default_params = [
        datadir,
"--fp16", "--logdir", job.logdir, "--name", f"{args.run_name}-{util.random_id()}", "--distributed", "--init-bn0", "--no-bn-wd", "--log_all_workers", args.log_all_workers, ] params = ["--phases", util.text_pickle(schedules[args.machines])] training_params = default_params + params training_params = " ".join(map(format_params, training_params)) if not args.efa: # TODO: simplify args processing, or give link to actual commands run for i, task in enumerate(job.tasks): dist_params = ( f"--nproc_per_node={args.nproc_per_node} --nnodes={args.machines} " f"--node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}" ) cmd = ( f"{env_params} python -m torch.distributed.launch {dist_params} " f"training/train_imagenet_nv.py {training_params}") task.run( f"echo {cmd} > {job.logdir}/task-{i}.cmd") # save command-line task.run(cmd, non_blocking=True) else: FI_PROVIDER = "efa" if args.pseudo_efa: FI_PROVIDER = "sockets" local_env = util.format_env_export( LOCAL_RANK="$OMPI_COMM_WORLD_LOCAL_RANK", RANK="$OMPI_COMM_WORLD_RANK", WORLD_SIZE="$OMPI_COMM_WORLD_SIZE", MASTER_ADDR=task0.ip, MASTER_PORT=6016, ) mpi_env = util.format_env_x( FI_PROVIDER= FI_PROVIDER, # Enables running nccl-tests using EFA provider. FI_OFI_RXR_RX_COPY_UNEXP= 1, # Disables using bounce buffers for unexpected messages. FI_OFI_RXR_RX_COPY_OOO= 1, # Disables using bounce buffers for out of order messages. FI_EFA_MR_CACHE_ENABLE=1, # Enables memory region caching. FI_OFI_RXR_INLINE_MR_ENABLE= 1, # Enables inline memory registration of data buffers. NCCL_TREE_THRESHOLD=10 * 4294967296, # force tree for everything under 40GB LD_LIBRARY_PATH= f"{CUDA_HOME}/lib:{CUDA_HOME}/lib64:{EFA_HOME}/lib64", NCCL_DEBUG="INFO", OMP_NUM_THREADS=1, WANDB_API_KEY=os.environ.get("WANDB_API_KEY", ""), PYTORCH_USE_SPAWN=args.pytorch_use_spawn, NO_WANDB=args.pytorch_use_spawn, ) if args.no_op: worker_script_fn = "training/env_test.py" else: worker_script_fn = "training/train_imagenet_nv.py" local_cmd = [ f"{local_env} && source ~/.bashrc && conda activate {args.conda_env} && ", f"python {worker_script_fn} {training_params} --local_rank=$OMPI_COMM_WORLD_LOCAL_RANK", ] local_cmd = " ".join(local_cmd) cmd = [ f"{MPI_HOME}/bin/mpirun -n {NUM_GPUS} -N {NPROC_PER_NODE} --hostfile {HOSTS_SLOTS_FN} ", f"{mpi_env} ", f"--mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 ", f"--bind-to none ", f"bash -c '{local_cmd}'", ] cmd = " ".join(cmd) task0.run(cmd, non_blocking=True) print(f"Logging to {job.logdir}")