def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  if args.nightly:
    # running locally on MacOS
    if 'Darwin' in util.ossystem('uname') and not args.aws:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
  else:
    install_script = 'pip install ray'
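  # note: the nightly URLs above are cp36 wheels pinned to Ray 0.5.2; the macOS wheel is used
  # only when launching locally from a Darwin machine, the manylinux wheel everywhere else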

  worker = scluster.make_task(name=args.name,
                              install_script=install_script,
                              image_name=args.image)
  if not scluster.running_locally():
    worker._run_raw('killall python', ignore_errors=True)
  worker.upload(__file__)
  worker.upload('util.py')
  if args.xray:
    worker.run('export RAY_USE_XRAY=1')
  worker.run('ray stop')

  resources = """--resources='{"ps": 1, "worker": 1}'"""
  worker.run(f"ray start --head {resources} --redis-port=6379")
  #  worker.run(f"ray start --redis-address={worker.ip}:6379 {resources}")
  worker.run(
    f'./{__file__} --role=driver --ip={worker.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
  print(worker.read('out'))
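
The run_launcher examples here reference a module-level `args` object and a `util` helper module that are not shown. A minimal sketch of the argument parser they appear to assume, with flag names taken from the attributes used in the code and purely placeholder defaults:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--name', type=str, default='ray_test')      # job/task name passed to make_task/make_job
parser.add_argument('--image', type=str, default='')             # image/AMI name for the backend
parser.add_argument('--aws', action='store_true')                # launch on the AWS backend
parser.add_argument('--nightly', action='store_true')            # install a pinned nightly Ray wheel
parser.add_argument('--xray', action='store_true')               # export RAY_USE_XRAY=1 on the tasks
parser.add_argument('--size-mb', type=int, default=100)          # payload size for the benchmark
parser.add_argument('--iters', type=int, default=10)             # number of benchmark iterations
parser.add_argument('--role', type=str, default='launcher')      # 'launcher' locally, 'driver' on the cluster
parser.add_argument('--ip', type=str, default='')                # redis address handed to the driver
args = parser.parse_args()

# Example #2 additionally reads args.num_workers and args.num_ps for its --num-workers/--num-ps flags.
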
Example #2
def run_launcher():
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    if args.nightly:
        # running locally on MacOS
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = ncluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            num_tasks=args.num_workers + args.num_ps)
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    job.upload(__file__)
    job.upload('util.py')
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    head = job.tasks[0]

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    worker_resource = """--resources='{"worker": 1}'"""
    head.run(f"ray start --head {worker_resource} --redis-port=6379")

    for task in job.tasks[1:]:
        task.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")

    head.run(
        f'python {__file__} --role=driver --ip={head.ip}:6379 --size-mb={args.size_mb} --iters={args.iters} --num-workers={args.num_workers} --num-ps={args.num_ps}'
    )

    print(head.read('out'))
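
The driver that these launchers re-invoke with --role=driver is not shown. A minimal sketch of the pattern it presumably follows, using the Ray 0.5-era redis_address argument and the custom "worker" resource declared above (function name and payload math are illustrative):

import ray
import numpy as np

ray.init(redis_address=args.ip)  # e.g. "<head ip>:6379", supplied by the launcher via --ip

@ray.remote(resources={"worker": 1})  # only schedulable on nodes advertising a "worker" resource
def make_payload(size_mb):
    # roughly size_mb megabytes of float32 data
    return np.zeros(size_mb * 250 * 1000, dtype=np.float32)

for _ in range(args.iters):
    ray.get(make_payload.remote(args.size_mb))  # round-trip the payload through the object store
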
Example #3
def run_launcher():
    import scluster

    if args.aws:
        scluster.set_backend('aws')

    if args.nightly:
        # running locally on MacOS
        print(f"asdfasdf {util.ossystem('uname')}")
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
            print(f"asdfasdf got install script {install_script}")
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = scluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            num_tasks=2)
    ps, worker = job.tasks
    if not scluster.running_locally():
        ps._run_raw('killall python', ignore_errors=True)
        worker._run_raw('killall python', ignore_errors=True)

    job.upload(__file__)
    job.upload('util.py')
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""

    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(
        f'./{__file__} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}'
    )
    print(worker.read('out'))
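
With the "ps"/"worker" split in Example #3, the driver can additionally pin a parameter-server actor to the head node and push payloads to it from the worker node. A rough sketch in the same spirit as the previous one (class and method names are illustrative):

@ray.remote(resources={"ps": 1})  # lands on the node started with --resources='{"ps": 1}'
class ParameterServer:
    def __init__(self, size_mb):
        self.params = np.zeros(size_mb * 250 * 1000, dtype=np.float32)

    def add(self, grad):
        self.params += grad

    def get(self):
        return self.params

ps = ParameterServer.remote(args.size_mb)
for _ in range(args.iters):
    grad = make_payload.remote(args.size_mb)  # computed on the "worker" node
    ray.get(ps.add.remote(grad))              # payload flows worker -> object store -> ps
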
Example #4
        os.environ['WANDB_MODE'] = 'dryrun'  # all wandb.log are no-op
        log.console("local-only wandb logging for run " + args.name)
    wandb.init(project='imagenet18', name=args.name)
    log.console("initializing logging to run " + args.name)

if hasattr(wandb, 'config') and wandb.config is not None:
    wandb.config['gpus'] = int(os.environ.get('WORLD_SIZE', 1))

try:
    config = util.text_unpickle(open(args.internal_config_fn).read())
except Exception as e:
    log.console(f"couldn't open wandb config file: {e}")
    config = {}

config['worker_conda'] = os.path.basename(
    util.ossystem('echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}'))
if hasattr(wandb, 'config') and wandb.config is not None:
    wandb.config.update(config)
util.log_environment()


def main():
    # os.system('sudo shutdown -c')  # cancel previous shutdown command
    log.console(args)
    tb.log('sizes/world', dist_utils.env_world_size())

    print(args.data)
    assert os.path.exists(args.data)

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/val')
Example #5
def main():
    if args.image_name == "pytorch.imagenet.source.v7":
        supported_regions = ["us-west-2", "us-east-1", "us-east-2"]
        if ncluster.get_region() not in supported_regions:
            raise ValueError(
                f"required AMI {args.image_name} has only been made available "
                f"in regions {supported_regions}, but your current region "
                f"is {ncluster.get_region()} (set $AWS_DEFAULT_REGION)")

        if args.machines not in schedules:
            raise ValueError(
                f"{args.machines} not supported, only support {schedules.keys()}"
            )

    if args.mount_imagenet:
        datadir = "/data/imagenet"
    else:
        datadir = "~/data/imagenet"
        os.environ["NCLUSTER_AWS_FAST_ROOTDISK"] = "1"  # use io2 disk on AWS

    if args.num_tasks >= 16:
        if not args.simple_ring_setup:
            raise ValueError(
                "must use --simple_ring_setup, otherwise NCCL_RINGS env var exceeds cmd-line limit"
            )

    job = ncluster.make_job(
        name=args.name,
        run_name=args.run_name,
        num_tasks=args.machines,
        image_name=args.image_name,
        instance_type=args.instance_type,
        disk_size=500,
        spot=args.spot,
        skip_setup=args.skip_setup,
    )

    task0 = job.tasks[0]
    # _logdir = task0.logdir  # workaround for race condition in creating logdir

    config = {}
    for key in os.environ:
        if re.match(r"^NCLUSTER", key):
            config["env_" + key] = os.getenv(key)
    config.update(vars(args))

    CUDA_HOME = f"/usr/local/cuda"
    EFA_HOME = f"/opt/amazon/efa"
    MPI_HOME = EFA_HOME
    NPROC_PER_NODE = args.nproc_per_node

    if NPROC_PER_NODE > task0.num_gpus:
        raise ValueError(
            f"requested {NPROC_PER_NODE} processes, but only {task0.num_gpus} gpus present"
        )

    NUM_GPUS = NPROC_PER_NODE * args.num_tasks

    config["NUM_GPUS"] = NUM_GPUS

    config["internal_id"] = u.get_account_number()
    config["internal_alias"] = u.get_account_name()
    config["region"] = u.get_region()
    config["zone"] = u.get_zone()
    config["launch_user"] = os.environ.get("USER", "")
    config["cmd"] = " ".join(sys.argv)
    config["launcher_conda"] = util.ossystem(
        'echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}')
    config["launcher_cmd"] = "python " + " ".join(sys.argv)
    config["logdir"] = job.logdir

    pickled_config = util.text_pickle(config)
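    # the training script reads this file back with util.text_unpickle (see the fragment in Example #4)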
    if args.log_all_workers:
        job.write(args.internal_config_fn, pickled_config)
    else:
        job.tasks[0].write(args.internal_config_fn, pickled_config)

    if args.mount_imagenet:
        if not u.get_zone():
            raise ValueError("Must specify zone when reusing EBS volumes")

        mount_imagenet(job)

    if not args.skip_setup:
        job.run(
            "rm -f *.py")  # remove files backed into imagenet18 release image
        job.run("conda init")  # missing .bashrc
        job.run(
            f"{{ source activate {args.conda_env} && "
            f"bash setup.sh && pip install -U protobuf ; }}  && {{ killall python || echo hi ; }} "
        )
        if args.pytorch_nightly:
            job.run(
                "conda install -y -c pytorch pytorch-nightly && bash setup.sh")
    else:
        job.run([
            f"source ~/.bashrc && conda activate {args.conda_env}",
            f"killall python || echo hi"
        ])

    job.rsync(".")

    if args.efa:
        if "efa" not in args.image_name:
            raise ValueError("make sure we use EFA-enabled image")

        unused_hosts_str, hosts_file_str = util.setup_mpi(
            job, skip_ssh_setup=args.skip_setup)
        if not args.skip_setup:
            task0.write(HOSTS_SLOTS_FN, hosts_file_str)

    env_params = get_nccl_params(args.machines, args.nproc_per_node)
    if args.cuda_debug:
        env_params += "CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=INFO "
    else:
        env_params += "NCCL_DEBUG=INFO "

    env_params += " OMP_NUM_THREADS=1 "
    if args.pytorch_use_spawn:
        if not args.pytorch_nightly:
            raise ValueError("--pytorch_use_spawn requires --pytorch_nightly")

        env_params += " PYTORCH_USE_SPAWN=1 "
    if "WANDB_API_KEY" in os.environ:
        env_params += f" WANDB_API_KEY={os.environ.get('WANDB_API_KEY')} "

    # Training script args
    default_params = [
        datadir,
        "--fp16",
        "--logdir",
        job.logdir,
        "--name",
        f"{args.run_name}-{util.random_id()}",
        "--distributed",
        "--init-bn0",
        "--no-bn-wd",
        "--log_all_workers",
        args.log_all_workers,
    ]

    params = ["--phases", util.text_pickle(schedules[args.machines])]
    training_params = default_params + params
    training_params = " ".join(map(format_params, training_params))

    if not args.efa:
        # TODO: simplify args processing, or give link to actual commands run
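        # one torch.distributed.launch invocation per machine: node_rank is the task index,
        # and every node rendezvouses with task 0 (master_addr) on port 6006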
        for i, task in enumerate(job.tasks):
            dist_params = (
                f"--nproc_per_node={args.nproc_per_node} --nnodes={args.machines} "
                f"--node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}"
            )
            cmd = (
                f"{env_params} python -m torch.distributed.launch {dist_params} "
                f"training/train_imagenet_nv.py {training_params}")
            task.run(
                f"echo {cmd} > {job.logdir}/task-{i}.cmd")  # save command-line
            task.run(cmd, non_blocking=True)
    else:
        FI_PROVIDER = "efa"
        if args.pseudo_efa:
            FI_PROVIDER = "sockets"

        local_env = util.format_env_export(
            LOCAL_RANK="$OMPI_COMM_WORLD_LOCAL_RANK",
            RANK="$OMPI_COMM_WORLD_RANK",
            WORLD_SIZE="$OMPI_COMM_WORLD_SIZE",
            MASTER_ADDR=task0.ip,
            MASTER_PORT=6016,
        )
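        # OMPI_COMM_WORLD_{RANK,LOCAL_RANK,SIZE} are set by Open MPI for every launched rank;
        # the export above maps them onto LOCAL_RANK/RANK/WORLD_SIZE plus the MASTER_ADDR/MASTER_PORT
        # rendezvous variables read by torch.distributed's env:// initialization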

        mpi_env = util.format_env_x(
            FI_PROVIDER=FI_PROVIDER,  # enables running nccl-tests using EFA provider
            FI_OFI_RXR_RX_COPY_UNEXP=1,  # disables using bounce buffers for unexpected messages
            FI_OFI_RXR_RX_COPY_OOO=1,  # disables using bounce buffers for out-of-order messages
            FI_EFA_MR_CACHE_ENABLE=1,  # enables memory region caching
            FI_OFI_RXR_INLINE_MR_ENABLE=1,  # enables inline memory registration of data buffers
            NCCL_TREE_THRESHOLD=10 * 4294967296,  # force tree algorithm for everything under 40 GB
            LD_LIBRARY_PATH=f"{CUDA_HOME}/lib:{CUDA_HOME}/lib64:{EFA_HOME}/lib64",
            NCCL_DEBUG="INFO",
            OMP_NUM_THREADS=1,
            WANDB_API_KEY=os.environ.get("WANDB_API_KEY", ""),
            PYTORCH_USE_SPAWN=args.pytorch_use_spawn,
            NO_WANDB=args.pytorch_use_spawn,
        )
        if args.no_op:
            worker_script_fn = "training/env_test.py"
        else:
            worker_script_fn = "training/train_imagenet_nv.py"

        local_cmd = [
            f"{local_env} && source ~/.bashrc && conda activate {args.conda_env} && ",
            f"python {worker_script_fn} {training_params} --local_rank=$OMPI_COMM_WORLD_LOCAL_RANK",
        ]
        local_cmd = " ".join(local_cmd)

        cmd = [
            f"{MPI_HOME}/bin/mpirun -n {NUM_GPUS} -N {NPROC_PER_NODE} --hostfile {HOSTS_SLOTS_FN} ",
            f"{mpi_env} ",
            f"--mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 ",
            f"--bind-to none ",
            f"bash -c '{local_cmd}'",
        ]
        cmd = " ".join(cmd)

        task0.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")