Example #1
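This example runs adept's local, single-machine training container. It optionally wraps the run in a pyinstrument profiler and, when evaluation is requested, hands the resulting log directory to the evaluation script.
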
def main(args):
    """
    Run local training.
    :param args: Dict[str, Any]
    :return:
    """
    args, log_id_dir, initial_step, logger = Init.main(MODE, args)
    R.save_extern_classes(log_id_dir)

    container = Local(args, logger, log_id_dir, initial_step)

    if args.profile:
        try:
            from pyinstrument import Profiler
        except ImportError as err:
            raise ImportError(
                'You must install pyinstrument to use profiling.') from err
        # Cap the run at 10k steps so the profile stays manageable.
        container.nb_step = 10_000
        profiler = Profiler()
        profiler.start()

    try:
        container.run()
    finally:
        if args.profile:
            profiler.stop()
            print(profiler.output_text(unicode=True, color=True))
        container.close()

    if args.eval:
        from adept.scripts.evaluate import main
        eval_args = {
            'log_id_dir': log_id_dir,
            'gpu_id': 0,
            'nb_episode': 30,
        }
        if args.custom_network:
            eval_args['custom_network'] = args.custom_network
        main(eval_args)
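
The profiling block above follows pyinstrument's start/stop API. A minimal
standalone sketch of the same pattern, with a stand-in workload (assumes
pyinstrument is installed):

from pyinstrument import Profiler

profiler = Profiler()
profiler.start()
total = sum(i * i for i in range(1_000_000))  # stand-in workload to profile
profiler.stop()
print(profiler.output_text(unicode=True, color=True))
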
Example #2
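This example launches multi-process distributed training. It computes the world size, exports the rendezvous variables (MASTER_ADDR, MASTER_PORT, WORLD_SIZE, RANK, LOCAL_RANK), and spawns one adept.scripts._distrib subprocess per local rank, forwarding resume and custom-network options where needed.
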
def main(args):
    """
    Run distributed training.
    :param args: Dict[str, Any]
    :return:
    """
    args = DotDict(args)

    dist_world_size = args.nb_proc * args.nb_node

    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    args, log_id_dir, initial_step, logger = Init.main(MODE, args)
    R.save_extern_classes(log_id_dir)

    processes = []

    for local_rank in range(args.nb_proc):
        # each process's rank
        dist_rank = args.nb_proc * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)
        current_env["LOCAL_RANK"] = str(local_rank)

        # spawn the processes
        if not args.resume:
            cmd = [
                sys.executable,
                "-u",
                "-m",
                "adept.scripts._distrib",
                "--log-id-dir={}".format(log_id_dir),
            ]
        else:
            cmd = [
                sys.executable,
                "-u",
                "-m",
                "adept.scripts._distrib",
                "--log-id-dir={}".format(log_id_dir),
                "--resume={}".format(True),
                "--load-network={}".format(args.load_network),
                "--load-optim={}".format(args.load_optim),
                "--initial-step-count={}".format(initial_step),
                "--init-method={}".format(args.init_method),
            ]
        if args.custom_network:
            cmd += ["--custom-network", args.custom_network]

        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)

    for process in processes:
        process.wait()

    if args.eval:
        from adept.scripts.evaluate import main

        eval_args = {
            "log_id_dir": log_id_dir,
            "gpu_id": 0,
            "nb_episode": 30,
        }
        if args.custom_network:
            eval_args["custom_network"] = args.custom_network
        main(eval_args)


if __name__ == "__main__":
    main(parse_args())
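
The launched adept.scripts._distrib module is not shown here. As a rough
sketch of how a spawned worker typically consumes the exported variables:
the environment names mirror the launcher above, but the torch.distributed
wiring is an assumption, not adept's actual code.

import os

import torch
import torch.distributed as dist

# MASTER_ADDR, MASTER_PORT, WORLD_SIZE and RANK are read from the
# environment that the launcher exported for this process.
dist.init_process_group(backend='nccl', init_method='env://')

# Pin this process to the GPU matching its local rank.
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)

print('rank {} of {} ready'.format(dist.get_rank(), dist.get_world_size()))
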
Example #3
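This example runs actor-learner training on Ray. A rank-0 main learner actor handles summary logging and weight saving, optional peer learners are wired to it over NCCL, workers are synchronized to the learner parameters, and the run/close lifecycle of all containers is driven through Ray futures.
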
def main(args):
    """
    Run actorlearner training.
    :param args: Dict[str, Any]
    :return:
    """
    args, log_id_dir, initial_step, logger = Init.main(MODE, args)
    R.save_extern_classes(log_id_dir)

    # start ray
    if args.ray_addr is not None:
        ray.init(address=args.ray_addr)
        logger.info('Using Ray on a cluster. Head node address: {}'.format(
            args.ray_addr))
    else:
        logger.info('Using Ray on a single machine.')
        ray.init()

    # create a main learner which logs summaries and saves weights
    main_learner_cls = ActorLearnerHost.as_remote(
        num_cpus=args.learner_cpu_alloc, num_gpus=args.learner_gpu_alloc)
    main_learner = main_learner_cls.remote(args,
                                           log_id_dir,
                                           initial_step,
                                           rank=0)

    # if multiple learners setup nccl
    if args.nb_learners > 1:
        # create N peer learners
        peer_learners = []
        for p_ind in range(args.nb_learners - 1):
            remote_cls = ActorLearnerHost.as_remote(
                num_cpus=args.learner_cpu_alloc,
                num_gpus=args.learner_gpu_alloc)
            # init
            remote = remote_cls.remote(args,
                                       log_id_dir,
                                       initial_step,
                                       rank=p_ind + 1)
            peer_learners.append(remote)

        # figure out main learner node ip
        nccl_addr, nccl_ip, nccl_port = ray.get(
            main_learner._rank0_nccl_port_init.remote())

        # setup all nccls
        nccl_inits = [
            main_learner._nccl_init.remote(nccl_addr, nccl_ip, nccl_port)
        ]
        nccl_inits.extend([
            p._nccl_init.remote(nccl_addr, nccl_ip, nccl_port)
            for p in peer_learners
        ])
        # wait for all
        ray.get(nccl_inits)
        logger.info('NCCL initialized')

        # have all learners sync parameters (each actor processes its task
        # queue in order, so this completes before that learner's run task)
        for peer in peer_learners:
            peer._sync_peer_parameters.remote()
        main_learner._sync_peer_parameters.remote()
    # else just 1 learner
    else:
        peer_learners = []

    # create workers
    workers = [
        ActorLearnerWorker.as_remote(num_cpus=args.worker_cpu_alloc,
                                     num_gpus=args.worker_gpu_alloc).remote(
                                         args, log_id_dir, initial_step, w_ind)
        for w_ind in range(args.nb_workers)
    ]

    # synchronize worker variables
    ray.get(
        main_learner.synchronize_worker_parameters.remote(workers,
                                                          initial_step,
                                                          blocking=True))

    try:
        # start the run method of all containers
        runs = [main_learner.run.remote(workers, args.profile)]
        runs.extend([f.run.remote(workers) for f in peer_learners])
        # block until every learner has finished; ray.wait defaults to
        # returning after the first ready result
        ray.wait(runs, num_returns=len(runs))
    finally:
        closes = [main_learner.close.remote()]
        closes.extend([f.close.remote() for f in peer_learners])
        ray.wait(closes, num_returns=len(closes))

    if args.eval:
        from adept.scripts.evaluate import main
        eval_args = {
            'log_id_dir': log_id_dir,
            'gpu_id': 0,
            'nb_episode': 30,
        }
        if args.custom_network:
            eval_args['custom_network'] = args.custom_network
        main(eval_args)
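
The as_remote helper used throughout is defined elsewhere in adept. A
plausible implementation, sketched here as an assumption based on Ray's
standard resource-wrapping pattern:

import ray


class ActorLearnerHost:
    """Sketch only; the real class lives in adept."""

    @classmethod
    def as_remote(cls, num_cpus=None, num_gpus=None):
        # Wrap the class as a Ray actor with the requested resource
        # reservation; callers then construct instances via .remote(...).
        return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)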