def main(args):
    """
    Run local training.

    :param args: Dict[str, Any]
    :return:
    """
    args, log_id_dir, initial_step, logger = Init.main(MODE, args)
    R.save_extern_classes(log_id_dir)

    container = Local(args, logger, log_id_dir, initial_step)

    if args.profile:
        try:
            from pyinstrument import Profiler
        except ImportError:
            raise ImportError(
                'You must install pyinstrument to use profiling.')
        # profile only a short run
        container.nb_step = 10e3
        profiler = Profiler()
        profiler.start()

    try:
        container.run()
    finally:
        if args.profile:
            profiler.stop()
            print(profiler.output_text(unicode=True, color=True))
        container.close()

    if args.eval:
        from adept.scripts.evaluate import main
        eval_args = {
            'log_id_dir': log_id_dir,
            'gpu_id': 0,
            'nb_episode': 30,
        }
        if args.custom_network:
            eval_args['custom_network'] = args.custom_network
        main(eval_args)
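# Hedged aside: a minimal standalone use of pyinstrument's Profiler, matching
# the start()/stop()/output_text() calls used above. The profiled workload here
# is a stand-in for container.run(); illustrative only.
from pyinstrument import Profiler

profiler = Profiler()
profiler.start()
sum(i * i for i in range(10_000))  # stand-in for the real training loop
profiler.stop()
print(profiler.output_text(unicode=True, color=True))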
def main(args):
    """
    Run distributed training.

    :param args: Dict[str, Any]
    :return:
    """
    args = DotDict(args)

    dist_world_size = args.nb_proc * args.nb_node

    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    args, log_id_dir, initial_step, logger = Init.main(MODE, args)
    R.save_extern_classes(log_id_dir)

    processes = []
    for local_rank in range(0, args.nb_proc):
        # each process's rank
        dist_rank = args.nb_proc * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)
        current_env["LOCAL_RANK"] = str(local_rank)

        # spawn the processes
        if not args.resume:
            cmd = [
                sys.executable,
                "-u",
                "-m",
                "adept.scripts._distrib",
                "--log-id-dir={}".format(log_id_dir),
            ]
        else:
            cmd = [
                sys.executable,
                "-u",
                "-m",
                "adept.scripts._distrib",
                "--log-id-dir={}".format(log_id_dir),
                "--resume={}".format(True),
                "--load-network={}".format(args.load_network),
                "--load-optim={}".format(args.load_optim),
                "--initial-step-count={}".format(initial_step),
                "--init-method={}".format(args.init_method),
            ]
        if args.custom_network:
            cmd += ["--custom-network", args.custom_network]

        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)

    for process in processes:
        process.wait()

    if args.eval:
        from adept.scripts.evaluate import main
        eval_args = {
            "log_id_dir": log_id_dir,
            "gpu_id": 0,
            "nb_episode": 30,
        }
        if args.custom_network:
            eval_args["custom_network"] = args.custom_network
        main(eval_args)
"--resume={}".format(True), "--load-network={}".format(args.load_network), "--load-optim={}".format(args.load_optim), "--initial-step-count={}".format(initial_step), "--init-method={}".format(args.init_method), ] if args.custom_network: cmd += ["--custom-network", args.custom_network] process = subprocess.Popen(cmd, env=current_env) processes.append(process) for process in processes: process.wait() if args.eval: from adept.scripts.evaluate import main eval_args = { "log_id_dir": log_id_dir, "gpu_id": 0, "nb_episode": 30, } if args.custom_network: eval_args["custom_network"] = args.custom_network main(eval_args) if __name__ == "__main__": main(parse_args())
def main(args):
    """
    Run actorlearner training.

    :param args: Dict[str, Any]
    :return:
    """
    args, log_id_dir, initial_step, logger = Init.main(MODE, args)
    R.save_extern_classes(log_id_dir)

    # start ray
    if args.ray_addr is not None:
        ray.init(address=args.ray_addr)
        logger.info('Using Ray on a cluster. Head node address: {}'.format(
            args.ray_addr))
    else:
        logger.info('Using Ray on a single machine.')
        ray.init()

    # create a main learner which logs summaries and saves weights
    main_learner_cls = ActorLearnerHost.as_remote(
        num_cpus=args.learner_cpu_alloc,
        num_gpus=args.learner_gpu_alloc)
    main_learner = main_learner_cls.remote(
        args, log_id_dir, initial_step, rank=0)

    # if there are multiple learners, set up NCCL
    if args.nb_learners > 1:
        # create N peer learners
        peer_learners = []
        for p_ind in range(args.nb_learners - 1):
            remote_cls = ActorLearnerHost.as_remote(
                num_cpus=args.learner_cpu_alloc,
                num_gpus=args.learner_gpu_alloc)
            # init
            remote = remote_cls.remote(
                args, log_id_dir, initial_step, rank=p_ind + 1)
            peer_learners.append(remote)

        # figure out main learner node ip
        nccl_addr, nccl_ip, nccl_port = ray.get(
            main_learner._rank0_nccl_port_init.remote())

        # setup all nccls
        nccl_inits = [
            main_learner._nccl_init.remote(nccl_addr, nccl_ip, nccl_port)
        ]
        nccl_inits.extend([
            p._nccl_init.remote(nccl_addr, nccl_ip, nccl_port)
            for p in peer_learners
        ])
        # wait for all
        ray.get(nccl_inits)
        logger.info('NCCL initialized')

        # have all sync parameters
        [f._sync_peer_parameters.remote() for f in peer_learners]
        main_learner._sync_peer_parameters.remote()
    # else just 1 learner
    else:
        peer_learners = []

    # create workers
    workers = [
        ActorLearnerWorker.as_remote(
            num_cpus=args.worker_cpu_alloc,
            num_gpus=args.worker_gpu_alloc
        ).remote(args, log_id_dir, initial_step, w_ind)
        for w_ind in range(args.nb_workers)
    ]

    # synchronize worker variables
    ray.get(main_learner.synchronize_worker_parameters.remote(
        workers, initial_step, blocking=True))

    try:
        # startup the run method of all containers
        runs = [main_learner.run.remote(workers, args.profile)]
        runs.extend([f.run.remote(workers) for f in peer_learners])
        done_training = ray.wait(runs)
    finally:
        closes = [main_learner.close.remote()]
        closes.extend([f.close.remote() for f in peer_learners])
        done_closing = ray.wait(closes)

    if args.eval:
        from adept.scripts.evaluate import main
        eval_args = {
            'log_id_dir': log_id_dir,
            'gpu_id': 0,
            'nb_episode': 30,
        }
        if args.custom_network:
            eval_args['custom_network'] = args.custom_network
        main(eval_args)
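# Hedged aside: the as_remote(...) helpers above are assumed to wrap Ray's
# actor API with per-actor resource requests. Below is a minimal standalone
# sketch of that pattern; Counter, step, and demo are hypothetical names used
# only for illustration.
import ray


class Counter:
    """Hypothetical actor used only for this sketch."""

    def __init__(self):
        self.n = 0

    def step(self):
        self.n += 1
        return self.n


def demo():
    ray.init()
    # request resources per actor, analogous to learner/worker cpu/gpu allocs
    RemoteCounter = ray.remote(num_cpus=1, num_gpus=0)(Counter)
    counters = [RemoteCounter.remote() for _ in range(4)]
    # block until every result is ready
    results = ray.get([c.step.remote() for c in counters])
    # ray.wait returns as soon as the first result is ready
    ready, not_ready = ray.wait([c.step.remote() for c in counters])
    return results, ready, not_ready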