def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)

    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps /= 4  # because we're wrapping the envs to do frame skip
    env.seed(workerseed)
    trpo_mpi.learn(env, policy_fn,
                   timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
                   max_timesteps=num_timesteps, gamma=0.98, lam=1.0,
                   vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
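# All of the train() functions in this file rely on the same fork-then-train
# convention: mpi_fork() re-launches the current script under mpirun, the
# original process gets back "parent" and returns immediately, and each MPI
# worker falls through into the training code. A minimal sketch of such a
# helper (modeled on baselines.common.mpi_fork; the IN_MPI guard variable and
# the thread-count environment settings are assumptions here):
def mpi_fork_sketch(n):
    import os
    import subprocess
    import sys
    if n <= 1:
        return "child"
    if os.getenv("IN_MPI") is None:
        env = os.environ.copy()
        # limit BLAS threading in the workers and mark them as already forked
        env.update(MKL_NUM_THREADS="1", OMP_NUM_THREADS="1", IN_MPI="1")
        subprocess.check_call(["mpirun", "-np", str(n), sys.executable] + sys.argv, env=env)
        return "parent"
    return "child"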
def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.pposgd import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps /= 4  # because we're wrapping the envs to do frame skip
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear')
    env.close()
def train(env_id, num_timesteps, timesteps_per_batch, seed, num_cpu, resume,
          agentName, logdir, hid_size, num_hid_layers, clip_param, entcoeff,
          optim_epochs, optim_stepsize, optim_batchsize, gamma, lam, portnum,
          max_to_keep):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    utils.portnum = portnum + rank
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(seed)
    if logger.get_dir():
        env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=hid_size, num_hid_layers=num_hid_layers)

    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=timesteps_per_batch,
                        clip_param=clip_param, entcoeff=entcoeff,
                        optim_epochs=optim_epochs, optim_stepsize=optim_stepsize,
                        optim_batchsize=optim_batchsize,
                        gamma=gamma, lam=lam,
                        resume=resume, agentName=agentName, logdir=logdir,
                        max_to_keep=max_to_keep)
    env.close()
def train(env_id, num_timesteps, seed, model_path, load_model, timesteps_per_batch,
          hidden_units, hidden_layers, trainmodel, ACTION, EMBEDDING, MODEL, LOGGING):
    # NOTE: num_cpu and the TRPO hyperparameters (max_kl, cg_iters, cd_damping,
    # gamma, lam, vf_iters, vf_stepsize) are not parameters of this function;
    # they are assumed to be defined at module level.
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    workerseed = 2221438774  # fixed seed; the per-rank derivation above is disabled
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env = wrappers.ConfigWrapper(env, ACTION, EMBEDDING, MODEL, LOGGING)

    def policy_fn(name, ob_space, ac_space):
        return LSTMPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                          hid_size=hidden_units, num_hid_layers=hidden_layers)

    env.seed(workerseed)
    trpo_indi.learn(env, policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    max_kl=max_kl, cg_iters=cg_iters, cg_damping=cd_damping,
                    max_episodes=num_timesteps,
                    gamma=gamma, lam=lam,
                    vf_iters=vf_iters, vf_stepsize=vf_stepsize,
                    load_model=load_model, model_path=model_path,
                    trainmodel=trainmodel)
    env.close()
def train(env_id, num_timesteps, seed):
    # NOTE: num_cpu is not a parameter here; it is assumed to be defined at module level.
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)

    logger.configure()
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    trpo_mpi_modified.learn(env, policy_fn,
                            timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                            cg_damping=0.1, max_timesteps=num_timesteps,
                            gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3,
                            alpha=100)
    env.close()
def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.pposgd import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps /= 4  # because we're wrapping the envs to do frame skip
    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear')
    env.close()
def train(env_id, num_timesteps, seed, model_name, model_path, para, load_model,
          timesteps_per_batch, hidden_units, hidden_layers):
    # NOTE: num_cpu and the TRPO hyperparameters (max_kl, cg_iters, cd_damping,
    # gamma, lam, vf_iters, vf_stepsize) are not parameters of this function;
    # they are assumed to be defined at module level.
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    logger.session().__enter__()
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env = SubsetWrapper(env, para)
    # env = gym_kidney.LogWrapper(env, NN, EXP, OUT, FREQ, PARAM)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                         hid_size=hidden_units, num_hid_layers=hidden_layers)

    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    # env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    trpo_indi.learn(env, policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    max_kl=max_kl, cg_iters=cg_iters, cg_damping=cd_damping,
                    max_episodes=num_timesteps,
                    gamma=gamma, lam=lam,
                    vf_iters=vf_iters, vf_stepsize=vf_stepsize,
                    load_model=load_model, model_path=model_path)
    env.close()
def main():
    parser = argparse.ArgumentParser(description='Train PPO on the Walker environment.')
    parser.add_argument('-n', '--exp_name', default='tmp')
    parser.add_argument('-r', '--render', action='store_true')
    parser.add_argument('-c', '--num_cpu', default=1, type=int)
    parser.add_argument('--resdir', default='results')
    parser.add_argument('--max_timesteps', default=int(1e9), type=int)
    parser.add_argument('--seed', default=123, type=int)
    parser.add_argument('--force_override', action='store_true')
    parser.add_argument('--timesteps_per_batch', default=2048, type=int)
    parser.add_argument('--clip_param', default=0.2, type=float)
    parser.add_argument('--optim_epochs', default=10, type=int)
    parser.add_argument('--optim_stepsize', default=3e-4, type=float)
    parser.add_argument('--optim_batchsize', default=64, type=int)
    parser.add_argument('--entcoeff', default=0., type=float)
    parser.add_argument('--gamma', default=0.99, type=float)
    parser.add_argument('--lam', default=0.95, type=float)
    parser.add_argument('--hid_size', default=64, type=int)
    parser.add_argument('--num_hid_layers', default=2, type=int)
    parser.add_argument('--shaping', default=None, type=str)
    parser.add_argument('--save_every', default=20, type=int)
    parser.add_argument('--diff', default=0, type=int)
    parser.add_argument('--relative_x', action='store_true', help='DEPRECATED')
    parser.add_argument('--transform_inputs', type=str, default=None)
    parser.add_argument('--bound_by_sigmoid', action='store_true')
    parser.add_argument('--sigmoid_coef', default=1., type=float)
    parser.add_argument('--noobsthack', action='store_true')
    parser.add_argument('--nogaussian_fixed_var', action='store_true')
    parser.add_argument('--activation', default='tanh', type=str)
    parser.add_argument('--nonormalize_obs', action='store_true')
    parser.add_argument('--nostochastic', action='store_true')
    parser.add_argument('--nostochastic2', action='store_true')
    parser.add_argument('--load_model', default=None, type=str)
    parser.add_argument('--test_only', action='store_true')
    parser.add_argument('--evaluate', action='store_true')
    parser.add_argument('--n_eval_episodes', default=10000, type=int)
    parser.add_argument('--submit', action='store_true')
    parser.add_argument('--max_env_steps', default=1000, type=int)
    parser.add_argument('--run_logs_dir', default=None, type=str)
    parser.add_argument('--avg_norm_symmetry', action='store_true')
    parser.add_argument('--symmetric_interpretation', action='store_true')
    parser.add_argument('--stdclip', default=5.0, type=float)
    parser.add_argument('--memory_size', default=1, type=int)
    parser.add_argument('--swap_legs_mode', default=None, type=str)
    parser.add_argument('--filter_obs', action='store_true')
    parser.add_argument('--actions', default='gaussian', type=str)
    parser.add_argument('--binary_actions', action='store_true', help='deprecated')
    parser.add_argument('--beta_dist', action='store_true', help='deprecated')
    parser.add_argument('--gaussian_bias', action='store_true')
    parser.add_argument('--muscles', action='store_true')
    parser.add_argument('--repeats', default=1, type=int)
    parser.add_argument('--add_time', action='store_true')
    parser.add_argument('--simwalker', action='store_true')
    parser.add_argument('--log_walker', action='store_true')
    parser.add_argument('--log_simwalker', action='store_true')
    parser.add_argument('--symmetric_training', action='store_true')
    parser.add_argument('--step_timeout', default=None, type=float)
    parser.add_argument('--gaussian_from_binary', action='store_true')
    parser.add_argument('--pv', dest='parallel_value', action='store_true')
    parser.add_argument('--pv_layers', default=2, type=int)
    parser.add_argument('--pv_hid_size', default=512, type=int)
    parser.add_argument('--horizon_hack', action='store_true')
    parser.add_argument('--single_episode', action='store_true')
    parser.add_argument('--n_obstacles', default=3, type=int)
    parser.add_argument('--nologs', action='store_true')
    parser.add_argument('--init_three', action='store_true')
    parser.add_argument('--three', action='store_true')
    parser.add_argument('--pause', action='store_true')
    parser.add_argument('--nobind', action='store_true')
    parser.add_argument('--running_avg_len', default=100, type=int)
    parser.add_argument('--submit_token', default=None, type=str)
    parser.add_argument('--fall_penalty', action='store_true')
    parser.add_argument('--fall_penalty_val', default=2., type=float)
    parser.add_argument('--higher_pelvis', default=0.65, type=float)
    parser.add_argument('--print_action', action='store_true')
    parser.add_argument('--new8_fix', action='store_true')
    parser.add_argument('--symmetric_training_trick', action='store_true')
    parser.add_argument('--submit_round2', action='store_true')
    parser.add_argument('--noisy_obstacles', action='store_true')
    parser.add_argument('--noisy_obstacles2', action='store_true')
    parser.add_argument('--execute_just', default=None, type=int)
    parser.add_argument('--seeds_fn', default=None, type=str)
    parser.add_argument('--bootstrap_seeds', action='store_true')
    parser.add_argument('--noisy_fix', action='store_true')
    args = parser.parse_args()

    if args.transform_inputs in ['new_5', 'new_6', 'new_7', 'new_8', 'new_9', 'new_a', 'new_8b']:
        args.filter_obs = True
    if args.binary_actions:
        logger.warn('Deprecated option')
        args.actions = 'binary'
    if args.beta_dist:
        logger.warn('Deprecated option')
        args.actions = 'beta'
    if args.relative_x:
        assert args.transform_inputs is None
        args.transform_inputs = 'relative_x'
    if args.transform_inputs == 'new_4':
        logger.warn("Overriding the memory size to 3")
        args.memory_size = 3
    if args.submit:
        assert args.load_model
        args.evaluate = True
    if args.submit_round2:
        assert args.load_model
        args.evaluate = True
        args.n_eval_episodes = 100000
        args.log_simwalker = False
        args.log_walker = False
        args.nobind = True
        args.num_cpu = 1
        args.nologs = True
    if args.render:
        args.num_cpu = 1

    # Create exp dir
    env_name = f'Walker_d{args.diff}'
    if args.max_env_steps is not None and args.max_env_steps != 1000:
        env_name += f'_{args.max_env_steps:03d}'
    if args.n_obstacles != 3:
        env_name += f'_o{args.n_obstacles:02d}'
    env_name += '-v0'
    args.exp_path = path.join(args.resdir, env_name, 'PPOOAI', args.exp_name, str(args.seed))
    if args.run_logs_dir is None and not args.test_only and not args.evaluate:
        args.run_logs_dir = path.join(args.exp_path, 'run_logs')
    if args.nologs:
        args.run_logs_dir = None

    whoami = mpi_fork(args.num_cpu, not args.nobind)
    if whoami == 'parent':
        return

    if MPI.COMM_WORLD.Get_rank() == 0:
        if not args.test_only and not args.evaluate:
            prepare_env(args)
    else:
        time.sleep(0.5)  # Just in case
    train(args)
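# Worked example of the env_name construction above: with --diff 0,
# --max_env_steps 500 and --n_obstacles 5, env_name becomes
# 'Walker_d0_500_o05-v0'; with the defaults (1000 steps, 3 obstacles) it
# stays 'Walker_d0-v0'.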
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation,
        bind_to_core, **kwargs):
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Write to temp directory for all non-master workers.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    # Create envs.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic,
                   memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
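# The noise_type argument above is a comma-separated list of 'name_stddev'
# specs, e.g. 'adaptive-param_0.2,ou_0.3'. A self-contained sketch of that
# grammar (using rsplit so that names containing underscores would also work,
# a small variation on the split('_') used above):
def parse_noise_spec(noise_type):
    specs = []
    for tok in noise_type.split(','):
        tok = tok.strip()
        if tok == 'none':
            continue
        name, stddev = tok.rsplit('_', 1)
        specs.append((name, float(stddev)))
    return specs

assert parse_noise_spec('adaptive-param_0.2, ou_0.3') == [('adaptive-param', 0.2), ('ou', 0.3)]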
def main(args):
    arg_parser = init_arg_parser()
    args, unknown_args = arg_parser.parse_known_args(args)

    if args.allow_run_as_root:
        whoami = mpi_fork_run_as_root(args.num_cpu, bind_to_core=args.bind_to_core)
    else:
        whoami = mpi_fork(args.num_cpu, bind_to_core=args.bind_to_core)
    if whoami == 'parent':
        print('parent exiting with code 0...')
        sys.exit(0)

    U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # configure logger
    # FIXME: how to log when rank != 0?
    configure_logger(args.log_path, format_strs=[])
    logger.info(f"main: {rank} / {MPI.COMM_WORLD.Get_size()}")
    logger.info(f"logger dir: {logger.get_dir()}")
    extra_args = parse_cmdline_kwargs(unknown_args)
    logger.info(args, extra_args)

    def make_wrapped_env():
        env = gym.make(args.env)
        if args.env_type == 'maze':
            pass
        elif args.env_type == 'robotics':
            from baselines.envs.goal_sampler_env_wrapper import GoalSamplerEnvWrapper
            env = GoalSamplerEnvWrapper(env)
        elif args.env_type == 'ant':
            env = GoalExplorationEnv(env=env, only_feasible=True, extend_dist_rew=0,
                                     inner_weight=0, goal_weight=1)
        else:
            raise NotImplementedError(args.env_type)
        # FIXME: if the resample space is feasible, only_feasible can be set to
        # False to avoid unnecessary computation
        return env

    venv_kwargs = dict(
        make_wrapped_env=make_wrapped_env,
        seed=args.seed,
        reward_scale=args.reward_scale,
        flatten_dict_observations=False,
        mpi_rank=rank,
        monitor_log_dir=args.log_path,  # FIXME
    )
    venv = make_vec_env(num_env=args.num_env, **venv_kwargs)
    eval_venv = make_vec_env(num_env=args.num_env, **venv_kwargs)
    if args.debug:
        plotter_venv = make_vec_env(num_env=1, **venv_kwargs)
    else:
        plotter_venv = None

    # Seed everything.
    rank_seed = args.seed + 1000000 * rank if args.seed is not None else None
    set_global_seeds(rank_seed)
    logger.info(f'setting global rank: {rank_seed}')

    # Prepare params.
    params = dict()
    params.update(config.DEFAULT_PARAMS)
    params.update(config.DEFAULT_ENV_PARAMS[args.env])
    params.update(**extra_args)  # makes it possible to override any parameter
    # if args.debug:
    #     params['n_cycles'] = 2
    #     params['n_batches'] = 2
    #     params['ve_n_batches'] = 2
    #     params['size_ensemble'] = 2

    # env settings
    params['env_name'] = args.env
    params['num_cpu'] = args.num_cpu
    params['rollout_batch_size'] = args.num_env
    params['timesteps_per_cpu'] = int(args.num_timesteps)
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params['make_env'] = make_wrapped_env

    learn_fun_return = learn(
        venv=venv,
        eval_venv=eval_venv,
        plotter_venv=plotter_venv,
        params=params,
        save_path=args.log_path,
        save_interval=args.save_interval,
    )

    if rank == 0:
        save_path = os.path.expanduser(logger.get_dir())
        for k, v in learn_fun_return.items():
            v.save(os.path.join(save_path, f"final-{k}.joblib"))

    venv.close()
    eval_venv.close()
    if plotter_venv is not None:
        plotter_venv.close()
def learn(*, network, env, total_timesteps, num_cpu, allow_run_as_root, seed=None,
          eval_env=None, replay_strategy='future', save_interval=5, clip_return=True,
          demo_file=None, override_params=None, load_path=None, save_path=None, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('before mpi_fork: rank', rank, 'num_cpu', MPI.COMM_WORLD.Get_size())
    if num_cpu > 1:
        if allow_run_as_root:
            whoami = mpi_fork_run_as_root(num_cpu)
        else:
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            logger.info('parent exiting with code 0...')
            sys.exit(0)
    U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    num_cpu = MPI.COMM_WORLD.Get_size()
    logger.info('after mpi_fork: rank', rank, 'num_cpu', num_cpu)

    override_params = override_params or {}

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    params['rollout_batch_size'] = env.num_envs
    params['num_cpu'] = num_cpu
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }
    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }
    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env
    rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size
    logger.info("actual total timesteps : {}".format(
        n_epochs * n_cycles * rollout_worker.T * rollout_worker.rollout_batch_size))

    return train(save_path=save_path, policy=policy, rollout_worker=rollout_worker,
                 evaluator=evaluator, n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'], save_interval=save_interval,
                 demo_file=demo_file)
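# Worked example of the n_epochs arithmetic above (illustrative numbers, not
# from the source): with total_timesteps=5_000_000, n_cycles=50, T=50 and
# rollout_batch_size=2, n_epochs = 5_000_000 // 50 // 50 // 2 = 1000, and the
# 'actual total timesteps' logged is 1000 * 50 * 50 * 2 = 5_000_000. Because
# of the floor divisions, the actual total can be smaller than the requested
# total_timesteps.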
def train(env_id, num_timesteps, timesteps_per_batch, seed, num_cpu, resume,
          agentName, logdir, hid_size, num_hid_layers, noisy_nets, clip_param,
          entcoeff, optim_epochs, optim_batchsize, optim_stepsize, optim_schedule,
          desired_kl, gamma, lam, portnum, num_parallel):
    from baselines.ppo1 import mlp_policy, pposgd_parallel
    print("num cpu = " + str(num_cpu))
    if (num_cpu > 1) and (num_parallel > 1):
        print("num_cpu > 1 and num_parallel > 1 can't be used together at the moment!")
        exit(0)
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    utils.portnum = portnum + rank
    workerseed = seed + 10000 * rank

    if utils.server_list != "":
        servers = utils.server_list.split(",")
        num_thread = utils.num_thread_list.split(",")
        tmp = 0
        a = 0
        snum = -1
        num_total = 0
        for t in num_thread:
            num_total += int(t)
        for t in num_thread:
            if rank < tmp + int(t):
                snum = a
                break
            tmp += int(t)
            a += 1
        if num_total != num_cpu:
            print("Sum of num_thread_list must be equal to num_cpu")
            quit()
        print("Connect to tcp://" + servers[snum] + ":" + str(utils.portnum))
        utils.server_ip = servers[snum]

    set_global_seeds(workerseed)
    if num_parallel > 1:
        env = CustomParallelEnv(num_parallel)
    else:
        env = gym.make(env_id)
        env.seed(seed)
    if logger.get_dir():
        if num_parallel <= 1:
            env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))

    def policy_fn(name, ob_space, ac_space, noisy_nets=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=hid_size, num_hid_layers=num_hid_layers,
                                    noisy_nets=noisy_nets)

    gym.logger.setLevel(logging.WARN)
    pposgd_parallel.learn(env, policy_fn,
                          max_timesteps=num_timesteps,
                          timesteps_per_batch=timesteps_per_batch,
                          clip_param=clip_param, entcoeff=entcoeff,
                          optim_epochs=optim_epochs, optim_stepsize=optim_stepsize,
                          optim_batchsize=optim_batchsize, schedule=optim_schedule,
                          desired_kl=desired_kl,
                          gamma=gamma, lam=lam,
                          resume=resume, noisy_nets=noisy_nets,
                          agentName=agentName, logdir=logdir,
                          num_parallel=num_parallel, num_cpu=num_cpu)
    if num_parallel <= 1:
        env.close()
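# Worked example of the server_list / num_thread_list mapping above
# (hypothetical values): with utils.server_list = "hostA,hostB" and
# utils.num_thread_list = "2,3", ranks 0-1 connect to hostA and ranks 2-4
# connect to hostB; rank 3 yields snum = 1, i.e. "tcp://hostB:<port>", and
# num_cpu must equal 2 + 3 = 5 or the script exits.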
def learn(*, env_type, env, eval_env, plotter_env, total_timesteps, num_cpu,
          allow_run_as_root, bind_to_core, seed=None, save_interval=5,
          clip_return=True, override_params=None, load_path=None, save_path=None,
          policy_pkl=None):
    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('before mpi_fork: rank', rank, 'num_cpu', MPI.COMM_WORLD.Get_size())
    if num_cpu > 1:
        if allow_run_as_root:
            whoami = mpi_fork_run_as_root(num_cpu, bind_to_core=bind_to_core)
        else:
            whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
        if whoami == 'parent':
            logger.info('parent exiting with code 0...')
            sys.exit(0)
    U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    num_cpu = MPI.COMM_WORLD.Get_size()
    logger.info('after mpi_fork: rank', rank, 'num_cpu', num_cpu)

    override_params = override_params or {}

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    params['rollout_batch_size'] = env.num_envs
    params['num_cpu'] = num_cpu
    params['env_type'] = env_type
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_ve_params(params)

    dims = config.configure_dims(params)
    policy, value_ensemble, sample_disagreement_goals_fun, sample_uniform_goals_fun = \
        config.configure_ve_ddpg(dims=dims, params=params, clip_return=clip_return,
                                 policy_pkl=policy_pkl)

    if policy_pkl is not None:
        env.set_sample_goals_fun(sample_dummy_goals_fun)
    else:
        env.envs_op("update_goal_sampler", goal_sampler=sample_disagreement_goals_fun)
        eval_env.envs_op("update_goal_sampler", goal_sampler=sample_uniform_goals_fun)
        if plotter_env is not None:
            plotter_env.envs_op("update_goal_sampler", goal_sampler=sample_uniform_goals_fun)

    if load_path is not None:
        tf_util.load_variables(os.path.join(load_path, 'final_policy_params.joblib'))
        return play(env=env, policy=policy)

    rollout_params, eval_params, plotter_params = config.configure_rollout_worker_params(params)
    rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size
    params['n_epochs'] = n_epochs
    params['total_timesteps'] = n_epochs * n_cycles * rollout_worker.T * rollout_worker.rollout_batch_size
    config.log_params(params, logger=logger)

    if policy_pkl is not None:
        train_fun = train_ve
        evaluator = None
    else:
        train_fun = train
        # construct evaluator
        # assert eval_env.sample_goals_fun is None
        # eval_env.set_sample_goals_fun(sample_dummy_goals_fun)
        evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    if plotter_env is not None:
        raise NotImplementedError
        # from baselines.misc.html_report import HTMLReport
        # plotter_worker = RolloutWorker(plotter_env, policy, dims, logger, **plotter_params)
        # rank = MPI.COMM_WORLD.Get_rank()
        # report = HTMLReport(os.path.join(save_path, f'report-{rank}.html'), images_per_row=8)
        # report.add_header("{}".format(EXPERIMENT_TYPE))
        # report.add_text(format_dict(v))
        # plotter = config.configure_plotter(policy, value_ensemble, plotter_worker, params, report)
    else:
        plotter = None

    return train_fun(save_path=save_path, policy=policy, value_ensemble=value_ensemble,
                     rollout_worker=rollout_worker, evaluator=evaluator,
                     n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
                     n_cycles=params['n_cycles'], n_batches=params['n_batches'],
                     ve_n_batches=params['ve_n_batches'], save_interval=save_interval,
                     plotter=plotter)
print('Saving model ' + args.model)
saver = tf.train.Saver()
saver.save(session, args.model)
print('Saved model ' + args.model + ' at ' + str(time()))  # time() returns a float; cast before concatenating

def on_iteration_start(local_vars, global_vars):
    on_iteration_start.iteration += 1
    load_model(on_iteration_start.iteration)
    plot_history(local_vars['history'], on_iteration_start.iteration)
    save_model(on_iteration_start.iteration)

on_iteration_start.iteration = 0

whoami = mpi_fork(args.cores)
if whoami == 'parent':
    exit(0)

session = U.single_threaded_session()
session.__enter__()
logger.session().__enter__()

env = RunEnv(args.visualize, max_obstacles=args.obstacles, original_reward=args.original)
env.spec.timestep_limit = args.max_steps

if args.visualize:
    vis = env.osim_model.model.updVisualizer().updSimbodyVisualizer()
    vis.setBackgroundType(vis.GroundAndSky)
    vis.setShowFrameNumber(True)
    vis.zoomCameraToShowAllGeometry()
    vis.setCameraFieldOfView(1)
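# The fragment above reads flags from an `args` namespace that is built
# elsewhere in the script. A hedged sketch of the corresponding parser (flag
# names are taken from the code above; defaults and help strings are guesses):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='model.ckpt', help='checkpoint path')
parser.add_argument('--cores', type=int, default=1, help='number of MPI workers')
parser.add_argument('--visualize', action='store_true')
parser.add_argument('--obstacles', type=int, default=3)
parser.add_argument('--original', action='store_true', help='use the original reward')
parser.add_argument('--max_steps', type=int, default=1000)
args = parser.parse_args()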