def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
def main():
    logger.configure()
    env = make_atari('PongNoFrameskip-v4')
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        lr=1e-4,
        total_timesteps=int(1e7),
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
    )

    model.save('pong_model.pkl')
    env.close()
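# A minimal sketch (not part of the original script) of reloading the checkpoint saved
# above. It reuses the imports assumed by the snippet above and the refactored baselines
# deepq API, where passing total_timesteps=0 together with load_path restores the weights
# without further training; the enjoy loop is illustrative only.
def enjoy():
    env = deepq.wrap_atari_dqn(make_atari('PongNoFrameskip-v4'))
    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        total_timesteps=0,
        load_path='pong_model.pkl',
    )
    obs, done = env.reset(), False
    while not done:
        env.render()
        # the returned model is callable and maps a batch of observations to actions
        obs, rew, done, _ = env.step(model(obs[None])[0])
    env.close()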
def main():
    parser = atari_arg_parser()
    parser.add_argument('--policy', help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn')
    args = parser.parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, policy=args.policy)
def main():
    parser = atari_arg_parser()
    parser.add_argument('--policy', help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
    parser.add_argument('--lrschedule', help='Learning rate schedule',
                        choices=['constant', 'linear'], default='constant')
    args = parser.parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
          policy=args.policy, lrschedule=args.lrschedule, num_env=16)
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

    if args.play:
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs,) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            actions = model.step(obs)[0]
            obs[:] = env.step(actions)[0]
            env.render()
def main(args):
    # configure logger, disable logging in child MPI processes (with rank > 0)
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args(args)
    extra_args = parse_cmdline_kwargs(unknown_args)

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        logger.configure()
    else:
        logger.configure(format_strs=[])
        rank = MPI.COMM_WORLD.Get_rank()

    model, env = train(args, extra_args)

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        obs = env.reset()
        state = model.initial_state if hasattr(model, 'initial_state') else None
        dones = np.zeros((1,))
        episode_rew = 0
        while True:
            if state is not None:
                actions, _, state, _ = model.step(obs, S=state, M=dones)
            else:
                actions, _, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(actions)
            episode_rew += rew[0] if isinstance(env, VecEnv) else rew
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done
            if done:
                print('episode_rew={}'.format(episode_rew))
                episode_rew = 0
                obs = env.reset()

    env.close()

    return model
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
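# Illustrative launch command (an assumption, not taken from the original file): this TRPO
# trainer expects one MPI process per worker, so it is typically started through mpirun.
# The module path and flag names may differ between baselines versions.
#
#   mpirun -np 4 python -m baselines.trpo_mpi.run_mujoco --env Hopper-v2 --num-timesteps 1000000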
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )

    env.close()
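# Illustrative command line for the script above (the module path is an assumption and
# varies across baselines releases; the flags match the argparse definitions above):
#
#   python -m baselines.deepq.experiments.run_atari --env BreakoutNoFrameskip-v4 \
#       --prioritized 1 --dueling 1 --num-timesteps 10000000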
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, save_path=None): if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) logger.configure(dir=save_path) # logger save dir config. if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: print('load model from ' + load_path) model.load(load_path) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) obs, returns, masks, actions, values, neglogpacs, states, epinfos, max_rewards, mean_rewards, median_rewards = runner.run() #pylint: disable=E0632 ''' # array to img from PIL import Image i = 0 for ob in obs: for j in range(4): o = ob[:, :, j*3:j*3+3] img = Image.fromarray(o) img.save('input_img_' + str(i) + '.png') i += 1 ''' epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) logger.logkv('max_rewards', max_rewards) logger.logkv('mean_rewards', mean_rewards) logger.logkv('median_rewards', median_rewards) # 
logger.logkv('env_stage', env.envs[0].env.env.env.env.env.env.statename) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) env.close() return model
    env = gym.make(alg_kwargs['env_name'])
    env.set_episode_size(alg_kwargs['nsteps'])
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir()), allow_early_resets=True)

    return env

# Get dictionary from baselines/acktr/defaults
alg_kwargs = defaults.mara_mlp()

# Create needed folders
timedate = datetime.now().strftime('%Y-%m-%d_%Hh%Mmin')
logdir = '/tmp/ros2learn/' + alg_kwargs['env_name'] + '/acktr/' + timedate

# Generate tensorboard file
format_strs = os.getenv('MARA_LOG_FORMAT', 'stdout,log,csv,tensorboard').split(',')
logger.configure(os.path.abspath(logdir), format_strs)

with open(logger.get_dir() + "/parameters.txt", 'w') as out:
    out.write(
        'num_layers = ' + str(alg_kwargs['num_layers']) + '\n'
        + 'num_hidden = ' + str(alg_kwargs['num_hidden']) + '\n'
        + 'layer_norm = ' + str(alg_kwargs['layer_norm']) + '\n'
        + 'nsteps = ' + str(alg_kwargs['nsteps']) + '\n'
        + 'nprocs = ' + str(alg_kwargs['nprocs']) + '\n'
        + 'gamma = ' + str(alg_kwargs['gamma']) + '\n'
        + 'lam = ' + str(alg_kwargs['lam']) + '\n'
        + 'ent_coef = ' + str(alg_kwargs['ent_coef']) + '\n'
        + 'vf_coef = ' + str(alg_kwargs['vf_coef']) + '\n'
        + 'vf_fisher_coef = ' + str(alg_kwargs['vf_fisher_coef']) + '\n'
        + 'lr = ' + str(alg_kwargs['lr']) + '\n'
        + 'max_grad_norm = ' + str(alg_kwargs['max_grad_norm']) + '\n'
    gamma = 0.995
    env = RemoteVecEnv([make_env] * num_cpus)
    env = VecNormalize(env, ret=True, gamma=gamma)
    set_global_seeds(seed)
    policy = policies.MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=num_cpus - num_casks,
               lam=0.95,
               gamma=gamma,
               noptepochs=4,
               log_interval=1,
               vf_coef=0.5,
               ent_coef=0.0,
               lr=3e-4,
               cliprange=0.2,
               save_interval=2,
               load_path="./logs/course_6/00244",
               total_timesteps=num_timesteps,
               num_casks=num_casks)


if __name__ == "__main__":
    ray.init()
    configure(dir="./logs")
    train(int(1e6), 60730)
def launch( env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, override_params={}, save_policies=True ): # Fork for multi-CPU MPI implementation. if num_cpu > 1: whoami = mpi_fork(num_cpu) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: if logdir or logger.get_dir() is None: logger.configure(dir=logdir) else: logger.configure() logdir = logger.get_dir() assert logdir is not None os.makedirs(logdir, exist_ok=True) # Seed everything. rank_seed = seed + 1000000 * rank set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS params['env_name'] = env_name params['replay_strategy'] = replay_strategy if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in params.update(**override_params) # makes it possible to override any parameter with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) config.log_params(params, logger=logger) if num_cpu == 1: logger.warn() logger.warn('*** Warning ***') logger.warn( 'You are running HER with just a single MPI worker. This will work, but the ' + 'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' + 'were obtained with --num_cpu 19. This makes a significant difference and if you ' + 'are looking to reproduce those results, be aware of this. Please also refer to ' + 'https://github.com/openai/baselines/issues/314 for further details.') logger.warn('****************') logger.warn() dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: rollout_params[name] = params[name] eval_params[name] = params[name] rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) rollout_worker.seed(rank_seed) evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(rank_seed) train( logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, save_policies=save_policies)
def train(env_id, num_timesteps, seed, render):
    env = LearningEnvironment(num_particles=PARTICLES, disable_render=not render)
    env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()) as session:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf,
              gamma=0.99, lam=0.97, timesteps_per_batch=8000,
              desired_kl=0.0002, num_timesteps=num_timesteps, animate=False)

        env.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Run Mujoco benchmark.')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
    parser.add_argument('--render', help='Choose whether to render', type=bool, default=False)
    args = parser.parse_args()
    logger.configure(dir=DIRECTORY)
    train(args.env, num_timesteps=5e7, seed=args.seed, render=args.render)
def main(): expdir = os.path.join("/home/wulfebw/experiments", "ssb64_005", "run_003") os.makedirs(expdir, exist_ok=True) monitor_filepath = os.path.join(expdir, "monitor.csv") movie_dir = os.path.join(expdir, "movies") os.makedirs(movie_dir, exist_ok=True) load_filepath = None # load_filepath = "/home/wulfebw/experiments/ssb64_004/run_006/checkpoints/00100" # This configures baselines logging. configure(dir=expdir) # Creating the session here prevents tf from using all the gpu memory, which # causes a failure in the emulator. I'm not sure why because when the emulator # is running with angrylion I thought it wasn't using any gpu memory, but # there's a lot here I don't understand so oh well. # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9) gpu_options = tf.GPUOptions(allow_growth=True) sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1, gpu_options=gpu_options)) def make_env(rank, grayscale=True): retro.data.add_custom_integration("custom") env = retro.n64_env.N64Env(game="SuperSmashBros-N64", use_restricted_actions=retro.Actions.MULTI_DISCRETE, inttype=retro.data.Integrations.CUSTOM, obs_type=retro.Observations.IMAGE) env = wrap_n64(env, grayscale=grayscale) env = wrap_monitoring_n64(env, monitor_filepath=monitor_filepath, movie_dir=movie_dir) return env def make_vec_env(nenvs=4, recurrent=False, grayscale=True, frame_stack=4, frame_diff=False): venv = SubprocVecEnv([lambda: make_env(rank, grayscale=grayscale) for rank in range(nenvs)]) # Uncomment this line in place of the one above for debugging. # venv = DummyVecEnv([lambda: make_env(0)]) if not recurrent: if frame_diff: venv = VecFrameDiff(venv) else: # Perform the frame stack at the vectorized environment level as opposed to at # the individual environment level. I think this allows you to communicate fewer # images across processes. venv = VecFrameStack(venv, frame_stack) return venv network_name = "impala_cnn" recurrent = "lstm" in network_name grayscale = False frame_stack = 2 frame_diff = False venv = make_vec_env(nenvs=16, recurrent=recurrent, grayscale=grayscale, frame_stack=frame_stack, frame_diff=frame_diff) ppo2.learn(network=network_name, env=venv, total_timesteps=int(10e6), nsteps=256, nminibatches=8, lam=0.95, gamma=0.999, noptepochs=3, log_interval=1, ent_coef=.01, lr=lambda f: f * 5e-4, cliprange=0.2, save_interval=10, load_path=load_filepath)
def launch(n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, override_params={}, save_policies=True): timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") logdir = osp.join(osp.dirname(__file__), 'log/reach_her_%i_%s' % (seed, timestamp)) print("Logging to %s." % logdir) env = "SawyerPickAndPlace-v1" # Fork for multi-CPU MPI implementation. if num_cpu > 1: # try: # whoami = mpi_fork(num_cpu, ['--bind-to', 'core']) # except CalledProcessError: # fancy version of mpi call failed, try simple version whoami = mpi_fork(num_cpu) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: if logdir or logger.get_dir() is None: logger.configure(dir=logdir) else: logger.configure() logdir = logger.get_dir() assert logdir is not None os.makedirs(logdir, exist_ok=True) # Seed everything. rank_seed = seed + 1000000 * rank set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS params['env_name'] = env params['replay_strategy'] = replay_strategy if env in config.DEFAULT_ENV_PARAMS: params.update( config.DEFAULT_ENV_PARAMS[env]) # merge env-specific parameters in params.update( **override_params) # makes it possible to override any parameter with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = prepare_params(params) config.log_params(params, logger=logger) if num_cpu == 1: logger.warn() logger.warn('*** Warning ***') logger.warn( 'You are running HER with just a single MPI worker. This will work, but the ' + 'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' + 'were obtained with --num_cpu 19. This makes a significant difference and if you ' + 'are looking to reproduce those results, be aware of this. Please also refer to ' + 'https://github.com/openai/baselines/issues/314 for further details.' ) logger.warn('****************') logger.warn() dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: rollout_params[name] = params[name] eval_params[name] = params[name] rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) rollout_worker.seed(rank_seed) evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(rank_seed) train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, save_policies=save_policies)
    )  # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--num-timesteps', type=int, default=None)
    boolean_flag(parser, 'evaluation', default=False)
    # add reward_param
    parser.add_argument('--reward_param_scaling', type=float, default=0.5)
    parser.add_argument('--reward_param_thr', type=float, default=50.)
    parser.add_argument('--reward_param_type', type=str, default='const')
    boolean_flag(parser, 'my_render', default=True)
    args = parser.parse_args()
    if args.num_timesteps is not None:
        assert (args.num_timesteps ==
                args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps)
    dict_args = vars(args)
    del dict_args['num_timesteps']
    return dict_args


if __name__ == '__main__':
    args = parse_args()
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()
    run(**args)
def run_task(vv, log_dir=None, exp_name=None): override_params = {} # Fork for multi-CPU MPI implementation. if vv['num_cpu'] > 1: whoami = mpi_fork(vv['num_cpu']) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() log_dir = '/media/part/cmu_ri/deep/deep_RL/data/local/square2d-debug/square2d_debug_2018_06_17/' #hack for now, fix later # Configure logging if rank == 0: if log_dir or logger.get_dir() is None: from pathlib import Path logger.configure(dir=log_dir, exp_name=exp_name) else: if log_dir or logger.get_dir() is None: from pathlib import Path logger.configure(dir=log_dir, exp_name=exp_name) logdir = logger.get_dir() #logdir = ''# a quick hack, fix later assert logdir is not None os.makedirs(logdir, exist_ok=True) # Seed everything. rank_seed = vv['seed'] + 1000000 * rank set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS params['env_name'] = vv['env_name'] params['replay_strategy'] = vv['replay_strategy'] params['replay_sample_strategy'] = vv['replay_sample_strategy'] params['reward_type'] = vv['reward_type'] params['replay_k'] = vv['replay_k'] if vv['network'] == 'fc': params['network_class'] = 'baselines.her.actor_critic:ActorCritic' elif vv['network'] == 'cnn_fc': params[ 'network_class'] = 'baselines.her.cnn_actor_critic:CNNActorCritic' if vv['env_name'] in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[ vv['env_name']]) # merge env-specific parameters in params.update( **override_params) # makes it possible to override any parameter with open(os.path.join(logger.get_dir(), 'variant.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) config.log_params(params, logger=logger) shapes = config.configure_shapes(params) dims = shapes_to_dims(shapes) policy = config.configure_ddpg(dims=dims, shapes=shapes, params=params, clip_return=vv['clip_return']) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: rollout_params[name] = params[name] eval_params[name] = params[name] rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) rollout_worker.seed(rank_seed) evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(rank_seed) train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=vv['n_epochs'], n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=vv['policy_save_interval'], save_policies=vv['save_policies'])
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='SunblazeCartPoleRandomNormal-v0')
    parser.add_argument('--seed', type=int, help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    parser.add_argument('--processes', default=1, help='int or "max" for all')

    # EPOpt specific
    parser.add_argument('--epsilon', type=float, default=1.0)
    # EPOpt paper kept epsilon=1 until iters>100 (max 200 iters)
    parser.add_argument('--activate', type=int, default=100,
                        help='How long to fix epsilon to 1.0 before e')
    parser.add_argument('--paths', type=int, default=100,
                        help='number of trajectories to sample from each iteration')
    parser.add_argument('--algorithm', type=str, choices=['ppo2', 'a2c'], default='ppo2',
                        help='Inner batch policy optimization algorithm')
    parser.add_argument('--policy', choices=['mlp', 'lstm'], default='mlp',
                        help='Policy architecture')

    # Episode-modification specific:
    # parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--total-episodes', type=int, default=5e4)

    # RL algorithm hyperparameters
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--nsteps', type=int, default=2048)
    parser.add_argument('--ent-coef', type=float, default=1e-2, help='Only relevant for A2C')
    parser.add_argument('--nminibatches', type=int, default=32, help='Only relevant for PPO2')

    args = parser.parse_args()

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed
    # Log it for reference
    with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin':
            ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train_epopt(
        args.env,
        total_episodes=args.total_episodes,
        seed=seed,
        lr=args.lr,
        epsilon=args.epsilon,
        activate_at=args.activate,
        paths=args.paths,
        algorithm=args.algorithm,
        policy=args.policy,
        ncpu=ncpu,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
        ent_coef=args.ent_coef,  # default 0.01 in baselines, 0.0001 in chainer A3C
    )
def main(): import neptune parser = argparse.ArgumentParser(argument_default=None) parser.add_argument('--config', action='append', help='Gin config files.') parser.add_argument('--debug', action='store_true', default=False) cmd_args, unknown = parser.parse_known_args() debug = cmd_args.debug spec_path = cmd_args.config[0] if not debug: try: with open(spec_path, 'rb') as f: import cloudpickle specification = cloudpickle.load(f) except pickle.UnpicklingError: with open(spec_path) as f: vars_ = {'script': os.path.basename(spec_path)} exec(f.read(), vars_) # pylint: disable=exec-used specification = vars_['experiments_list'][0].to_dict() print( 'NOTE: Only the first experiment from the list will be run!' ) parameters = specification['parameters'] else: print("debug run") parameters = dict(env_id="toy_mr", env_size=None) class MockArgs(object): def add(self, key, value): setattr(self, key, value) args = MockArgs() args.add('env', parameters["env_id"]) # 'chain_env' 'toy_mr' args.add('env_size', parameters["env_size"]) args.add('seed', 0) args.add('max_episode_steps', 300) args.add('num_timesteps', int(1e12)) args.add('num_env', 32) args.add('use_news', 0) args.add('gamma', 0.99) args.add('gamma_ext', 0.999) args.add('lam', 0.95) args.add('update_ob_stats_every_step', 0) args.add('update_ob_stats_independently_per_gpu', 0) args.add('update_ob_stats_from_random_agent', 1) args.add('proportion_of_exp_used_for_predictor_update', 1.) args.add('tag', '') args.add( 'policy', 'cnn', ) args.add('int_coeff', 1.) args.add('ext_coeff', 2.) args.add('dynamics_bonus', 0) if not debug: # TODO read more from specification print("running with neptune") neptune.init( project_qualified_name="pmtest/planning-with-learned-models") neptune.create_experiment( name=specification['name'], tags=specification['tags'], params=specification['parameters'], upload_stdout=False, upload_stderr=False, ) neptune.send_metric("test", 777) baselines_format_strs = ['log', 'csv'] else: print("running without neptune") baselines_format_strs = ['stdout', 'log', 'csv'] logger.configure(dir="out", format_strs=baselines_format_strs) seed = 10000 * args.seed # + MPI.COMM_WORLD.Get_rank() set_global_seeds(seed) hps = dict(frame_stack=4, nminibatches=4, nepochs=4, lr=0.0001, max_grad_norm=0.0, env_size=args.env_size, use_news=args.use_news, gamma=args.gamma, gamma_ext=args.gamma_ext, max_episode_steps=args.max_episode_steps, lam=args.lam, update_ob_stats_every_step=args.update_ob_stats_every_step, update_ob_stats_independently_per_gpu=args. update_ob_stats_independently_per_gpu, update_ob_stats_from_random_agent=args. update_ob_stats_from_random_agent, proportion_of_exp_used_for_predictor_update=args. proportion_of_exp_used_for_predictor_update, policy=args.policy, int_coeff=args.int_coeff, ext_coeff=args.ext_coeff, dynamics_bonus=args.dynamics_bonus) tf_util.make_session(make_default=True) train(env_id=args.env, num_env=args.num_env, seed=seed, num_timesteps=args.num_timesteps, hps=hps, use_neptune=(not debug))
parser.add_argument('algo_type', type=str)
parser.add_argument('start', type=int)
args = parser.parse_args()

rs = 100 * (args.start + 1)
np.random.seed(rs)

with open('experiment_params.yaml', 'r') as stream:
    params = yaml.load(stream)

env_type = 'mujoco'
env_id = params[args.mjenv]
params['algo_type'] = args.algo_type
params['rseed'] = rs

log_path = os.path.join('results_NPG', env_id, args.algo_type, 'rs_' + str(rs))
logger.configure(dir=os.path.join(os.getcwd(), log_path))

# dump the params once in the folder
with open(os.path.join(log_path, 'params.yaml'), 'w') as outfile:
    yaml.dump(params, outfile, default_flow_style=False)

env = make_vec_env(env_id, env_type, 1, rs)

if args.algo_type == 'HOOF_All':
    learnt_model = learn_hoof_all(
        env,
        env_type,
        timesteps_per_batch=params['batch_size'],
        total_timesteps=params['total_ts'],
        kl_range=params['kl_bound'],
        gamma_range=params['discount_bound'],
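# A hypothetical experiment_params.yaml consistent with the keys read above. The actual
# environment keys and values are not shown in this excerpt, so everything below is a
# placeholder sketch only:
#
#   halfcheetah: HalfCheetah-v2
#   batch_size: 2048
#   total_ts: 1000000
#   kl_bound: [0.0001, 0.01]
#   discount_bound: [0.95, 0.999]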
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1))
U.initialize()

# Get dictionary from baselines/acktr/defaults
defaults = defaults.mara_mlp()

# Create needed folders
try:
    logdir = defaults['trained_path'].split('checkpoints')[0] + 'results' \
             + defaults['trained_path'].split('checkpoints')[1]
except:
    logdir = '/tmp/ros2learn/' + defaults['env_name'] + '/acktr_results/'
finally:
    logger.configure(os.path.abspath(logdir))
    csvdir = logdir + "csv/"

csv_files = [csvdir + "det_obs.csv", csvdir + "det_acs.csv", csvdir + "det_rew.csv"]
if not os.path.exists(csvdir):
    os.makedirs(csvdir)
else:
    for f in csv_files:
        if os.path.isfile(f):
            os.remove(f)

def make_env():
    env = gym.make(defaults['env_name'])
    parser.add_argument('--actor-lr', type=float, default=0.01)
    parser.add_argument('--critic-lr', type=float, default=0.005)
    boolean_flag(parser, 'popart', default=False)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--reward-scale', type=float, default=1.)
    parser.add_argument('--clip-norm', type=float, default=None)
    parser.add_argument('--nb-epochs', type=int, default=1)  # with default settings, perform 1M steps total
    parser.add_argument('--nb-epoch-cycles', type=int, default=1000)
    parser.add_argument('--nb-train-steps', type=int, default=1)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-eval-steps', type=int, default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-rollout-steps', type=int, default=1)  # per epoch cycle and MPI worker
    parser.add_argument('--noise-type', type=str, default='ou_0.2')  # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--num-timesteps', type=int, default=None)
    boolean_flag(parser, 'evaluation', default=False)
    args = parser.parse_args()
    # we don't directly specify timesteps for this script, so make sure that if we do specify them
    # they agree with the other parameters
    if args.num_timesteps is not None:
        assert (args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps)
    dict_args = vars(args)
    del dict_args['num_timesteps']
    return dict_args


if __name__ == '__main__':
    args = parse_args()
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure(dir=os.path.join(get_default_data_directory("dppg_baselines_main_editted")))
    # Run actual script.
    run(**args)
def main(args): set_global_seeds(args.seed) env = gym.make(args.env_id) if args.env_id == "MsPacman-v0": from gym import wrappers env = wrappers.Monitor( env, "/tmp", force=True, video_callable=False, ) from baselines.common.wrappers import wrap_deepmind env = wrap_deepmind(env) if args.encode_1d_obs: from vae.AE import Autoencoder from baselines.common.vae_encoding_wrapper import VAEEncodingWrapper ae = Autoencoder((84, 84, 4), [(64, 8, 2), (128, 6, 3), (128, 4, 2), (128, 3, 1)], [1000, 500], 100) ae.load_model("../../vae/runs/5e-4decay/model") U.update_initialized_parameters() env = VAEEncodingWrapper(env, ae) env.seed(args.seed) # env = bench.Monitor(env, logger.get_dir() and # osp.join(logger.get_dir(), "monitor.json")) gym.logger.setLevel(logging.WARN) if args.log_dir != Log_dir: log_dir = osp.join(Log_dir, args.log_dir) save_dir = osp.join(Checkpoint_dir, args.log_dir) else: log_dir = Log_dir save_dir = Checkpoint_dir args, rnd_iter, dyn_norm = modify_args(args) logger.log(f"rnd_cnn_type: {args.rnd_cnn_type}") logger.log(f"policy_cnn_type: {args.policy_cnn_type}") logger.log(f"rnd_critic_scale: {args.rnd_critic_scale}") logger.log(f"policy_hidden_size: {args.policy_hidden_size}") def policy_fn(name, ob_space, ac_space): # return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, # hid_size=args.policy_hidden_size, num_hid_layers=2, popart=args.popart, gaussian_fixed_var=args.fixed_var) return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=[150, 50], num_hid_layers=2, popart=args.popart, gaussian_fixed_var=args.fixed_var, activation="relu") def policy_fn_cnn(name, ob_space, ac_space): return cnn_policy.CNNPolicy(name=name, policy_cnn_type=args.policy_cnn_type, ob_space=ob_space, ac_space=ac_space, hid_size=args.policy_hidden_size, num_hid_layers=2, popart=args.popart, gaussian_fixed_var=args.fixed_var) if args.task == 'train': if args.env_id == "MsPacman-v0": if args.encode_1d_obs: exp_data = get_exp_data_atari(DATASET_PATH, ae) else: exp_data = get_exp_data_atari(DATASET_PATH) else: exp_data = get_exp_data( osp.join(osp.dirname(osp.realpath(__file__)), "../../data/%s.pkl" % args.env_id)) task_name = get_task_name(args) task_name += "_rndcnn" + str(args.rnd_cnn_type) + "_mlpcnn" + str( args.policy_cnn_type) logger.configure(dir=log_dir, log_suffix=task_name, format_strs=["log", "stdout"]) if args.reward == 0: if args.env_id == "Humanoid-v2": critic = make_critic(env, exp_data, reward_type=args.reward, scale=2500) elif args.env_id == "Reacher-v2": critic = make_critic(env, exp_data, rnd_hid_size=20, hid_size=20, reward_type=args.reward, scale=2500) elif args.env_id == "HalfCheetah-v2": critic = make_critic(env, exp_data, rnd_hid_size=20, hid_size=20, reward_type=args.reward, scale=25000) elif args.env_id == "Ant-v2": critic = make_critic(env, exp_data, reward_type=args.reward) elif args.env_id == "MsPacman-v0": critic = make_critic(env, exp_data, hid_size=128, reward_type=args.reward, scale=args.rnd_critic_scale, CNN_critic=args.use_cnn, rnd_cnn_type=args.rnd_cnn_type) else: critic = make_critic(env, exp_data, reward_type=args.reward) else: if args.env_id == "Reacher-v2": critic = make_critic(env, exp_data, hid_size=100, reward_type=args.reward, scale=1000) if args.env_id == "Walker2d-v2": critic = make_critic(env, exp_data, hid_size=30, reward_type=args.reward, scale=100) if args.env_id == "HalfCheetah-v2": critic = make_critic(env, exp_data, hid_size=30, reward_type=args.reward, scale=1000) if args.env_id == "Hopper-v2": critic = 
make_critic(env, exp_data, hid_size=30, reward_type=args.reward, scale=1000) if args.env_id == "Ant-v2": critic = make_critic(env, exp_data, hid_size=128, reward_type=args.reward, scale=100) if args.env_id == "MsPacman-v0": critic = make_critic(env, exp_data, hid_size=128, reward_type=args.reward, scale=args.rnd_critic_scale, rnd_cnn_type=args.rnd_cnn_type) if args.use_cnn: policy = policy_fn_cnn else: policy = policy_fn train(env, args.seed, policy, critic, exp_data, args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps, save_dir, args.pretrained, args.BC_max_iter, args.gamma, rnd_iter, dyn_norm, task_name, args.use_cnn) elif args.task == 'evaluate': runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024, number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample) else: raise NotImplementedError env.close()
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
    clip_return, override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
def main():
    args = pybullet_arg_parser().parse_args()
    logger.configure(format_strs=['stdout', 'log', 'csv'],
                     log_suffix="PPO_NAC_Advantage-" + args.env)
    logger.log("Algorithm: PPO_NAC_Advantage-" + args.env)
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
def main(): import argparse parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='DartHumanWalker-v1') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument( '--init_policy', help='Initial Policy', default= 'data/ppo_DartHumanWalker-v1241_energy15_vel5_5s_pdscale1_mirror4_up03fwd03ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_2s_dcon1_asinput_damping2kneethigh_thigh160knee100_curriculum_1xjoint_shoulder100_dqpen0_2kassist/policy_params.pkl' ) parser.add_argument('--init_curriculum', help='Initial Curriculum', nargs='+', default=[2000.0, 2000]) parser.add_argument( '--ref_policy', help='Reference Policy', default= 'data/ppo_DartHumanWalker-v1241_energy15_vel5_5s_pdscale1_mirror4_up03fwd03ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_2s_dcon1_asinput_damping2kneethigh_thigh160knee100_curriculum_1xjoint_shoulder100_dqpen0_2kassist/policy_params.pkl' ) parser.add_argument('--ref_curriculum', help='Reference Curriculum', nargs='+', default=[2000.0, 2000]) parser.add_argument('--anc_thres', help='Anchor Threshold', type=float, default=0.75) parser.add_argument('--prog_thres', help='Progress Threshold', type=float, default=0.6) parser.add_argument('--batch_size', help='Batch Size', type=int, default=2500) parser.add_argument('--max_iter', help='Maximum Iteration', type=int, default=2000) parser.add_argument('--use_reftraj', help='Use reference trajectory', type=int, default=0) args = parser.parse_args() logger.reset() logger.configure( 'data/ppo_curriculum_150eachit_vel15_tvel1scale_up03fwd03ltl15_spinepen1_thighyawpen001_mirror4_runningavg1p5_2s_stride15_e1_' + args.env + '_' + str(args.seed) + '_' + str(args.anc_thres) + '_' + str(args.prog_thres) + '_' + str(args.batch_size)) sess = U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) env = gym.make(args.env) ob_space = env.observation_space ac_space = env.action_space def policy_fn(name, ob_space, ac_space): return mlp_mirror_policy.MlpMirrorPolicy( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=3, gmm_comp=1, mirror_loss=True, observation_permutation=np.array([ 0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8, 9, 10, -17, 18, -19, -24, 25, -26, 27, -20, 21, -22, 23, 28, 29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36, 37, 38, 39, -46, 47, -48, -53, 54, -55, 56, -49, 50, -51, 52, 58, 57, 59 ]), action_permutation=np.array([ -6, 7, -8, 9, 10, 11, -0.001, 1, -2, 3, 4, 5, -12, 13, -14, -19, 20, -21, 22, -15, 16, -17, 18 ])) policy = policy_fn('policy', ob_space, ac_space) init_curriculum = np.array(args.init_curriculum) ref_policy = policy_fn('ref_policy', ob_space, ac_space) ref_curriculum = np.array(args.ref_curriculum) policy_params = joblib.load(args.init_policy) ref_policy_params = joblib.load(args.ref_policy) U.initialize() cur_scope = policy.get_variables()[0].name[0:policy.get_variables()[0]. name.find('/')] orig_scope = list( policy_params.keys())[0][0:list(policy_params.keys())[0].find('/')] ref_scope = list(ref_policy_params.keys())[0][0:list(ref_policy_params. 
keys())[0].find('/')] for i in range(len(policy.get_variables())): assign_op = policy.get_variables()[i].assign( policy_params[policy.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) sess.run(assign_op) assign_op = ref_policy.get_variables()[i].assign( ref_policy_params[ref_policy.get_variables()[i].name.replace( 'ref_' + cur_scope, ref_scope, 1)]) sess.run(assign_op) anchor_threshold = args.anc_thres progress_threshold = args.prog_thres env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True) env.seed(args.seed + MPI.COMM_WORLD.Get_rank()) gym.logger.setLevel(logging.WARN) curriculum_evolution = [] env.env.env.anchor_kp = ref_curriculum ref_score = None ref_max_score = None reference_trajectory = None #if MPI.COMM_WORLD.Get_rank() == 0: if args.use_reftraj == 1: reference_trajecotry = gen_reftraj(env, ref_policy, 299) env.env.reference_trajectory = reference_trajectory ref_score, ref_max_score = evaluate_policy(env, ref_policy, 24) ref_score = MPI.COMM_WORLD.bcast(ref_score, root=0) ref_max_score = MPI.COMM_WORLD.bcast(ref_max_score, root=0) reference_score = ref_score * progress_threshold reference_anchor_score = ref_score * anchor_threshold reference_max_score = ref_max_score * 0.9 env.env.env.anchor_kp = init_curriculum reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0) env.env.reference_trajectory = reference_trajectory current_curriculum = np.copy(init_curriculum) print('reference scores: ', reference_score, reference_anchor_score, reference_max_score) previous_params = policy_params for iter in range(args.max_iter): print('curriculum iter ', iter) print('ref score: ', reference_anchor_score) opt_pi, final_rew = pposgd_mirror.learn( env, policy_fn, max_timesteps=args.batch_size * MPI.COMM_WORLD.Get_size() * 150, timesteps_per_batch=int(args.batch_size), clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, sym_loss_weight=4.0, return_threshold=reference_anchor_score, init_policy_params=previous_params, policy_scope='pi' + str(iter), min_iters=0, reward_drop_bound=True, #max_threshold = reference_max_score, ) print('one learning iteration done') if np.linalg.norm(current_curriculum) >= 0.0001: # re-compute reference trajectory if MPI.COMM_WORLD.Get_rank() == 0 and args.use_reftraj == 1: print('recompute reference traj') reference_trajecotry = gen_reftraj(env, opt_pi, 299) reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0) env.env.reference_trajectory = reference_trajectory if final_rew < reference_anchor_score * 0.95: print('update reference scores') reference_score = reference_score / reference_anchor_score * final_rew reference_anchor_score = final_rew closest_candidate = None #if MPI.COMM_WORLD.Get_rank() == 0: directions = [ np.array([-1, 0]), np.array([0, -1]), -current_curriculum / np.linalg.norm(current_curriculum) ] int_d1 = directions[0] + directions[2] int_d2 = directions[1] + directions[2] directions.append(int_d1 / np.linalg.norm(int_d1)) directions.append(int_d2 / np.linalg.norm(int_d2)) #directions = [np.array([0.0, -1.0])] # only search in one direction candidate_next_anchors = [] for direction in directions: found_point, perf = binary_search_curriculum( env, opt_pi, current_curriculum, direction, reference_score, reference_max_score, 6) print(direction, found_point, perf) candidate_next_anchors.append(found_point) if closest_candidate is None: closest_candidate = 
np.copy(found_point) elif np.linalg.norm(closest_candidate) > np.linalg.norm( found_point): closest_candidate = np.copy(found_point) if np.linalg.norm(closest_candidate) < 0.5: closest_candidate = np.array([0, 0]) if np.abs(closest_candidate[0]) < 0.1: closest_candidate[0] = 0.0 if np.abs(closest_candidate[1]) < 0.1: closest_candidate[1] = 0.0 #closest_candidate = MPI.COMM_WORLD.bcast(closest_candidate, root=0) current_curriculum = np.copy(closest_candidate) env.env.env.anchor_kp = current_curriculum '''print('Update Init Pose Distributions') update_init_poses(env, opt_pi) if MPI.COMM_WORLD.Get_rank() == 0: joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir()+'/init_poses_'+np.array2string(current_curriculum, separator=',')+'.pkl', compress=True) joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir() + '/init_poses.pkl', compress=True)''' curriculum_evolution.append(current_curriculum) print('Current curriculum: ', current_curriculum) opt_variable = opt_pi.get_variables() previous_params = {} for i in range(len(opt_variable)): cur_val = opt_variable[i].eval() previous_params[opt_variable[i].name] = cur_val if np.linalg.norm(current_curriculum) < 0.0001: if reference_anchor_score < ref_score: reference_anchor_score = ref_score else: break env.close()
def main(): parser = arg_parser() add_env_params(parser) parser.add_argument('--num-timesteps', type=int, default=int(1e7)) parser.add_argument('--num_env', type=int, default=16) parser.add_argument('--use_news', type=int, default=0) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--gamma_ext', type=float, default=0.99) parser.add_argument('--lam', type=float, default=0.95) parser.add_argument('--update_ob_stats_every_step', type=int, default=0) parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0) parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1) parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.) parser.add_argument('--tag', type=str, default='') parser.add_argument('--policy', type=str, default='rnn', choices=['cnn', 'rnn']) parser.add_argument('--int_coeff', type=float, default=1.) parser.add_argument('--ext_coeff', type=float, default=0) parser.add_argument('--dynamics_bonus', type=int, default=0) parser.add_argument('--clear-run', action='store_true', default=False, help='if clear the save folder') parser.add_argument('--mega-wrapper', type=int, default=0, help='if use the same wrapper as mega') args = parser.parse_args() args.save_dir = '../rnd_results/' args.save_dir = os.path.join(args.save_dir, 'e_n-{}/'.format(args.env)) args.save_dir = os.path.join( args.save_dir, 'mega_wrapper-{}'.format(str(args.mega_wrapper))) args.save_dir = os.path.join(args.save_dir, 'num_env-{}'.format(str(args.num_env))) args.save_dir = os.path.join(args.save_dir, 'int_coeff-{}'.format(str(args.int_coeff))) if args.clear_run: '''if clear_run, clear the path before create the path''' input('You have set clear_run, is that what you want?') subprocess.call(["rm", "-r", args.save_dir]) try: os.makedirs(args.save_dir) except Exception as e: print('file exists') try: os.makedirs('../rnd_log_results/' + args.env + '/') except Exception as e: print('log file exists') args.summary_writer = tf.summary.FileWriter(args.save_dir) logger.configure(dir='../rnd_log_results/' + args.env + '/', format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else []) if MPI.COMM_WORLD.Get_rank() == 0: with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f: f.write(args.tag) # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code')) mpi_util.setup_mpi_gpus() seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank() set_global_seeds(seed) hps = dict(frame_stack=4, nminibatches=4, nepochs=4, lr=0.0001, max_grad_norm=0.0, use_news=args.use_news, gamma=args.gamma, gamma_ext=args.gamma_ext, max_episode_steps=args.max_episode_steps, lam=args.lam, update_ob_stats_every_step=args.update_ob_stats_every_step, update_ob_stats_independently_per_gpu=args. update_ob_stats_independently_per_gpu, update_ob_stats_from_random_agent=args. update_ob_stats_from_random_agent, proportion_of_exp_used_for_predictor_update=args. proportion_of_exp_used_for_predictor_update, policy=args.policy, int_coeff=args.int_coeff, ext_coeff=args.ext_coeff, dynamics_bonus=args.dynamics_bonus) tf_util.make_session(make_default=True) train(env_id=args.env, num_env=args.num_env, seed=seed, num_timesteps=args.num_timesteps, hps=hps, args=args)
def make_env():
    env = gym.make(env_name)
    env.set_episode_size(alg_kwargs['nsteps'])
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir()), allow_early_resets=True)

    return env

# Get dictionary from baselines/acktr/defaults
alg_kwargs = defaults.mara_mlp()
env_name = alg_kwargs['env_name']
alg_kwargs['total_timesteps'] = alg_kwargs['nsteps']

# Generate tensorboard file
format_strs = os.getenv('MARA_LOG_FORMAT', 'stdout,log,csv,tensorboard').split(',')
logger.configure(os.path.abspath('/tmp/acktr'), format_strs)

env = DummyVecEnv([make_env])

# Remove unused parameters for training
alg_kwargs.pop('env_name')
alg_kwargs.pop('trained_path')
alg_kwargs.pop('transfer_path')

network = mlp(num_layers=alg_kwargs['num_layers'], num_hidden=alg_kwargs['num_hidden'],
              layer_norm=alg_kwargs['layer_norm'])

with tf.Session(config=config) as train_sess:
    _ = acktr.learn(env=env, network=network, **alg_kwargs)

tf.reset_default_graph()
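# A small sketch (not from the original script) for inspecting the run afterwards. With
# 'csv' among the MARA_LOG_FORMAT entries, the baselines logger writes a progress.csv in
# the configured directory (/tmp/acktr here); pandas is assumed to be available and the
# column names depend on what acktr.learn logs.
import pandas as pd

progress = pd.read_csv('/tmp/acktr/progress.csv')
print(progress.tail())  # last few logged rows, e.g. to check final reward statistics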
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    parameters = algorithm_parameters()
    train(args.env, parameters=parameters, seed=args.seed)
def main():
    args = gym_ctrl_arg_parser().parse_args()
    logger.configure(format_strs=['stdout', 'log', 'csv'], log_suffix="ACKTR-" + args.env)
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
def main(args=None): # configure logger, disable logging in child MPI processes (with rank > 0) if args is None: from thesis_galljamov18.python.training.guro_train import LOAD_MODEL arg_parser = common_arg_parser() args, unknown_args = arg_parser.parse_known_args() extra_args = { k: parse(v) for k, v in parse_unknown_args(unknown_args).items() } """All args: {'nsteps': 2048, 'nminibatches': 32, 'lam': 0.95, 'gamma': 0.99, 'noptepochs': 10, 'log_interval': 1, 'ent_coef': 0.0, 'lr': <function mujoco.<locals>.<lambda> at 0x7f8f5af49f28>, 'cliprange': 0.2, 'value_network': 'copy'}""" # train my environment instead default one args.env = "Guro-v0" args.num_timesteps = 0 if LOAD_MODEL else 10e6 + 1e5 args.play = LOAD_MODEL args.alg = 'ppo2' args.network = 'mlp' # change further arguments # nsteps = 2048 # nminibatches = 32 # gamma = 0.95 # lr = 0.001 # cliprange = 0.2 # extra_args.update({'nsteps': nsteps, 'nminibatches': nminibatches, 'gamma': gamma, 'cliprange': cliprange}) # extra_args.update({'lr': 1e-10}) else: extra_args = {} if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: rank = 0 logger.configure() else: logger.configure(format_strs=[]) rank = MPI.COMM_WORLD.Get_rank() model, _ = train(args, extra_args) if args.save_path is not None and rank == 0: save_path = osp.expanduser(args.save_path) model.save(save_path) if args.play: logger.log("\n------------\nRunning trained model\n------------\n") def say(text): os.system( 'spd-say "{}" --volume -1 --voice-type male2'.format(text)) # say("Attention please! Running trained model in 10 seconds!") # import time # time.sleep(10) env = build_env(args) obs = env.reset() #env.ob_rms.mean = [0,0,0,0,0,0] #[0., 0.39362465587763634, 0., -0.11370739423088674, 0.01929697539211253, 0.5066570016460371] # [ 0, 0.46073392, 0, 0.20411958, -0.05412459, 0.49079091] # print("\n----------\nOBSERV_MEANS of loaded model: " + str(env.ob_rms.mean) + "\n----------\n") # exit(33) while True: actions = model.step(obs)[0] obs, _, done, _ = env.step(actions) env.render() done = done.any() if isinstance(done, np.ndarray) else done if done: obs = env.reset()
def main(args): arg_parser = common_arg_parser() args, unknown_args = arg_parser.parse_known_args(args) import os os.environ["CUDA_VISIBLE_DEVICES"] = "0" extra_args = parse_cmdline_kwargs(unknown_args) if 'gpu-id' in extra_args: os.environ["CUDA_VISIBLE_DEVICES"] = str(extra_args['gpu-id']) extra_args.pop('gpu-id') if 'num_trials' in extra_args: num_trials = extra_args.pop('num_trials') else: num_trials = 1000 if 'mle' in extra_args: if extra_args['mle']: args.use_mle = True extra_args.pop('mle') else: args.use_mle = False print("mle", args.use_mle) if 'residual_weight' not in extra_args and (args.alg == 'bppo2_expert' or args.alg == 'bppo2'): print("residual_weight not in extra_args, set it to 0.1") extra_args['residual_weight'] = 0.1 if 'residual_weight' in extra_args: print("Residual weight", extra_args["residual_weight"]) if 'render' in extra_args: render = True del extra_args['render'] else: render = False if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: rank = 0 logger.configure() else: logger.configure(format_strs=[]) rank = MPI.COMM_WORLD.Get_rank() model, env = train(args, extra_args) env.close() if args.save_path is not None and rank == 0: save_path = osp.expanduser(args.save_path) model.save(save_path) if args.play: logger.log("Running trained model") env = build_env(args) obs = env.reset() def initialize_placeholders(nlstm=128, **kwargs): return np.zeros((args.num_env or 1, 2 * nlstm)), np.zeros((1)) state, dones = initialize_placeholders(**extra_args) # GL: Get mean, std from baselines.common.math_util import discount all_rewards = [] if 'tiger' in args.env: from brl_gym.envs.tiger import ACTION_NAME, OBS_NAME for _ in range(10): obs = env.reset() rewards = [] for t in range(100): tiger_loc = env.envs[0].env.env.env.tiger tiger = "LEFT" if tiger_loc == 0 else "RIGHT" actions, _, state, _ = model.step(obs[0], S=state, M=dones) obs, r, done, _ = env.step(actions[0]) obs_name = OBS_NAME[np.argmax(obs[0, :3])] print("Reward: {}\tAction: {}\tObs: {}\tHidden: {}".format( r, ACTION_NAME[actions[0]], obs_name, tiger)) done = done.any() if isinstance(done, np.ndarray) else done rewards += [r] if done: print("=========== RESET ========== ") all_rewards += [discount(np.array(rewards).ravel(), 0.95)[0]] elif 'rocksample' in args.env: if 'gamma' not in extra_args: extra_args['gamma'] = 1.0 if 'fixed' in args.alg: from brl_gym.qmdps.rocksample_qmdp import RockSampleQMDPQFunction as QMDPQFunction q_func = QMDPQFunction(num_rocks=8, num_envs=args.num_env) else: qval = None for _ in range(num_trials): obs = env.reset() obs, bel = obs[:, :148], obs[:, 148:] qval = q_func(obs, bel) done = False rewards = [] while not done: action = model.step(obs, belief=bel, S=state, M=dones, expert_qval=qval, update_eps=0)[0][0] obs, r, done, _ = env.step(action) obs, bel = obs[:, :148], obs[:, 148:] qval = q_func(obs, bel) # env.render() # print(action, r) done = done.any() if isinstance(done, np.ndarray) else done rewards += [r] all_rewards += [ discount(np.array(rewards).ravel(), extra_args['gamma'])[0] ] elif args.alg == 'bddpg_fe': if 'gamma' not in extra_args: extra_args['gamma'] = 1.0 for _ in range(num_trials): obs = env.reset() done = False rewards = [] t = 0 #from brl_gym.wrapper_envs.wrapper_pusher import get_qmdp_expert obs_dim = 22 from brl_gym.wrapper_envs.wrapper_pusher import qmdp_expert, simple_combined_expert while not done: # action = model.step(obs,S=state, M=dones)[0][0] # print(action[0], r[0], done[0], q[0]) obs = obs.reshape(1, -1) qval = qmdp_expert(obs[:, :obs_dim], obs[:, 
obs_dim:]) action = model.step(obs, qval, apply_noise=False)[0][0] action = 0.1 * action + simple_combined_expert( obs[:, :obs_dim], obs[:, obs_dim:]) obs, r, done, _ = env.step(action) env.render() done = done.any() if isinstance(done, np.ndarray) else done rewards += [r] t += 1 #if t >=800: # break print("T: ", t) all_rewards += [ discount(np.array(rewards).ravel(), extra_args['gamma'])[0] ] print(all_rewards) else: if 'gamma' not in extra_args: extra_args['gamma'] = 0.99 if 'Maze' in args.env: from brl_gym.wrapper_envs.wrapper_maze import Expert maze_type = 10 if 'Maze10' in args.env else 4 expert = Expert(nenv=1, maze_type=maze_type) else: from brl_gym.experts.util import get_expert expert = get_expert( args.env, use_mle=args.use_mle, num_env=args.num_env, ) undiscounted_sum = [] #import cProfile with open(args.output, 'w') as f: #if 'Maze' in args.env : # f.write('target\treward\tnum-sensing\tlength\n') #else: # f.write('reward\tnum-sensing\tnum-collision\tlength\n') actual_env = env.envs[0].env.env.env #env = env.envs[0].env.env no_collision = 0 lengths = [] for k in range(num_trials): #profile = cProfile.Profile() #profile.enable() print('-------------------------') # env.envs[0].env.env.env.env.target = 3 # env.envs[0].env.env.env.reset_params=False obs = env.reset() #_env = env.envs[0] #while hasattr(_env, "env"): # _env = _env.env #if 'Maze' in args.env: # target = _env.target done = False rewards = [] residual_actions = [] obses = [] info = [] t = 0 expert_actions = [] agent_pos = [] observations = [] w = extra_args['residual_weight'] print("Weight", w) while not done and t < 500: #print("obs :", np.around(obs, 1)) if args.alg == 'bppo2_expert': expert_action = expert.action(obs, info) obs = np.concatenate([obs, expert_action], axis=1) expert_action = expert_action.ravel() observations += [obs.copy()] action = model.step(obs)[0][0].numpy() residual_actions += [action] if args.alg == 'bppo2_expert': agent_pos += [obs.ravel()[:2]] expert_actions += [expert_action.copy()] #print("action", action, "expert", expert_action,) if 'cartpole' in args.env.lower(): expert_action = expert_action + action * w else: expert_action = ( 1.0 - w) * expert_action + action * w action = expert_action action = np.clip(action, env.action_space.low, env.action_space.high) #print("final action", action) obs, r, done, info = env.step(action) #print('reward:', r) #print('done :', done) if render: os.makedirs('imgs/trial{}'.format(k), exist_ok=True) actual_env._visualize( filename="imgs/trial{}/crosswalk_{}.png". format(k, t)) t += 1 done = done.any() if isinstance(done, np.ndarray) else done rewards += [r] obses += [obs] # actions += [action] #profile.disable() #profile.print_stats() #import IPython; IPython.embed(); import sys ;sys.exit(0) lengths += [t] rewards = np.array(rewards).ravel() if rewards[-1] > 0: no_collision += 1 print(np.sum(rewards), no_collision) #residual_actions = np.array(residual_actions).squeeze() #observations = np.array(observations).squeeze() #data = {"r":rewards, "action":residual_actions, "obs":observations} #os.makedirs('trials', exist_ok=True) #data_file = open("trials/trial_{}.pkl".format(k), 'wb+') #pickle.dump(data, data_file) #print("Wrote to trial_{}.pkl".format(k)) all_rewards += [np.sum(rewards)] env.close() mean = np.mean(all_rewards) ste = np.std(all_rewards) / np.sqrt(len(all_rewards)) print(all_rewards) print("Reward stat: ", mean, "+/-", ste) print("No collision", no_collision / num_trials) print("Length", np.mean(lengths)) return model
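# The evaluation loops above summarize each episode with discount(rewards, gamma)[0],
# i.e. the discounted return from the first step, and (for bppo2_expert) blend the
# learned residual action with an expert action. A minimal NumPy sketch of both
# quantities, written directly rather than via baselines.common.math_util; the
# function names here are illustrative only:
import numpy as np

def discounted_return(rewards, gamma):
    # sum_t gamma**t * r_t, matching discount(rewards, gamma)[0]
    rewards = np.asarray(rewards, dtype=np.float64).ravel()
    return float(np.sum(rewards * gamma ** np.arange(len(rewards))))

def blend_actions(expert_action, residual_action, w):
    # convex blend used above: (1 - w) * expert + w * residual
    return (1.0 - w) * expert_action + w * residual_action

# e.g. discounted_return([1, 1, 1], 0.95) == 1 + 0.95 + 0.95**2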
def learn(env, eval_env, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, reward_coeff, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, task_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, num_epochs=1000, eval_interval=10, callback=None): # Configure log logger.configure(dir=log_dir) nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd") ] vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- # seg_gen = traj_segment_generator(pi, env, reward_giver, 
timesteps_per_batch, stochastic=True) seg_gen = traj_segment_generator(pi, env, reward_giver, reward_coeff, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 g_loss_stats = stats(loss_names) d_loss_stats = stats(reward_giver.loss_name) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) for epoch in range(num_epochs): logger.log("********** Epoch %i ************" % epoch) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) # evaluate current policy if (epoch + 1) % eval_interval == 0: total_samples = (epoch + 1) * timesteps_per_batch * g_step evaluate_policy(pi, reward_giver, eval_env, total_samples, tstart) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, reward_giver.loss_name)) batch_size = len(ob) // d_step d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) # update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0)))
def main():
    args = atari_arg_parser().parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    #parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--dueling', type=int, default=0)
    #parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default='/.')
    args = parser.parse_args()

    # TODO change logging dir for tensorboard
    #logger.configure(dir=None, format_strs='stdout,log,csv,json,tensorboard')
    #logger.configure(dir=None, format_strs=['stdout', 'log', 'csv', 'json', 'tensorboard'])
    timestart = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M:%S')
    logger.configure(
        dir=PROJ_DIR + "/../tensorboard/" + str(timestart),
        format_strs=['stdout', 'log', 'csv', 'json', 'tensorboard'])
    logger.set_level(logger.INFO)
    set_global_seeds(args.seed)

    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    # wrap environment
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    # record videos of an episode
    env = VideoRecorderWrapper(env, PROJ_DIR + "/../video", str(timestart), 50)
    # the agent has only one trial
    env = EpisodicLifeEnv(env)  # nes_py
    # preprocess the input frame
    env = DownsampleEnv(env, (84, 84))
    # set death penalty
    env = PenalizeDeathEnv(env, penalty=-25)
    # stack 4 frames as input
    env = FrameStackEnv(env, 4)

    # print tensorboard log information
    print("logger.get_dir():", logger.get_dir())
    print("PROJ_DIR:", PROJ_DIR)

    act = None
    # enable output in the terminal
    env = bench.Monitor(env, logger.get_dir())
    modelname = datetime.datetime.now().isoformat()

    # define callback function for the training process
    def render_callback(lcl, _glb):
        # print(lcl['episode_rewards'])
        total_steps = lcl['env'].total_steps
        # if total_steps % 2000 == 0:
        env.render()
        # pass

    # different models with different parameters, commented out:
    # CNN built with deepq.models.cnn_to_mlp(params), trained with deepq.learn(params)
    # 2018-08-12-10:25:50 model 4, 100k, lr 0.0005, alpha 0.6, gamma 0.99, 8 frames v1
    # 2018-08-12-11:31:59 model 4, 100k, lr 0.0005, alpha 0.8, gamma 0.99, 6 frames v1

    # model 04
    # Nature human-level control paper + improvements:
    # Dueling Double DQN, Prioritized Experience Replay, and fixed Q-targets
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # (num_outputs, kernel_size, stride)
        hiddens=[512],  # 512
        dueling=bool(1),
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=0.0001,  # 0.00025 1e-4
        max_timesteps=int(100000),  # 100k -> 3h
        buffer_size=50000,  # 5000, #10000
        exploration_fraction=0.3,  # 0.1,
        exploration_final_eps=0.1,  # 0.01
        train_freq=4,  # 4
        learning_starts=25000,  # 10000
        target_network_update_freq=1000,
        gamma=0.5,  # 0.99,
        prioritized_replay=bool(1),
        prioritized_replay_alpha=0.2,
        checkpoint_freq=args.checkpoint_freq,
        # checkpoint_path=args.checkpoint_path,
        callback=render_callback,
        print_freq=1)

    print("Saving model to mario_model.pkl " + timestart)
    act.save("../models/mario_model_{}.pkl".format(timestart))
    env.close()
    # parser.add_argument('--noise_type', type=str, default='adaptive-param_0.2')  # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--noise_type', type=str, default='ou_0.2')
    boolean_flag(parser, 'evaluation', default=False)
    args = parser.parse_args()

    sess = U.single_threaded_session()
    sess.__enter__()

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(str(args.environment))
    logger.configure("/tmp/experiments/" + str(args.environment) + "/DDPG/")
    env = bench.Monitor(env, logger.get_dir())

    if args.evaluation and rank == 0:
        eval_env = gym.make(str(args.environment))  # was gym.make(env_id); env_id is undefined in this script
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # gym.logger.setLevel(logging.WARN)
    # if evaluation and rank==0:
    #     eval_env = gym.make(env_id)
    #     eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
    #     env = bench.Monitor(env, None)

    # Parse noise_type
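# The "# Parse noise_type" step is cut off above. In baselines-style DDPG scripts this
# usually expands strings such as 'ou_0.2', 'normal_0.1' or 'adaptive-param_0.2' into
# noise objects. A hedged sketch under that assumption; the class names are taken from
# baselines.ddpg.noise and the helper name is illustrative (adjust if this project uses
# its own noise module):
import numpy as np
from baselines.ddpg.noise import (AdaptiveParamNoiseSpec, NormalActionNoise,
                                  OrnsteinUhlenbeckActionNoise)

def parse_noise_type(noise_type, nb_actions):
    action_noise, param_noise = None, None
    for current in noise_type.split(','):
        current = current.strip()
        if current == 'none':
            continue
        name, _, stddev = current.partition('_')
        if name == 'adaptive-param':
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif name == 'normal':
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif name == 'ou':
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current))
    return action_noise, param_noise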
import gym
from baselines import logger
import numpy as np

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHexapod-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()

    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) + '_walk')

    env = gym.make(args.env)
    env.env.assist_timeout = 100.0
    env.env.target_vel = 2.0
    env.env.init_tv = 0.0
    env.env.final_tv = 2.0
    env.env.tv_endtime = 1.0
    env.env.energy_weight = 0.2
    env.env.alive_bonus = 4.0

    train_mirror_sig(env, num_timesteps=int(5000000), seed=args.seed,
                     obs_perm=np.array([
                         0.0001, -1, 2, -3, -4, 8, 9, 10, 5, 6, 7, 14, 15, 16,
                         11, 12, 13, 20, 21, 22, 17, 18, 19, 23, 24, -25, 26,
def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
def main(): """Run DQN until the environment throws an exception.""" # Hyperparameters num_envs = 64 learning_rate = 2.5e-4 gamma = 0.99 nstep_return = 3 timesteps_per_proc = 25_000_000 train_interval = 64 target_interval = 8192 batch_size = 512 min_buffer_size = 20000 # Parse arguments parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='coinrun') parser.add_argument( '--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=0) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--run_id', type=int, default=1) parser.add_argument('--gpus_id', type=str, default='') parser.add_argument('--level_setup', type=str, default='procgen', choices=["procgen", "oracle"]) parser.add_argument('--mix_mode', type=str, default='nomix', choices=['nomix', 'mixreg']) parser.add_argument('--mix_alpha', type=float, default=0.2) parser.add_argument('--mix_beta', type=float, default=0.2) parser.add_argument('--use_l2reg', action='store_true') parser.add_argument('--data_aug', type=str, default='no_aug', choices=['no_aug', 'cutout_color', 'crop']) args = parser.parse_args() # Setup test worker comm = MPI.COMM_WORLD rank = comm.Get_rank() test_worker_interval = args.test_worker_interval is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 # Setup env specs if args.level_setup == "procgen": env_name = args.env_name num_levels = 0 if is_test_worker else args.num_levels start_level = args.start_level elif args.level_setup == "oracle": env_name = args.env_name num_levels = 0 start_level = args.start_level # Setup logger log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure( dir=LOG_DIR + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}', format_strs=format_strs) # Create env logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) # Setup Tensorflow logger.info("creating tf session") if args.gpus_id: gpus_id = [x.strip() for x in args.gpus_id.split(',')] os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)] setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() # Setup Rainbow models logger.info("building models") online_net, target_net = rainbow_models( sess, venv.action_space.n, gym_space_vectorizer(venv.observation_space), min_val=REWARD_RANGE_FOR_C51[env_name][0], max_val=REWARD_RANGE_FOR_C51[env_name][1]) dqn = MpiDQN(online_net, target_net, discount=gamma, comm=comm, mpi_rank_weight=mpi_rank_weight, mix_mode=args.mix_mode, mix_alpha=args.mix_alpha, mix_beta=args.mix_beta, use_l2reg=args.use_l2reg, data_aug=args.data_aug) player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return) optimize = dqn.optimize(learning_rate=learning_rate) # Initialize and sync variables sess.run(tf.global_variables_initializer()) global_variables = 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if comm.Get_size() > 1: sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E110 # Training logger.info("training") dqn.train(num_steps=timesteps_per_proc, player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=train_interval, target_interval=target_interval, batch_size=batch_size, min_buffer_size=min_buffer_size)
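# The NStepPlayer above folds nstep_return (here 3) consecutive rewards into each stored
# transition. A minimal sketch of the scalar n-step return that underlies the
# distributional Rainbow target; this is an illustration of the idea, not anyrl's actual
# implementation:
import numpy as np

def n_step_return(rewards, bootstrap_value, gamma, n):
    # sum_{k=0}^{n-1} gamma**k * r_{t+k} + gamma**n * bootstrap_value
    rewards = np.asarray(rewards[:n], dtype=np.float64)
    return float(np.sum(rewards * gamma ** np.arange(len(rewards)))
                 + gamma ** len(rewards) * bootstrap_value)

# e.g. with gamma=0.99, n=3: r0 + 0.99*r1 + 0.99**2*r2 + 0.99**3 * max_a Q_target(s3, a)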
def train(): rank = MPI.COMM_WORLD.Get_rank() sess = utils.make_gpu_session(args.num_gpu) sess.__enter__() if rank == 0: logger.configure() else: logger.configure(format_strs=[]) workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) if args.use_2D_env: config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'configs', 'husky_space7_ppo2_2D.yaml') else: config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'configs', 'husky_space7_ppo2.yaml') if args.use_2D_env: raw_env = Husky2DNavigateEnv(gpu_idx=args.gpu_idx, config=config_file, pos_interval=args.pos_interval) else: raw_env = Husky1DNavigateEnv(gpu_idx=args.gpu_idx, config=config_file, ob_space_range=[0.0, 40.0]) env = Monitor( raw_env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) gym.logger.setLevel(logging.WARN) base_dirname = os.path.join(currentdir, "simulation_and_analysis_dqn", "rslts") if not os.path.exists(base_dirname): os.makedirs(base_dirname) dir_name = "husky_dqn_" if args.use_feedback: dir_name += "hr" elif args.use_rich_reward: dir_name += "rl_rich" else: dir_name += "rl_sparse" dir_name = addDateTime(dir_name) dir_name = os.path.join(base_dirname, dir_name) if not os.path.exists(dir_name): os.mkdir(dir_name) hyperparams = { "seed": args.seed, # env "use_2D_env": args.use_2D_env, "use_rich_reward": args.use_rich_reward, "use_multiple_starts": args.use_multiple_starts, "total_timesteps": args.total_timesteps, "pos_interval": args.pos_interval, # hr "use_feedback": args.use_feedback, "use_real_feedback": args.use_real_feedback, "trans_by_interpolate": args.trans_by_interpolate, "only_use_hr_until": args.only_use_hr_until, "trans_to_rl_in": args.trans_to_rl_in, "good_feedback_acc": args.good_feedback_acc, "bad_feedback_acc": args.bad_feedback_acc, # dqn "exploration_fraction": args.exploration_fraction, "exploration_final_eps": args.exploration_final_eps, "lr": args.lr, "batch_size": args.batch_size, "dqn_epochs": args.dqn_epochs, "train_freq": args.train_freq, "target_network_update_freq": args.target_network_update_freq, "learning_starts": args.learning_starts, "param_noise": args.param_noise, "gamma": args.gamma, # hr training "feedback_lr": args.feedback_lr, "feedback_epochs": args.feedback_epochs, "feedback_batch_size": args.feedback_batch_size, "feedback_minibatch_size": args.feedback_minibatch_size, "min_feedback_buffer_size": args.min_feedback_buffer_size, "feedback_training_prop": args.feedback_training_prop, "feedback_training_new_prop": args.feedback_training_new_prop, # dqn replay buffer "buffer_size": args.buffer_size, "prioritized_replay": args.prioritized_replay, "prioritized_replay_alpha": args.prioritized_replay_alpha, "prioritized_replay_beta0": args.prioritized_replay_beta0, "prioritized_replay_beta_iters": args.prioritized_replay_beta_iters, "prioritized_replay_eps": args.prioritized_replay_eps, # "checkpoint_freq": args.checkpoint_freq, "use_embedding": raw_env._use_embedding, "use_raycast": raw_env._use_raycast, "offline": raw_env.config['offline'] } print_freq = 5 param_fname = os.path.join(dir_name, "param.json") with open(param_fname, "w") as f: json.dump(hyperparams, f, indent=4, sort_keys=True) video_name = os.path.join(dir_name, "video.mp4") p_logging = p.startStateLogging(p.STATE_LOGGING_VIDEO_MP4, video_name) act, performance = learn( # env flags env, raw_env, use_2D_env=args.use_2D_env, use_multiple_starts=args.use_multiple_starts, use_rich_reward=args.use_rich_reward, 
total_timesteps=args.total_timesteps, # dqn exploration_fraction=args.exploration_fraction, exploration_final_eps=args.exploration_final_eps, # hr use_feedback=args.use_feedback, use_real_feedback=args.use_real_feedback, only_use_hr_until=args.only_use_hr_until, trans_to_rl_in=args.trans_to_rl_in, good_feedback_acc=args.good_feedback_acc, bad_feedback_acc=args.bad_feedback_acc, # dqn training lr=args.lr, batch_size=args.batch_size, dqn_epochs=args.dqn_epochs, train_freq=args.train_freq, target_network_update_freq=args.target_network_update_freq, learning_starts=args.learning_starts, param_noise=args.param_noise, gamma=args.gamma, # hr training feedback_lr=args.feedback_lr, feedback_epochs=args.feedback_epochs, feedback_batch_size=args.feedback_batch_size, feedback_minibatch_size=args.feedback_minibatch_size, min_feedback_buffer_size=args.min_feedback_buffer_size, feedback_training_prop=args.feedback_training_prop, feedback_training_new_prop=args.feedback_training_new_prop, # replay buffer buffer_size=args.buffer_size, prioritized_replay=args.prioritized_replay, prioritized_replay_alpha=args.prioritized_replay_alpha, prioritized_replay_beta0=args.prioritized_replay_beta0, prioritized_replay_beta_iters=args.prioritized_replay_beta_iters, prioritized_replay_eps=args.prioritized_replay_eps, # rslts saving and others checkpoint_freq=args.checkpoint_freq, print_freq=print_freq, checkpoint_path=None, load_path=None, callback=None, seed=args.seed) p.stopStateLogging(p_logging) performance_fname = os.path.join(dir_name, "performance.p") with open(performance_fname, "wb") as f: pickle.dump(performance, f) act.save(os.path.join(dir_name, "cartpole_model.pkl"))
def train_fn(env_name, num_envs, distribution_mode, num_levels, start_level, timesteps_per_proc, args, is_test_worker=False, log_dir='./tmp/procgen', comm=None, alternate_ppo=False, do_eval=False, eval_num_envs=None, eval_env_name=None, eval_num_levels=None, eval_start_level=None, eval_distribution_mode=None, do_test=False, test_num_envs=None, test_env_name=None, test_num_levels=None, test_start_level=None, test_distribution_mode=None): learning_rate = 5e-4 ent_coef = .01 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 use_vf_clipping = True mpi_rank_weight = 0 if is_test_worker else 1 num_levels = 0 if is_test_worker else num_levels if log_dir is not None: log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure(comm=log_comm, dir=log_dir, format_strs=format_strs) logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) eval_env = None if do_eval: eval_env = ProcgenEnv(num_envs=eval_num_envs, env_name=eval_env_name, num_levels=eval_num_levels, start_level=eval_start_level, distribution_mode=eval_distribution_mode) eval_env = VecExtractDictObs(eval_env, "rgb") eval_env = VecMonitor( venv=eval_env, filename=None, keep_buf=100, ) eval_env = VecNormalize(venv=eval_env, ob=False) test_env = None if do_test: test_env = ProcgenEnv(num_envs=test_num_envs, env_name=test_env_name, num_levels=test_num_levels, start_level=test_start_level, distribution_mode=test_distribution_mode) test_env = VecExtractDictObs(test_env, "rgb") test_env = VecMonitor( venv=test_env, filename=None, keep_buf=100, ) test_env = VecNormalize(venv=test_env, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256) logger.info("training") if alternate_ppo: alt_ppo2.learn(env=venv, eval_env=eval_env, test_env=test_env, network=conv_fn, total_timesteps=timesteps_per_proc, save_interval=1, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, lr=learning_rate, cliprange=clip_range, update_fn=None, init_fn=None, vf_coef=0.5, max_grad_norm=0.5, args=args, load_path=args.resume_path) else: ppo2.learn(env=venv, eval_env=eval_env, network=conv_fn, total_timesteps=timesteps_per_proc, save_interval=1, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, lr=learning_rate, cliprange=clip_range, update_fn=None, init_fn=None, vf_coef=0.5, max_grad_norm=0.5, args=args)
def main(): # Hyperparameters num_envs = 128 learning_rate = 5e-4 ent_coef = .01 vf_coef = 0.5 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 max_grad_norm = 0.5 timesteps_per_proc = 100_000_000 use_vf_clipping = True # Parse arguments parser = argparse.ArgumentParser( description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='coinrun') parser.add_argument( '--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=0) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--run_id', type=int, default=1) parser.add_argument('--gpus_id', type=str, default='') parser.add_argument('--use_bn', action='store_true') parser.add_argument('--use_l2reg', action='store_true') parser.add_argument('--l2reg_coeff', type=float, default=1e-4) parser.add_argument('--data_aug', type=str, default='no_aug', choices=["no_aug", "cutout_color", "crop"]) parser.add_argument('--use_rand_conv', action='store_true') parser.add_argument('--model_width', type=str, default='1x', choices=["1x", "2x", "4x"]) parser.add_argument('--level_setup', type=str, default='procgen', choices=["procgen", "oracle"]) parser.add_argument('--mix_mode', type=str, default='nomix', choices=['nomix', 'mixreg', 'mixobs']) parser.add_argument('--mix_alpha', type=float, default=0.2) parser.add_argument('--timesteps_per_proc', type=float, default=1_000_000) parser.add_argument('--save_dir', type=str, default='gdrive/MyDrive/182 Project/mixreg') args = parser.parse_args() log_dir = args.save_dir # Setup test worker comm = MPI.COMM_WORLD rank = comm.Get_rank() test_worker_interval = args.test_worker_interval is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == ( test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 # Setup env specs if args.level_setup == "procgen": env_name = args.env_name num_levels = 0 if is_test_worker else args.num_levels start_level = args.start_level elif args.level_setup == "oracle": env_name = args.env_name num_levels = 0 start_level = args.start_level # Setup logger log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure( dir=log_dir + f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}', format_strs=format_strs) # Create env logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) venv = VecNormalize(venv=venv, ob=False) eval_env = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=500, start_level=0, distribution_mode=args.distribution_mode) eval_env = VecExtractDictObs(eval_env, "rgb") eval_env = VecMonitor( venv=eval_env, filename=None, keep_buf=100, ) eval_env = VecNormalize(venv=eval_env, ob=False, ret=True) # Setup Tensorflow logger.info("creating tf session") if args.gpus_id: gpus_id = [x.strip() for x in args.gpus_id.split(',')] os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank] setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() # Setup 
model if args.model_width == '1x': depths = [16, 32, 32] elif args.model_width == '2x': depths = [32, 64, 64] elif args.model_width == '4x': depths = [64, 128, 128] conv_fn = lambda x: build_impala_cnn(x, depths=depths, use_bn=args.use_bn, randcnn=args.use_rand_conv and not is_test_worker) # Training logger.info("training") ppo2.learn = learn # use customized "learn" function model = ppo2.learn( env=venv, network=conv_fn, total_timesteps=args.timesteps_per_proc, eval_env=eval_env, save_interval=0, nsteps=nsteps, nminibatches=nminibatches, lam=lam, gamma=gamma, noptepochs=ppo_epochs, log_interval=1, ent_coef=ent_coef, mpi_rank_weight=mpi_rank_weight, clip_vf=use_vf_clipping, comm=comm, lr=learning_rate, cliprange=clip_range, update_fn=None, init_fn=None, vf_coef=vf_coef, max_grad_norm=max_grad_norm, data_aug=args.data_aug, use_rand_conv=args.use_rand_conv, model_fn=get_mixreg_model(mix_mode=args.mix_mode, mix_alpha=args.mix_alpha, use_l2reg=args.use_l2reg, l2reg_coeff=args.l2reg_coeff), ) # Saving logger.info("saving final model") if rank == 0: checkdir = os.path.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) model.save(os.path.join(checkdir, 'final_model.ckpt'))
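# get_mixreg_model / mix_mode='mixreg' above refer to the mixreg data augmentation:
# pairs of observations and their supervision signals are blended with a Beta-distributed
# coefficient. A minimal sketch of that mixing step under this assumption; it illustrates
# the idea only and is not the project's actual model_fn:
import numpy as np

def mixreg_batch(obs, returns, mix_alpha, rng=np.random):
    # lam ~ Beta(alpha, alpha); each sample i is blended with a random partner j
    obs = np.asarray(obs, dtype=np.float32)
    returns = np.asarray(returns, dtype=np.float32)
    n = obs.shape[0]
    lam = rng.beta(mix_alpha, mix_alpha, size=n).astype(np.float32)
    j = rng.permutation(n)
    lam_obs = lam.reshape((n,) + (1,) * (obs.ndim - 1))
    mixed_obs = lam_obs * obs + (1.0 - lam_obs) * obs[j]
    mixed_ret = lam * returns + (1.0 - lam) * returns[j]
    return mixed_obs, mixed_ret

# 'mixobs' would blend only the observations; 'nomix' leaves the batch unchanged.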
def main():
    # configure logger, disable logging in child MPI processes (with rank > 0)
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    if args.env == 'Humanoid-v1' or args.env == 'Humanoid(rllab)' or args.env == 'HumanoidStandup-v1':
        args.num_timesteps = 1e7
    if args.env == 'Ant-v1':
        args.num_timesteps = 5e6
    extra_args = parse_cmdline_kwargs(unknown_args)
    print("args")
    print(args)

    if args.num_repeat == 1:
        dir = args.log_dir + '/iter%d' % args.seed
        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            rank = 0
            logger.configure(dir=dir)
        else:
            logger.configure(dir=dir, format_strs=[])
            rank = MPI.COMM_WORLD.Get_rank()
        model, env, sess, evalenv = train(args, extra_args, args.seed)
        env.close()
        evalenv.close()
        tf.reset_default_graph()
        sess.close()
    else:
        for seed in range(args.num_repeat):
            dir = args.log_dir + '/iter%d' % seed
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                rank = 0
                logger.configure(dir=dir)
            else:
                logger.configure(dir=dir, format_strs=[])
                rank = MPI.COMM_WORLD.Get_rank()
            model, env, sess, evalenv = train(args, extra_args, seed)
            env.close()
            evalenv.close()
            tf.reset_default_graph()
            sess.close()

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        env = build_env(args)
        obs = env.reset()

        def initialize_placeholders(nlstm=128, **kwargs):
            return np.zeros((1, 2 * nlstm)), np.zeros((1))

        state, dones = initialize_placeholders(**extra_args)
        while True:
            actions, _, state, _ = model.step(obs, S=state, M=dones)
            obs, _, done, _ = env.step(actions)
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done
            if done:
                obs = env.reset()
        env.close()
def configure_logger(log_path, **kwargs):
    if log_path is not None:
        logger.configure(log_path)
    else:
        logger.configure(**kwargs)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    boolean_flag(parser, 'popart', default=False)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--reward-scale', type=float, default=1.)
    parser.add_argument('--clip-norm', type=float, default=None)
    parser.add_argument('--nb-epochs', type=int, default=500)  # with default settings, perform 1M steps total
    parser.add_argument('--nb-epoch-cycles', type=int, default=20)
    parser.add_argument('--nb-train-steps', type=int, default=50)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-eval-steps', type=int, default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-rollout-steps', type=int, default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2')  # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--num-timesteps', type=int, default=None)
    boolean_flag(parser, 'evaluation', default=False)
    args = parser.parse_args()
    # we don't directly specify timesteps for this script, so make sure that if we do specify them
    # they agree with the other parameters
    if args.num_timesteps is not None:
        assert args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps
    dict_args = vars(args)
    del dict_args['num_timesteps']
    return dict_args


if __name__ == '__main__':
    args = parse_args()
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.configure()
    # Run actual script.
    run(**args)