def main(args): U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) env = gym.make(args.env_id) def policy_fn(name, ob_space, ac_space, reuse=False): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json")) env.seed(args.seed) gym.logger.setLevel(logging.WARN) task_name = get_task_name(args) args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) args.log_dir = osp.join(args.log_dir, task_name) dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) savedir_fname = learn(env, policy_fn, dataset, max_iters=args.BC_max_iter, ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir, task_name=task_name, verbose=True) avg_len, avg_ret = runner(env, policy_fn, savedir_fname, timesteps_per_batch=1024, number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample, reuse=True)
def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0, trust_region=True, alpha=0.99, delta=1): print("Running Acer Simple") print(locals()) tf.reset_default_graph() set_global_seeds(seed) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space num_procs = len(env.remotes) # HACK model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, c=c, trust_region=trust_region, alpha=alpha, delta=delta) runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack) if replay_ratio > 0: buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size) else: buffer = None nbatch = nenvs*nsteps acer = Acer(runner, model, buffer, log_interval) acer.tstart = time.time() for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls acer.call(on_policy=True) if replay_ratio > 0 and buffer.has_atleast(replay_start): n = np.random.poisson(replay_ratio) for _ in range(n): acer.call(on_policy=False) # no simulation steps in this env.close()
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=3e-4, cliprange=0.2,
               total_timesteps=num_timesteps)
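# Hedged usage sketch for the PPO2 MuJoCo train() above. The env id, step budget, and the
# logger.configure() call are illustrative assumptions, not values taken from this file.
if __name__ == '__main__':
    logger.configure()
    train('Hopper-v2', num_timesteps=int(1e6), seed=0)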
def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100): tf.reset_default_graph() set_global_seeds(seed) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space num_procs = len(env.remotes) # HACK model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma) nbatch = nenvs*nsteps tstart = time.time() for update in range(1, total_timesteps//nbatch+1): obs, states, rewards, masks, actions, values = runner.run() policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) nseconds = time.time()-tstart fps = int((update*nbatch)/nseconds) if update % log_interval == 0 or update == 1: ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update*nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.dump_tabular() env.close()
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
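# Hedged usage sketch for make_vec_env() above; the env id, env_type value, and worker count
# are illustrative assumptions.
venv = make_vec_env('PongNoFrameskip-v4', env_type='atari', num_env=4, seed=0)
obs = venv.reset()
venv.close()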
def train(env_id, num_timesteps=300, seed=0, num_env=2, renderer='tiny'):
    def make_env(rank):
        def _thunk():
            if env_id == "TestEnv":
                env = TestEnv(renderer=renderer)  # gym.make(env_id)
            else:
                env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_env)])
    env.reset()

    start = time.time()
    for i in range(num_timesteps):
        action = [env.action_space.sample() for _ in range(num_env)]
        env.step(action)
    stop = time.time()

    duration = stop - start
    if duration:
        fps = num_timesteps / duration
    else:
        fps = 0
    env.close()
    return num_env, fps
def train(env, seed, policy_fn, reward_giver, dataset, algo, g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter, checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None): pretrained_weight = None if pretrained and (BC_max_iter > 0): # Pretrain with behavior cloning from baselines.gail import behavior_clone pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, max_iters=BC_max_iter) if algo == 'trpo': from baselines.gail import trpo_mpi # Set up for MPI seed rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env.seed(workerseed) trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank, pretrained=pretrained, pretrained_weight=pretrained_weight, g_step=g_step, d_step=d_step, entcoeff=policy_entcoeff, max_timesteps=num_timesteps, ckpt_dir=checkpoint_dir, log_dir=log_dir, save_per_iter=save_per_iter, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, gamma=0.995, lam=0.97, vf_iters=5, vf_stepsize=1e-3, task_name=task_name) else: raise NotImplementedError
def train(env_id, num_frames, seed): from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy from baselines.trpo_mpi import trpo_mpi import baselines.common.tf_util as U rank = MPI.COMM_WORLD.Get_rank() sess = U.single_threaded_session() sess.__enter__() if rank != 0: logger.set_level(logger.DISABLED) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = gym.make(env_id) def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json"%rank)) env.seed(workerseed) gym.logger.setLevel(logging.WARN) env = wrap_train(env) num_timesteps = int(num_frames / 4 * 1.1) env.seed(workerseed) trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) env.close()
def train(env_id, num_timesteps, seed): from baselines.ppo1 import pposgd_simple, cnn_policy import baselines.common.tf_util as U rank = MPI.COMM_WORLD.Get_rank() sess = U.single_threaded_session() sess.__enter__() if rank == 0: logger.configure() else: logger.configure(format_strs=[]) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None set_global_seeds(workerseed) env = make_atari(env_id) def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) env.seed(workerseed) env = wrap_deepmind(env) env.seed(workerseed) pposgd_simple.learn(env, policy_fn, max_timesteps=int(num_timesteps * 1.1), timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear' ) env.close()
def train(num_timesteps, seed): num_cpus = 1 num_casks = 1 num_cpus += num_casks config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_cpus, inter_op_parallelism_threads=num_cpus) tf.Session(config=config).__enter__() gamma = 0.995 env = RemoteVecEnv([make_env] * num_cpus) env = VecNormalize(env, ret=True, gamma=gamma) set_global_seeds(seed) policy = policies.MlpPolicy ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=num_cpus-num_casks, lam=0.95, gamma=gamma, noptepochs=4, log_interval=1, vf_coef=0.5, ent_coef=0.0, lr=3e-4, cliprange=0.2, save_interval=2, load_path="./logs/course_6/00244", total_timesteps=num_timesteps, num_casks=num_casks)
def train(env_id, num_frames, seed): from baselines.ppo1 import pposgd_simple, cnn_policy import baselines.common.tf_util as U rank = MPI.COMM_WORLD.Get_rank() sess = U.single_threaded_session() sess.__enter__() if rank != 0: logger.set_level(logger.DISABLED) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = gym.make(env_id) def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json" % rank)) env.seed(workerseed) gym.logger.setLevel(logging.WARN) env = wrap_train(env) num_timesteps = int(num_frames / 4 * 1.1) env.seed(workerseed) pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_batch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear' ) env.close()
def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir())
    env.seed(seed)
    return env
def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
    env.seed(seed)
    return env
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo-based robotics tasks.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
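# Hedged usage sketch for make_robotics_env() above; the Fetch env id is an assumption and
# requires the gym robotics extras to be installed.
env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
obs = env.reset()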
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    print('Evaluating {}'.format(args.env))
    bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
                          args.stochastic_policy, False, 'BC')
    print('Evaluation for {}'.format(args.env))
    print(bc_log)
    gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
                            args.stochastic_policy, True, 'gail')
    print('Evaluation for {}'.format(args.env))
    print(gail_log)
    plot(args.env, bc_log, gail_log, args.stochastic_policy)
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
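# Hedged usage sketch for make_atari_env() above; the env id and number of workers are assumptions.
venv = make_atari_env('BreakoutNoFrameskip-v4', num_env=8, seed=0)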
def make_mujoco_env(env_id, seed, reward_scale=1.0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    myseed = seed + 1000 * rank if seed is not None else None
    set_global_seeds(myseed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(logger.get_dir(), str(rank))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)
    if reward_scale != 1.0:
        from baselines.common.retro_wrappers import RewardScaler
        env = RewardScaler(env, reward_scale)
    return env
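# Hedged usage sketch for the reward-scaled variant above; the env id and scale are assumptions.
env = make_mujoco_env('Walker2d-v2', seed=0, reward_scale=0.1)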
def main(args): U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) env = gym.make(args.env_id) def policy_fn(name, ob_space, ac_space, reuse=False): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json")) env.seed(args.seed) gym.logger.setLevel(logging.WARN) task_name = get_task_name(args) args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) args.log_dir = osp.join(args.log_dir, task_name) if args.task == 'train': dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff) train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir, args.log_dir, args.pretrained, args.BC_max_iter, task_name ) elif args.task == 'evaluate': runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024, number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample ) else: raise NotImplementedError env.close()
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, save_interval=None, lrschedule='linear'): tf.reset_default_graph() set_global_seeds(seed) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps =nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef= vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() runner = Runner(env, model, nsteps=nsteps, gamma=gamma) nbatch = nenvs*nsteps tstart = time.time() coord = tf.train.Coordinator() enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True) for update in range(1, total_timesteps//nbatch+1): obs, states, rewards, masks, actions, values = runner.run() policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) model.old_obs = obs nseconds = time.time()-tstart fps = int((update*nbatch)/nseconds) if update % log_interval == 0 or update == 1: ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update*nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.dump_tabular() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update) print('Saving to', savepath) model.save(savepath) coord.request_stop() coord.join(enqueue_threads) env.close()
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--prioritized', type=int, default=1) parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6) parser.add_argument('--dueling', type=int, default=1) parser.add_argument('--num-timesteps', type=int, default=int(10e6)) parser.add_argument('--checkpoint-freq', type=int, default=10000) parser.add_argument('--checkpoint-path', type=str, default=None) args = parser.parse_args() logger.configure() set_global_seeds(args.seed) env = make_atari(args.env) env = bench.Monitor(env, logger.get_dir()) env = deepq.wrap_atari_dqn(env) model = deepq.models.cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=bool(args.dueling), ) deepq.learn( env, q_func=model, lr=1e-4, max_timesteps=args.num_timesteps, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=bool(args.prioritized), prioritized_replay_alpha=args.prioritized_replay_alpha, checkpoint_freq=args.checkpoint_freq, checkpoint_path=args.checkpoint_path, ) env.close()
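# Hedged usage sketch: this entry point is driven by the command-line flags defined by the
# parser above; the script name below is an assumption.
#   python run_atari_dqn.py --env BreakoutNoFrameskip-v4 --prioritized 1 --dueling 1
if __name__ == '__main__':
    main()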
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
def main(policy_file, seed, n_test_rollouts, render): set_global_seeds(seed) # Load policy. with open(policy_file, 'rb') as f: policy = pickle.load(f) env_name = policy.info['env_name'] # Prepare params. params = config.DEFAULT_PARAMS if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in params['env_name'] = env_name params = config.prepare_params(params) config.log_params(params, logger=logger) dims = config.configure_dims(params) eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'compute_Q': True, 'rollout_batch_size': 1, 'render': bool(render), } for name in ['T', 'gamma', 'noise_eps', 'random_eps']: eval_params[name] = params[name] evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(seed) # Run evaluation. evaluator.clear_history() for _ in range(n_test_rollouts): evaluator.generate_rollouts() # record logs for key, val in evaluator.logs('test'): logger.record_tabular(key, np.mean(val)) logger.dump_tabular()
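# Hedged usage sketch for the HER evaluation entry point above; the policy pickle filename
# and rollout count are assumptions.
main('policy_best.pkl', seed=0, n_test_rollouts=20, render=1)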
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    # Divide by 4 due to frameskip, then add a little extra so episodes can end.
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed):
    env = gym.make(env_id)
    if logger.get_dir():
        env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()) as session:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)
        learn(env, policy=policy, vf=vf,
              gamma=0.99, lam=0.97, timesteps_per_batch=2500,
              desired_kl=0.002, num_timesteps=num_timesteps, animate=False)
        env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    trpo_mpi.learn(env, policy_fn,
                   timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                   max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
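# Hedged usage sketch: the TRPO trainer above is MPI-aware, so a typical launch (the command
# below is an assumption) would be `mpirun -np 4 python <script>.py`; a direct call looks like:
train('Hopper-v2', num_timesteps=int(1e6), seed=0)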
def test(args): import filelock with filelock.FileLock('/tmp/robotstify.lock'): import gym import sys try: import goexplore_py.complex_fetch_env except Exception: print('Could not import complex_fetch_env, is goexplore_py in PYTHONPATH?') import tensorflow as tf import horovod.tensorflow as hvd hvd.init() print('initialized worker %d' % hvd.rank(), flush=True) from baselines.common import set_global_seeds set_global_seeds(hvd.rank()) from baselines import bench from baselines.common import set_global_seeds from atari_reset.wrappers import VecFrameStack, VideoWriter, my_wrapper,\ EpsGreedyEnv, StickyActionEnv, NoopResetEnv, SubprocVecEnv, PreventSlugEnv, FetchSaveEnv, TanhWrap from atari_reset.ppo import learn from atari_reset.policies import CnnPolicy, GRUPolicy, FFPolicy set_global_seeds(hvd.rank()) ncpu = 2 config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=ncpu, inter_op_parallelism_threads=ncpu) config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) tf.Session(config=config).__enter__() max_noops = 30 if args.noops else 0 print('SAVE PATH', args.save_path) def make_env(rank): def env_fn(): if args.game == 'fetch': assert args.fetch_target_location is not None, 'For now, we require a target location for fetch' kwargs = {} dargs = vars(args) for attr in dargs: if attr.startswith('fetch_'): if attr == 'fetch_type': kwargs['model_file'] = f'teleOp_{args.fetch_type}.xml' elif attr != 'fetch_total_timestep': kwargs[attr[len('fetch_'):]] = dargs[attr] env = goexplore_py.complex_fetch_env.ComplexFetchEnv( **kwargs ) elif args.game == 'fetch_dumb': env = goexplore_py.dumb_fetch_env.ComplexFetchEnv() else: env = gym.make(args.game + 'NoFrameskip-v4') if args.seed_env: env.seed(0) # if args.unlimited_score: # # This removes the TimeLimit wrapper around the env # env = env.env # env = PreventSlugEnv(env) # change for long runs # env._max_episode_steps *= 1000 env = bench.Monitor(env, "{}.monitor.json".format(rank), allow_early_resets=True) if False and rank%nenvs == 0 and hvd.local_rank()==0: os.makedirs(args.save_path + '/vids/' + args.game, exist_ok=True) videofile_prefix = args.save_path + '/vids/' + args.game env = VideoWriter(env, videofile_prefix) if 'fetch' not in args.game: if args.noops: os.makedirs(args.save_path, exist_ok=True) env = NoopResetEnv(env, 30, nenvs, args.save_path, num_per_noop=args.num_per_noop, unlimited_score=args.unlimited_score) env = my_wrapper(env, clip_rewards=True, sticky=args.sticky) if args.epsgreedy: env = EpsGreedyEnv(env) else: os.makedirs(f'{args.save_path}', exist_ok=True) env = FetchSaveEnv(env, rank=rank, n_ranks=nenvs, save_path=f'{args.save_path}/', demo_path=args.demo) env = TanhWrap(env) # def print_rec(e): # print(e.__class__.__name__) # if hasattr(e, 'env'): # print_rec(e.env) # import time # import random # time.sleep(random.random() * 10) # print('\tSHOWING STUFF') # print_rec(env) # print('\n\n\n') return env return env_fn nenvs = args.nenvs env = SubprocVecEnv([make_env(i + nenvs * hvd.rank()) for i in range(nenvs)]) env = VecFrameStack(env, 1 if 'fetch' in args.game else 4) if 'fetch' in args.game: print('Fetch environment, using the feedforward policy.') args.policy = FFPolicy else: args.policy = {'cnn': CnnPolicy, 'gru': GRUPolicy}[args.policy] args.sil_pg_weight_by_value = False args.sil_vf_relu = False args.sil_vf_coef = 0 args.sil_coef = 0 args.sil_ent_coef = 0 args.ent_coef = 0 args.vf_coef = 0 args.cliprange = 1 args.l2_coef = 0 args.adam_epsilon = 1e-8 args.gamma 
= 0.99 args.lam = 0.10 args.scale_rewards = 1.0 args.sil_weight_success_rate = True args.norm_adv = 1.0 args.log_interval = 1 args.save_interval = 100 args.subtract_rew_avg = True args.clip_rewards = False learn(env, args, True)
def learn(env, network, seed=None, lr=5e-4, scale=255.0, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, grad_norm_clip=10, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model set_global_seeds(seed) input_shape = (1, ) + env.observation_space.shape actions = env.action_space.n q_model = build_q_model(network, input_shape=input_shape, actions=actions, **network_kwargs) actor = Agent(q_model, env.action_space, lr=lr, scale=scale) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model.pth") model_saved = False if os.path.exists(model_file): actor.load(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True now = time.time() for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: epsilon = exploration.value(t) update_param_noise_threshold = 0. action = actor.act(obs, epsilon) else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) action = actor.act_with_param_noise( obs, update_param_noise_threshold=update_param_noise_threshold, update_param_noise_scale=True) # action = act(np.array(obs), update_eps=update_eps, **kwargs) reset = False new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None inputs = (obses_t, actions, rewards, obses_tp1, dones, weights) td_errors = actor.update(inputs, double_q=True, gamma=gamma, grad_norm_clip=10) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
actor.sync() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("Speed", t / (time.time() - now)) logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) actor.save(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) actor.load(model_file) return actor
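# Hedged usage sketch for the learn() above (the Agent/build_q_model variant); the env id,
# network name, and scale=1.0 for non-image observations are illustrative assumptions.
import gym
env = gym.make('CartPole-v0')
trained_actor = learn(env, network='mlp', total_timesteps=100000, scale=1.0, print_freq=10)
env.close()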
import copy import sys # Use algorithms from baselines from baselines.acktr.acktr_cont import learn from baselines.acktr.policies import GaussianMlpPolicy from baselines.acktr.value_functions import NeuralNetValueFunction from baselines.common import set_global_seeds env = gym.make('GazeboModularScara3DOF-v3') initial_observation = env.reset() print("Initial observation: ", initial_observation) env.render() seed=0 set_global_seeds(seed) env.seed(seed) with tf.Session(config=tf.ConfigProto()) as session: ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] with tf.variable_scope("vf"): vf = NeuralNetValueFunction(ob_dim, ac_dim) with tf.variable_scope("pi"): policy = GaussianMlpPolicy(ob_dim, ac_dim) learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97, timesteps_per_batch=2500,
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs ): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_variables(model_file) return act
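# Hedged usage sketch for the deepq learn() above; the env id and hyperparameters are assumptions.
import gym
env = gym.make('CartPole-v0')
act = learn(env, network='mlp', lr=1e-3, total_timesteps=100000, exploration_fraction=0.1)
obs = env.reset()
action = act(obs[None])[0]  # query the trained policy for a single action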
def train(env, policy, n_episodes, horizon, seed, njobs=1, **alg_args): if env.startswith('rllab.'): # Get env name and class env_name = re.match('rllab.(\w+)', env).group(1) env_rllab_class = rllab_env_from_name(env_name) # Define env maker def make_env(): env_rllab = env_rllab_class() _env = Rllab2GymWrapper(env_rllab) return _env # Used later env_type = 'rllab' else: # Normal gym, get if Atari or not. env_type = get_env_type(env) assert env_type is not None, "Env not recognized." # Define the correct env maker if env_type == 'atari': # Atari, custom env creation def make_env(): _env = make_atari(env) return wrap_deepmind(_env) else: # Not atari, standard env creation def make_env(): env_rllab = gym.make(env) return env_rllab if policy == 'linear': hid_size = num_hid_layers = 0 elif policy == 'nn': hid_size = [100, 50, 25] num_hid_layers = 3 if policy == 'linear' or policy == 'nn': def make_policy(name, ob_space, ac_space): return MlpPolicy( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=hid_size, num_hid_layers=num_hid_layers, gaussian_fixed_var=True, use_bias=False, use_critic=False, hidden_W_init=tf.contrib.layers.xavier_initializer(), output_W_init=tf.contrib.layers.xavier_initializer()) elif policy == 'cnn': def make_policy(name, ob_space, ac_space): return CnnPolicy( name=name, ob_space=ob_space, ac_space=ac_space, gaussian_fixed_var=True, use_bias=False, use_critic=False, hidden_W_init=tf.contrib.layers.xavier_initializer(), output_W_init=tf.contrib.layers.xavier_initializer()) else: raise Exception('Unrecognized policy type.') sampler = ParallelSampler(make_policy, make_env, n_episodes, horizon, True, n_workers=njobs, seed=seed) try: affinity = len(os.sched_getaffinity(0)) except: affinity = njobs sess = U.make_session(affinity) sess.__enter__() set_global_seeds(seed) gym.logger.setLevel(logging.WARN) pois.learn(make_env, make_policy, n_episodes=n_episodes, horizon=horizon, sampler=sampler, **alg_args) sampler.close()
def learn(policy, env, seed, n_steps=20, n_stack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, max_grad_norm=10, learning_rate=7e-4, lr_schedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, correction_term=10.0, trust_region=True, alpha=0.99, delta=1): """ Train an ACER model. :param policy: (ACERPolicy) The policy model to use (MLP, CNN, LSTM, ...) :param env: (Gym environment) The environment to learn from :param seed: (int) The initial seed for training :param n_steps: (int) The number of steps to run for each environment :param n_stack: (int) The number of stacked frames :param total_timesteps: (int) The total number of samples :param q_coef: (float) Q function coefficient for the loss calculation :param ent_coef: (float) Entropy coefficient for the loss caculation :param max_grad_norm: (float) The maximum value for the gradient clipping :param learning_rate: (float) The learning rate :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop') :param rprop_epsilon: (float) RMS prop optimizer epsilon :param rprop_alpha: (float) RMS prop optimizer decay :param gamma: (float) Discount factor :param log_interval: (int) The number of timesteps before logging. :param buffer_size: (int) The buffer size in number of steps :param replay_ratio: (float) The number of replay learning per on policy learning on average, using a poisson distribution :param replay_start: (int) The minimum number of steps in the buffer, before learning replay :param correction_term: (float) The correction term for the weights :param trust_region: (bool) Enable Trust region policy optimization loss :param alpha: (float) The decay rate for the Exponential moving average of the parameters :param delta: (float) trust region delta value """ print("Running Acer Simple") print(locals()) set_global_seeds(seed) n_envs = env.num_envs ob_space = env.observation_space ac_space = env.action_space num_procs = len(env.remotes) # HACK model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, n_envs=n_envs, n_steps=n_steps, n_stack=n_stack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, learning_rate=learning_rate, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lr_schedule=lr_schedule, correction_term=correction_term, trust_region=trust_region, alpha=alpha, delta=delta) runner = Runner(env=env, model=model, n_steps=n_steps, n_stack=n_stack) if replay_ratio > 0: buffer = Buffer(env=env, n_steps=n_steps, n_stack=n_stack, size=buffer_size) else: buffer = None n_batch = n_envs * n_steps acer = Acer(runner, model, buffer, log_interval) acer.t_start = time.time() for acer.steps in range( 0, total_timesteps, n_batch ): # n_batch samples, 1 on_policy call and multiple off-policy calls acer.call(on_policy=True) if replay_ratio > 0 and buffer.has_atleast(replay_start): samples_number = np.random.poisson(replay_ratio) for _ in range(samples_number): acer.call(on_policy=False) # no simulation steps in this env.close()
def train(args, extra_args): env_type, env_id = get_env_type(args) print('env_type: {}'.format(env_type)) total_timesteps = int(args.num_timesteps) seed = args.seed set_global_seeds(seed) learn = get_learn_function(args.alg) alg_kwargs = get_learn_function_defaults(args.alg, env_type) alg_kwargs.update(extra_args) env = build_env(args, normalize_ob=False) eval_env = build_env(args, normalize_ob=False, is_eval=True) if args.save_video_interval != 0: env = VecVideoRecorder( env, osp.join(logger.get_dir(), "videos"), record_video_trigger=lambda x: x % args.save_video_interval == 0, video_length=args.save_video_length) if args.network: alg_kwargs['network'] = args.network else: if alg_kwargs.get('network') is None: alg_kwargs['network'] = get_default_network(env_type) beta = -1 if beta < 0: #print(alg_kwargs) nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch'] # Automatically compute beta based on initial entropy and number of iterations policy = build_policy( env, alg_kwargs['network'], value_network='copy', normalize_observations=alg_kwargs['normalize_observations'], copos=True) ob = observation_placeholder(env.observation_space) sess = U.single_threaded_session() sess.__enter__() with tf.variable_scope("tmp_pi"): tmp_pi = policy(observ_placeholder=ob) sess.run(tf.global_variables_initializer()) tmp_ob = np.zeros((1, ) + env.observation_space.shape) entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob}) #beta = 2 * entropy / nr_episodes beta = 0 print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes)) print("Constantly set beta: " + str(beta)) print('Training {} on {}:{} with arguments \n{}'.format( args.alg, env_type, env_id, alg_kwargs)) iters = 0 for model in learn(env=env, env_id=env_id, eval_env=eval_env, make_eval_env=lambda: build_env( args, normalize_ob=False, is_eval=True), seed=seed, beta=beta, total_timesteps=total_timesteps, sil_update=args.sil_update, sil_loss=args.sil_loss, **alg_kwargs): if args.store_ckpt: save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters)) model.save(save_path) if isinstance(env, VecNormalize): rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters)) with open(rms_path, 'wb') as f: rms = (env.ob_rms, env.ret_rms) pickle.dump(rms, f) logger.log('Save {} model'.format(iters + 1)) iters += 1 return model, env
def learn(self, total_timesteps, noptepochs=4, seed=None, log_interval=10, save_interval=10): set_global_seeds(seed) total_timesteps = int(total_timesteps) # Calculate the batch_size is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() for update in range(1, total_timesteps): assert self.nbatch % self.nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / total_timesteps # Calculate the learning rate lrnow = self.ppo_model.lr(frac) # Calculate the cliprange cliprangenow = self.ppo_model.cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, actions, values, neglogpacs, epinfos = self.ppo_model.runner.run( ) # pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. mblossvals = [] # Index of each element of batch_size # Create the indices array inds = np.arange(self.nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, self.nbatch, self.nbatch_train): end = start + self.nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, actions, values, neglogpacs)) mblossvals.append( self.ppo_model.train(lrnow, cliprangenow, *slices)) # TODO: recurrent version # else: # recurrent version # assert self.nenvs % self.nminibatches == 0 # envsperbatch = self.nenvs // self.nminibatches # envinds = np.arange(self.nenvs) # flatinds = np.arange(self.nenvs * self.nsteps).reshape(self.nenvs, self.nsteps) # for _ in range(self.noptepochs): # np.random.shuffle(envinds) # for start in range(0, self.nenvs, envsperbatch): # end = start + envsperbatch # mbenvinds = envinds[start:end] # mbflatinds = flatinds[mbenvinds].ravel() # slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) # mbstates = states[mbenvinds] # mblossvals.append(self.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(self.nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.record_tabular("fps", fps) logger.record_tabular( 'eprewmean', safe_mean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular( 'eplenmean', safe_mean([epinfo['l'] for epinfo in epinfobuf])) logger.record_tabular("misc/serial_timesteps", update * self.nsteps) logger.record_tabular("misc/nupdates", update) logger.record_tabular("misc/total_timesteps", update * self.nbatch) logger.record_tabular("misc/explained_variance", float(ev)) logger.record_tabular('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, self.ppo_model.loss_names): logger.record_tabular('ppo_loss/' + lossname, lossval) if is_mpi_root: logger.dump_tabular() if save_interval and (update % save_interval == 0 or update == 1) and is_mpi_root: file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S', time.localtime(time.time())) model_save_path = self.ppo_model.def_path_pre + file_name self.ppo_model.save(model_save_path) return 
self
def run_hoof_all( network, env, total_timesteps, timesteps_per_batch, # what to train on kl_range, gamma_range, lam_range, # advantage estimation num_kl, num_gamma_lam, cg_iters=10, seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' MPI = None nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space # +2 for gamma, lambda ob = tf.placeholder(shape=(None, env.observation_space.shape[0] + 2), dtype=env.observation_space.dtype, name='Ob') with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = 
tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_ratio = U.function( [ob, ac, atarg], ratio) # IS ratio - used for computing IS weights compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator_with_gl(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' kl_range = np.atleast_1d(kl_range) gamma_range = np.atleast_1d(gamma_range) lam_range = np.atleast_1d(lam_range) while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() thbefore = get_flat() rand_gamma = gamma_range[0] + ( gamma_range[-1] - gamma_range[0]) * np.random.rand(num_gamma_lam) rand_lam = lam_range[0] + ( lam_range[-1] - lam_range[0]) * np.random.rand(num_gamma_lam) rand_kl = kl_range[0] + (kl_range[-1] - kl_range[0]) * np.random.rand(num_kl) opt_polval = -10**8 est_polval = np.zeros((num_gamma_lam, num_kl)) ob_gam_lam = [] tdlamret = [] vpred = [] for gl in range(num_gamma_lam): obgl, vpredbefore, atarg, tdlr = add_vtarg_and_adv_with_gl( pi, seg, rand_gamma[gl], rand_lam[gl]) ob_gam_lam += [obgl] tdlamret += [tdlr] vpred += [vpredbefore] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate pol_ob = np.concatenate( (seg['ob'], np.zeros(seg['ob'].shape[:-1] + (2, ))), axis=-1) args = pol_ob, seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + 
cg_damping * p assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=False) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) surrbefore = lossbefore[0] for m, kl in enumerate(rand_kl): lm = np.sqrt(shs / kl) fullstep = stepdir / lm thnew = thbefore + fullstep set_from_flat(thnew) # compute the IS estimates lik_ratio = compute_ratio(*args) est_polval[gl, m] = wis_estimate(seg, lik_ratio) # update best policy found so far if est_polval[gl, m] > opt_polval: opt_polval = est_polval[gl, m] opt_th = thnew opt_kl = kl opt_gamma = rand_gamma[gl] opt_lam = rand_lam[gl] opt_vpredbefore = vpredbefore opt_tdlr = tdlr meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore expectedimprove = g.dot(fullstep) set_from_flat(thbefore) logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) set_from_flat(opt_th) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) ob_gam_lam = np.concatenate(ob_gam_lam, axis=0) tdlamret = np.concatenate(tdlamret, axis=0) vpred = np.concatenate(vpred, axis=0) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (ob_gam_lam, tdlamret), include_final_partial_batch=False, batch_size=num_gamma_lam * 64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpred, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("Opt_KL", opt_kl) logger.record_tabular("gamma", opt_gamma) logger.record_tabular("lam", opt_lam) if rank == 0: logger.dump_tabular() return pi
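# --- Illustrative sketch (an assumption, not the original wis_estimate): one common form of the
# weighted importance sampling estimate that run_hoof_all relies on when ranking candidate
# (kl, gamma, lam) settings. Each candidate policy is scored from the same batch of trajectories
# by weighting every episode return with the product of its per-step likelihood ratios, then
# normalising by the sum of weights. Real implementations often work in log-space to avoid
# numerical under/overflow in the ratio products.

import numpy as np

def wis_estimate_sketch(ep_returns, ep_lik_ratios):
    """ep_returns: list of episode returns.
    ep_lik_ratios: list of 1-D arrays with per-step pi_new/pi_old ratios for each episode."""
    weights = np.array([np.prod(r) for r in ep_lik_ratios])
    returns = np.asarray(ep_returns, dtype=np.float64)
    denom = weights.sum()
    if denom <= 0:
        return 0.0
    return float((weights * returns).sum() / denom)

# Example: two episodes; the candidate policy slightly favours the first one.
print(wis_estimate_sketch([10.0, 2.0],
                          [np.array([1.1, 1.05]), np.array([0.9, 0.95])]))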
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) logger.log(tf.trainable_variables()) # Load VGG-m Conv Layer parameters load_vggm_conv('checkpoint/ACT1-4.ckpt') if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) if init_fn is not None: init_fn() # Start total timer tfirststart = time.perf_counter() # Dylan, for tensorboard writer = tf.summary.FileWriter(logger.get_dir(), tf.get_default_session().graph) ep_stats = stats( ["Total_timesteps", "EpRewMean", "EpLenMean", "FraRewMean"]) nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) if is_mpi_root: EpRewMean = safemean([epinfo['r'] for epinfo in epinfobuf]) EpLenMean = safemean([epinfo['l'] for epinfo in epinfobuf]) ep_stats.add_all_summary( writer, [update * nbatch, EpRewMean, EpLenMean, EpRewMean / EpLenMean], update) return model
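# --- Illustrative sketch (assumption: a numpy-only rendering of the non-recurrent minibatch loop
# above; array contents are toy data). Each optimisation epoch reshuffles the flat batch of
# nbatch = nenvs * nsteps transitions and walks over it in contiguous chunks of nbatch_train
# indices, so every transition is used exactly once per epoch.

import numpy as np

nenvs, nsteps, nminibatches, noptepochs = 2, 8, 4, 3
nbatch = nenvs * nsteps
nbatch_train = nbatch // nminibatches

obs = np.arange(nbatch)            # stand-in for obs, returns, actions, values, neglogpacs, ...
returns = np.arange(nbatch) * 0.1

inds = np.arange(nbatch)
for epoch in range(noptepochs):
    np.random.shuffle(inds)
    for start in range(0, nbatch, nbatch_train):
        mbinds = inds[start:start + nbatch_train]
        mb_obs, mb_returns = (arr[mbinds] for arr in (obs, returns))
        # model.train(lrnow, cliprangenow, mb_obs, mb_returns, ...) would go here
        print(epoch, start, mb_obs)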
def learn( *, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs): #last_perfm=[] ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "Entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = 
MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("AverageReturn", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 #if ((total_timesteps % timesteps_per_batch==0) and (total_timesteps//timesteps_per_batch-iters_so_far<50)) or ((total_timesteps % timesteps_per_batch!=0) and (total_timesteps // timesteps_per_batch +1 - iters_so_far < 50)): # last_perfm.append(np.mean(rewbuffer)) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() '''logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("AverageReturn", np.mean(np.mean(last_perfm))) logger.record_tabular("EpThisIter", len(lens)) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular()''' return pi
def train_mirror(env_id, num_timesteps, seed): from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror U.make_session(num_cpu=1).__enter__() set_global_seeds(seed) env = gym.make(env_id) env.env.assist_timeout = 100.0 def policy_fn(name, ob_space, ac_space): return mlp_mirror_policy.MlpMirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=3, gmm_comp=1, mirror_loss=True, observation_permutation=np.array( [0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8, 9, 10, -17, 18, -19, -24, 25, -26, 27, -20, 21, -22, 23, \ 28, 29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36, 37, 38, 39, -46, 47, -48, -53, 54, -55, 56, -49, 50, -51, 52, 58, 57, 59]), action_permutation=np.array( [-6, 7, -8, 9, 10, 11, -0.001, 1, -2, 3, 4, 5, -12, 13, -14, -19, 20, -21, 22, -15, 16, -17, 18])) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True) env.seed(seed + MPI.COMM_WORLD.Get_rank()) gym.logger.setLevel(logging.WARN) previous_params = None iter_num = 0 last_iter = False # if initialize from previous runs #previous_params = joblib.load('') #env.env.env.assist_schedule = [] joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True) reward_threshold = None while True: if not last_iter: rollout_length_thershold = env.env.env.assist_schedule[2][ 0] / env.env.env.dt else: rollout_length_thershold = None opt_pi, rew = pposgd_mirror.learn( env, policy_fn, max_timesteps=num_timesteps, timesteps_per_batch=int(2500), clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, sym_loss_weight=4.0, positive_rew_enforce=False, init_policy_params=previous_params, reward_drop_bound=500, rollout_length_thershold=rollout_length_thershold, policy_scope='pi' + str(iter_num), return_threshold=reward_threshold, ) if iter_num == 0: reward_threshold = 0.7 * rew if last_iter: reward_threshold = None iter_num += 1 opt_variable = opt_pi.get_variables() previous_params = {} for i in range(len(opt_variable)): cur_val = opt_variable[i].eval() previous_params[opt_variable[i].name] = cur_val # update the assist schedule for s in range(len(env.env.env.assist_schedule) - 1): env.env.env.assist_schedule[s][1] = np.copy( env.env.env.assist_schedule[s + 1][1]) env.env.env.assist_schedule[-1][1][0] *= 0.75 env.env.env.assist_schedule[-1][1][1] *= 0.75 if env.env.env.assist_schedule[-1][1][0] < 5.0: env.env.env.assist_schedule[-1][1][0] = 0.0 if env.env.env.assist_schedule[-1][1][1] < 5.0: env.env.env.assist_schedule[-1][1][1] = 0.0 zero_assist = True for s in range(len(env.env.env.assist_schedule) - 1): for v in env.env.env.assist_schedule[s][1]: if v != 0.0: zero_assist = False print('Current Schedule: ', env.env.env.assist_schedule) if zero_assist: last_iter = True print('Entering Last Iteration!') env.close()
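# --- Illustrative sketch (an assumption about the permutation encoding used by the
# Mlp*MirrorPolicy classes above, inferred from the arrays they are given): each entry holds the
# source index of the mirrored observation/action dimension, with its sign indicating whether the
# value is negated; values like 0.0001 stand in for index 0 so that the sign can still be encoded
# (a literal -0 would be lost). Applying such a permutation to a batch could then look roughly
# like this.

import numpy as np

def apply_signed_permutation(x, perm):
    idx = np.abs(np.round(perm)).astype(int)    # 0.0001 rounds to index 0
    sign = np.where(perm < 0, -1.0, 1.0)
    return sign * x[..., idx]

obs = np.array([0.3, -0.7, 1.2, 0.5])
perm = np.array([0.0001, -1, 3, 2])             # keep dim 0, negate dim 1, swap dims 2 and 3
print(apply_signed_permutation(obs, perm))      # [ 0.3  0.7  0.5  1.2]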
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0, trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs): ''' Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf) Train an agent with given network architecture on a given environment using ACER. Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) (default: 20) nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension (last image dimension) (default: 4) total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M) q_coef: float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods) ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01) max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10), lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) lrschedule: schedule of learning rate. 
Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) rprop_alpha: float, RMSProp decay parameter (default: 0.99) gamma: float, reward discounting factor (default: 0.99) log_interval: int, number of updates between logging events (default: 100) buffer_size: int, size of the replay buffer (default: 50k) replay_ratio: int, now many (on average) batches of data to sample from the replay buffer take after batch from the environment (default: 4) replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k) c: float, importance weight clipping factor (default: 10) trust_region bool, whether or not algorithms estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True) delta: float, max KL divergence between the old policy and updated policy (default: 1) alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99) load_path: str, path to load the model from (default: None) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. ''' print("Running Acer Simple") print(locals()) set_global_seeds(seed) if not isinstance(env, VecFrameStack): env = VecFrameStack(env, 1) policy = build_policy(env, network, estimate_q=True, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nstack = env.nstack model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, c=c, trust_region=trust_region, alpha=alpha, delta=delta) runner = Runner(env=env, model=model, nsteps=nsteps) if replay_ratio > 0: buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size) else: buffer = None nbatch = nenvs*nsteps acer = Acer(runner, model, buffer, log_interval) acer.tstart = time.time() for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls acer.call(on_policy=True) if replay_ratio > 0 and buffer.has_atleast(replay_start): n = np.random.poisson(replay_ratio) for _ in range(n): acer.call(on_policy=False) # no simulation steps in this return model
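# --- Illustrative sketch (assumption: a plain numpy stand-in for the Polyak / exponential-moving-
# average update that the `alpha` parameter above controls; ACER keeps such an averaged copy of
# the policy parameters for its trust-region update). Each step moves the averaged parameters a
# small amount toward the current ones.

import numpy as np

def polyak_update(avg_params, params, alpha=0.99):
    # avg <- alpha * avg + (1 - alpha) * current
    return [alpha * a + (1.0 - alpha) * p for a, p in zip(avg_params, params)]

avg = [np.zeros(3)]
for step in range(1, 4):
    current = [np.full(3, float(step))]
    avg = polyak_update(avg, current, alpha=0.99)
print(avg[0])   # creeps slowly toward the current parameters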
def learn( network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, save_interval=100, load_path=None, **network_kwargs): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 # print(env.action_space.shape) # print(env.action_space.n) # nb_actions = env.action_space.shape[-1] continuous_ctrl = not isinstance(env.action_space, spaces.Discrete) nb_actions = env.action_space.shape[ -1] if continuous_ctrl else env.action_space.n # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. action_shape = env.action_space.shape if continuous_ctrl else ( nb_actions, ) # print(env.action_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # max_action = env.action_space.high max_action = 1 logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. 
agent.initialize(sess) if load_path is not None: load_path = "{}/{}/checkpoints/checkpoints-final".format( Config.results_dir, load_path) load_path = osp.expanduser(load_path) agent.load(load_path) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = 0.0 epoch_episode_steps = 0.0 epoch_actions = 0.0 epoch_qs = 0.0 epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # print(action) # print(np.argmax(max_action * action, axis=1)) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch action_step = action if continuous_ctrl else np.argmax( max_action * action, axis=1) new_obs, r, done, info = env.step( action_step ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv # print(new_obs) # print(r) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. # epoch_actions.append(action_step) epoch_actions += sum(action_step) epoch_qs += q agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards += episode_reward[d] episode_rewards_history.append(episode_reward[d]) epoch_episode_steps += episode_step[d] episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_action_step = eval_action if continuous_ctrl else np.argmax( max_action * eval_action, axis=1) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action_step ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. 
# XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats[ Config.tensorboard_rootdir + 'rollout/return'] = epoch_episode_rewards / float(episodes) combined_stats[Config.tensorboard_rootdir + 'rollout/return_history'] = np.mean( episode_rewards_history) combined_stats[ Config.tensorboard_rootdir + 'rollout/episode_steps'] = epoch_episode_steps / float(episodes) combined_stats[Config.tensorboard_rootdir + 'rollout/actions_mean'] = epoch_actions / float(t) combined_stats[Config.tensorboard_rootdir + 'rollout/Q_mean'] = epoch_qs / float(t) combined_stats[Config.tensorboard_rootdir + 'train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats[Config.tensorboard_rootdir + 'train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats[Config.tensorboard_rootdir + 'train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats[Config.tensorboard_rootdir + 'total/duration'] = duration combined_stats[Config.tensorboard_rootdir + 'total/steps_per_second'] = float(t) / float(duration) combined_stats[Config.tensorboard_rootdir + 'total/episodes'] = episodes combined_stats[Config.tensorboard_rootdir + 'rollout/episodes'] = epoch_episodes # combined_stats[Config.tensorboard_rootdir+'rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats[Config.tensorboard_rootdir + 'eval/return'] = eval_episode_rewards combined_stats[Config.tensorboard_rootdir + 'eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats[Config.tensorboard_rootdir + 'eval/Q'] = eval_qs combined_stats[Config.tensorboard_rootdir + 'eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = np.array( [np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats[Config.tensorboard_rootdir + 'total/epochs'] = epoch + 1 combined_stats[Config.tensorboard_rootdir + 'total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') # logdir = logger.get_dir() # if rank == 0 and logdir: # if hasattr(env, 'get_state'): # with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: # pickle.dump(env.get_state(), f) # if eval_env and hasattr(eval_env, 'get_state'): # with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: # pickle.dump(eval_env.get_state(), f) if save_interval and (epoch % save_interval == 0 or epoch == 1) and logger.get_dir() and ( MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % epoch) print('Saving to', savepath) agent.save(savepath) return agent
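# --- Illustrative sketch (assumption: a minimal Ornstein-Uhlenbeck process in the spirit of the
# OrnsteinUhlenbeckActionNoise option parsed from noise_type above; the theta and dt values are
# common defaults, not taken from this source). Unlike independent Gaussian noise, OU noise is
# temporally correlated, which tends to give smoother exploration in continuous control.

import numpy as np

class OUNoiseSketch:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x = np.copy(mu)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x

noise = OUNoiseSketch(mu=np.zeros(2), sigma=0.2 * np.ones(2))
print([noise() for _ in range(3)])   # consecutive samples are correlated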
logdir = '/tmp/ros2learn/' + defaults['env_name'] + '/ppo2_lstm_results/' finally: logger.configure( os.path.abspath(logdir) ) csvdir = logdir + "/csv/" csv_files = [csvdir + "det_obs.csv", csvdir + "det_acs.csv", csvdir + "det_rew.csv" ] if not os.path.exists(csvdir): os.makedirs(csvdir) else: for f in csv_files: if os.path.isfile(f): os.remove(f) env = DummyVecEnv([make_env]) set_global_seeds(defaults['seed']) if isinstance(defaults['lr'], float): defaults['lr'] = constfn(defaults['lr']) else: assert callable(defaults['lr']) if isinstance(defaults['cliprange'], float): defaults['cliprange'] = constfn(defaults['cliprange']) else: assert callable(defaults['cliprange']) alg_kwargs ={ 'nlstm': defaults['nlstm'], 'layer_norm': defaults['layer_norm'] } policy = build_policy(env, defaults['network'], **alg_kwargs) nenvs = env.num_envs ob_space = env.observation_space
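# --- Illustrative sketch (assumption: a minimal stand-in showing why DummyVecEnv above takes a
# list of zero-argument environment constructors rather than already-built environments;
# DummyVecEnvSketch and ToyEnv are hypothetical names). Deferring construction lets the
# vectorised wrapper decide when, and how many, copies to instantiate.

class DummyVecEnvSketch:
    def __init__(self, env_fns):
        self.envs = [fn() for fn in env_fns]   # environments are only built here
        self.num_envs = len(self.envs)

    def reset(self):
        return [env.reset() for env in self.envs]

class ToyEnv:
    def reset(self):
        return 0.0

make_env = lambda: ToyEnv()
venv = DummyVecEnvSketch([make_env])
print(venv.num_envs, venv.reset())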
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, begin_iter=40, model_train=4, random_initial_ratio=0, exponent=1.0, K=0, beta=1, alpha=0.1, index_type='min', reward_freq=40, r_ex_coef=1, **network_kwargs): ''' # K=0: Original PPO; K=1: Single Surprise # index_type= 'min', 'max', 'ens', 'avg' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) np.set_printoptions(precision=3) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = network dynamics = build_dynamics(env, 'mlp', **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) model = make_model() if load_path is not None: model.load(load_path) # Instantiate the dynamics class object dynamics_class = Dynamics(dynamics=dynamics, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,K=K) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, dynamics=dynamics_class, index_type=index_type, ex_coef=r_ex_coef, beta=beta, reward_freq=reward_freq) if eval_env is not None: eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam, dynamics=dynamics_class, index_type=index_type, ex_coef=r_ex_coef, beta=beta, reward_freq=reward_freq) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) for _ in range(begin_iter): runner.run_begin() # Start total timer tfirststart = time.time() nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 logger.log("********** Iteration %i ************" % update) ### random_initial_ratio = 0 -> no random selection ### 0 <= random_initial_ratio < 1 if update >= int(random_initial_ratio*nupdates): runner.flag = False ### Store old parameters of every model runner.old_weights = runner.weights ### Store old parameters of every model; and store new value to old one; For later calculation model_old_weights_list = [] for index in range(K): param_temp = dynamics_class.MDP_get_flat[index]() model_old_weights_list.append(param_temp) ### Store new parameters of every model model_new_weights_list = [] ### Store training sets for test #dyn_batch_list = [] for index in range(K): ## We do not have to ramdomly order dyn_batch = runner.replay_memory.sample(batch_size=nbatch) ## Every model should train with different batches #dyn_batch_list.append(dyn_batch) dynamics_inds = np.arange(nbatch) for _ in range(model_train): # Randomize the indexes np.random.shuffle(dynamics_inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): ## nabtch_train = 64 end = start + nbatch_train dynamics_mbinds = dynamics_inds[start:end] dynamics_slices = (arrr[dynamics_mbinds] for arrr in (dyn_batch['obs0'], dyn_batch['actions'], dyn_batch['obs1'])) dynamics_class.dynamics_train(index, *dynamics_slices, runner.weights, alpha) Param_Temp = dynamics_class.MDP_get_flat[index]() model_new_weights_list.append(Param_Temp) dynamics_class.MDP_set_from_flat[index](model_old_weights_list[index]) if K >= 2: # K = 2, 3, 4, ... 
q_batch = runner.replay_memory.sample(batch_size=nbatch) dynamics_inds = np.arange(nbatch) for _ in range(model_train): # Randomize the indexes np.random.shuffle(dynamics_inds) # 0 to batch_size with batch_train_size step for start2 in range(0, nbatch, nbatch_train): ## nabtch_train = 64 end2 = start2 + nbatch_train dynamics_mbinds = dynamics_inds[start2:end2] q_slices = (arrry[dynamics_mbinds] for arrry in (q_batch['obs0'], q_batch['actions'], q_batch['obs1'])) runner.weights = dynamics_class.weight_train(runner.weights, K, *q_slices, nbatch_train, update, nupdates, exponent) ### Setting new parameters simultaneously for index in range(K): dynamics_class.MDP_set_from_flat[index](model_new_weights_list[index]) dynamics_class.old_MDP_set_from_flat[index](model_old_weights_list[index]) ### To make actual 1-step surprise # Start timer tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) # print("slices", slices) # print("*slices", *slices) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.time() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) #print("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for 
epinfo in eval_epinfobuf]) ) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and MPI.COMM_WORLD.Get_rank() == 0: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) return model
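# --- Illustrative sketch (assumption: a stand-alone numpy version of the GAE(lambda) computation
# that the Runner objects above perform with their gamma and lam arguments; episode boundaries
# (dones) are omitted for brevity). delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), and the
# advantage is the (gamma * lam)-discounted sum of the deltas, accumulated backwards in time.

import numpy as np

def gae_sketch(rewards, values, last_value, gamma=0.99, lam=0.95):
    T = len(rewards)
    values_ext = np.append(values, last_value)
    advantages = np.zeros(T)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values_ext[t + 1] - values_ext[t]
        lastgaelam = delta + gamma * lam * lastgaelam
        advantages[t] = lastgaelam
    returns = advantages + values      # the "tdlamret" targets used to fit the value function
    return advantages, returns

adv, ret = gae_sketch(np.array([1.0, 1.0, 1.0]), np.array([0.5, 0.5, 0.5]), last_value=0.5)
print(adv, ret)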
def train_mirror(args, num_timesteps): from baselines.ppo1 import mlp_mirror_policy, mlp_mirror_norms_policy, pposgd_mirror U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) env = gym.make(args.env) env.env._seed(args.seed + MPI.COMM_WORLD.Get_rank()) env.env.init_params(args) U.ALREADY_INITIALIZED = set() U.ALREADY_INITIALIZED.update(set(tf.global_variables())) obs_per = np.array([ 0.0001, -1, 2, -3, -4, 11, 12, 13, 14, 15, 16, 5, 6, 7, 8, 9, 10, -17, 18, -19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, -30, 31, -32, -33, 40, 41, 42, 43, 44, 45, 34, 35, 36, 37, 38, 39, -46, 47, -48, 53, 54, 55, 56, 49, 50, 51, 52 ]) if env.env.include_additional_info: obs_per = np.concatenate((obs_per, np.array([58, 57]))) obs_per = np.concatenate((obs_per, np.array([59]))) obs_per = np.concatenate((obs_per, np.array([63, 64, -65, 60, 61, -62]))) obs_per = np.concatenate((obs_per, np.array([66, 67, -68]))) obs_per = np.concatenate((obs_per, np.array([72, 73, -74, 69, 70, -71]))) obs_per = np.concatenate((obs_per, np.array([75, 76, -77]))) obs_per = np.concatenate((obs_per, np.array([78, 79, -80]))) assert env.env.obs_dim == (57 + 3 + 3 * 6 + 3) assert env.env.act_dim == 21 # change action/state permutation if change action/state in env def policy_fn(name, ob_space, ac_space): if env.env.env.state_self_standardize: return mlp_mirror_norms_policy.MlpMirrorNormsPolicy( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=args.hsize, num_hid_layers=args.layers, gmm_comp=1, mirror_loss=True, observation_permutation=obs_per, action_permutation=np.array([ 5, 6, 7, 8, 9, 0.0001, 1, 2, 3, 4, -10, 11, -12, 17, 18, 19, 20, 13, 14, 15, 16 ])) else: return mlp_mirror_policy.MlpMirrorPolicy( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=args.hsize, num_hid_layers=args.layers, gmm_comp=1, mirror_loss=True, observation_permutation=obs_per, action_permutation=np.array([ 5, 6, 7, 8, 9, 0.0001, 1, 2, 3, 4, -10, 11, -12, 17, 18, 19, 20, 13, 14, 15, 16 ])) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True) env.seed(args.seed + MPI.COMM_WORLD.Get_rank()) gym.logger.setLevel(logging.WARN) joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True) with open(logger.get_dir() + '/env_specs.txt', 'w') as f: pprint.pprint(env.env.env.__dict__, f) f.close() shutil.copyfile(env.env.env.model_file_name, logger.get_dir() + '/using_model.skel') cur_sym_loss = 1.0 iter_num = 0 previous_params = None # previous_params = joblib.load('') reward_threshold = None rollout_length_threshold = None pposgd_mirror.learn( env, policy_fn, max_timesteps=num_timesteps, timesteps_per_batch=int(2000), clip_param=args.clip, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, sym_loss_weight=cur_sym_loss, init_policy_params=previous_params, reward_drop_bound=None, rollout_length_threshold=rollout_length_threshold, policy_scope='pi' + str(iter_num), return_threshold=reward_threshold, ) env.close()
def main(): import argparse parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='DartWalker3d-v1') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument( '--init_policy', help='Initial Policy', default= 'data/ppo_DartWalker3d-v1303_energy04_vel15_mirror4_velrew3_asinput_curriculum/policy_params.pkl' ) parser.add_argument('--init_curriculum', help='Initial Curriculum', nargs='+', default=[2000.0, 1000]) parser.add_argument( '--ref_policy', help='Reference Policy', default= 'data/ppo_DartWalker3d-v1303_energy04_vel15_mirror4_velrew3_asinput_curriculum/policy_params.pkl' ) parser.add_argument('--ref_curriculum', help='Reference Curriculum', nargs='+', default=[2000.0, 1000]) parser.add_argument('--anc_thres', help='Anchor Threshold', type=float, default=0.85) parser.add_argument('--prog_thres', help='Progress Threshold', type=float, default=0.6) parser.add_argument('--batch_size', help='Batch Size', type=int, default=2500) parser.add_argument('--max_iter', help='Maximum Iteration', type=int, default=2000) parser.add_argument('--use_reftraj', help='Use reference trajectory', type=int, default=0) args = parser.parse_args() logger.reset() logger.configure('data/ppo_curriculum_150eachit_vel15_runningavg3_e04_' + args.env + '_' + str(args.seed) + '_' + str(args.anc_thres) + '_' + str(args.prog_thres) + '_' + str(args.batch_size)) sess = U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) env = gym.make(args.env) ob_space = env.observation_space ac_space = env.action_space def policy_fn(name, ob_space, ac_space): return mlp_mirror_policy.MlpMirrorPolicy( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=3, gmm_comp=1, mirror_loss=True, observation_permutation=np.array([ 0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8, -9, -10, 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28, 35, -36, -37, 38, 39, -40, 29, -30, -31, 32, 33, -34, 42, 41, 43 ]), action_permutation=np.array([ -0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8 ])) policy = policy_fn('policy', ob_space, ac_space) init_curriculum = np.array(args.init_curriculum) ref_policy = policy_fn('ref_policy', ob_space, ac_space) ref_curriculum = np.array(args.ref_curriculum) policy_params = joblib.load(args.init_policy) ref_policy_params = joblib.load(args.ref_policy) U.initialize() cur_scope = policy.get_variables()[0].name[0:policy.get_variables()[0]. name.find('/')] orig_scope = list( policy_params.keys())[0][0:list(policy_params.keys())[0].find('/')] ref_scope = list(ref_policy_params.keys())[0][0:list(ref_policy_params. 
keys())[0].find('/')] for i in range(len(policy.get_variables())): assign_op = policy.get_variables()[i].assign( policy_params[policy.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) sess.run(assign_op) assign_op = ref_policy.get_variables()[i].assign( ref_policy_params[ref_policy.get_variables()[i].name.replace( 'ref_' + cur_scope, ref_scope, 1)]) sess.run(assign_op) anchor_threshold = args.anc_thres progress_threshold = args.prog_thres env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True) env.seed(args.seed + MPI.COMM_WORLD.Get_rank()) gym.logger.setLevel(logging.WARN) curriculum_evolution = [] env.env.env.anchor_kp = ref_curriculum ref_score = None ref_max_score = None reference_trajectory = None if MPI.COMM_WORLD.Get_rank() == 0: if args.use_reftraj == 1: reference_trajectory = gen_reftraj(env, ref_policy, 299) env.env.reference_trajectory = reference_trajectory ref_score, ref_max_score = evaluate_policy(env, ref_policy, 20) ref_score = MPI.COMM_WORLD.bcast(ref_score, root=0) ref_max_score = MPI.COMM_WORLD.bcast(ref_max_score, root=0) reference_score = ref_score * progress_threshold reference_anchor_score = ref_score * anchor_threshold reference_max_score = ref_max_score * 0.9 env.env.env.anchor_kp = init_curriculum reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0) env.env.reference_trajectory = reference_trajectory current_curriculum = np.copy(init_curriculum) print('reference scores: ', reference_score, reference_anchor_score, reference_max_score) previous_params = policy_params for iter in range(args.max_iter): print('curriculum iter ', iter) print('ref score: ', reference_anchor_score) opt_pi, final_rew = pposgd_mirror.learn( env, policy_fn, max_timesteps=args.batch_size * MPI.COMM_WORLD.Get_size() * 150, timesteps_per_batch=int(args.batch_size), clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, sym_loss_weight=2.0, return_threshold=reference_anchor_score, init_policy_params=previous_params, policy_scope='pi' + str(iter), min_iters=0, reward_drop_bound=True, #max_threshold = reference_max_score, ) print('one learning iteration done') if np.linalg.norm(current_curriculum) >= 0.0001: # re-compute reference trajectory if MPI.COMM_WORLD.Get_rank() == 0 and args.use_reftraj == 1: print('recompute reference traj') reference_trajectory = gen_reftraj(env, opt_pi, 299) reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0) env.env.reference_trajectory = reference_trajectory if final_rew < reference_anchor_score * 0.95: print('update reference scores') reference_score = reference_score / reference_anchor_score * final_rew reference_anchor_score = final_rew closest_candidate = None if MPI.COMM_WORLD.Get_rank() == 0: directions = [ np.array([-1, 0]), np.array([0, -1]), -current_curriculum / np.linalg.norm(current_curriculum) ] int_d1 = directions[0] + directions[2] int_d2 = directions[1] + directions[2] directions.append(int_d1 / np.linalg.norm(int_d1)) directions.append(int_d2 / np.linalg.norm(int_d2)) #directions = [np.array([0.0, -1.0])] # only search in one direction candidate_next_anchors = [] for direction in directions: found_point, perf = binary_search_curriculum( env, opt_pi, current_curriculum, direction, reference_score, reference_max_score, 6) print(direction, found_point, perf) candidate_next_anchors.append(found_point) if closest_candidate is None: closest_candidate =
np.copy(found_point) elif np.linalg.norm(closest_candidate) > np.linalg.norm( found_point): closest_candidate = np.copy(found_point) if np.linalg.norm(closest_candidate) < 0.5: closest_candidate = np.array([0, 0]) if np.abs(closest_candidate[0]) < 0.5: closest_candidate[0] = 0.0 if np.abs(closest_candidate[1]) < 0.5: closest_candidate[1] = 0.0 closest_candidate = MPI.COMM_WORLD.bcast(closest_candidate, root=0) current_curriculum = np.copy(closest_candidate) env.env.env.anchor_kp = current_curriculum '''print('Update Init Pose Distributions') update_init_poses(env, opt_pi) if MPI.COMM_WORLD.Get_rank() == 0: joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir()+'/init_poses_'+np.array2string(current_curriculum, separator=',')+'.pkl', compress=True) joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir() + '/init_poses.pkl', compress=True)''' curriculum_evolution.append(current_curriculum) print('Current curriculum: ', current_curriculum) opt_variable = opt_pi.get_variables() previous_params = {} for i in range(len(opt_variable)): cur_val = opt_variable[i].eval() previous_params[opt_variable[i].name] = cur_val if np.linalg.norm(current_curriculum) < 0.0001: if reference_anchor_score < ref_score: reference_anchor_score = ref_score else: break env.close()
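# binary_search_curriculum is called above but not defined in this excerpt. A plausible
# sketch of what it does: starting from the current anchor, move along `direction` as far as
# possible while the evaluated return stays above `threshold`, using a fixed number of
# bisection steps. evaluate_policy is assumed to return (mean_return, max_return) as in the
# calls earlier in this file; max_score is unused in this simplified version.
import numpy as np

def binary_search_curriculum(env, policy, anchor, direction, threshold, max_score, steps):
    low, high = 0.0, np.linalg.norm(anchor)         # step sizes along the search direction
    best_point, best_perf = np.copy(anchor), -np.inf
    for _ in range(steps):
        mid = 0.5 * (low + high)
        candidate = anchor + mid * direction
        env.env.env.anchor_kp = candidate           # same attribute the outer loop sets
        perf, _ = evaluate_policy(env, policy, 5)
        if perf >= threshold:                       # still good enough: push further
            best_point, best_perf = np.copy(candidate), perf
            low = mid
        else:                                       # too hard: back off
            high = mid
    return best_point, best_perf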
def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, override_params={}, save_policies=True): # Fork for multi-CPU MPI implementation. if num_cpu > 1: whoami = mpi_fork(num_cpu) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: if logdir or logger.get_dir() is None: logger.configure(dir=logdir) else: logger.configure() logdir = logger.get_dir() assert logdir is not None os.makedirs(logdir, exist_ok=True) # Seed everything. rank_seed = seed + 1000000 * rank set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS params['env_name'] = env_name params['replay_strategy'] = replay_strategy if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name] ) # merge env-specific parameters in params.update( **override_params) # makes it possible to override any parameter with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) config.log_params(params, logger=logger) if num_cpu == 1: logger.warn() logger.warn('*** Warning ***') logger.warn( 'You are running HER with just a single MPI worker. This will work, but the ' + 'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' + 'were obtained with --num_cpu 19. This makes a significant difference and if you ' + 'are looking to reproduce those results, be aware of this. Please also refer to ' + 'https://github.com/openai/baselines/issues/314 for further details.' ) logger.warn('****************') logger.warn() dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: rollout_params[name] = params[name] eval_params[name] = params[name] rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) rollout_worker.seed(rank_seed) evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(rank_seed) train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, save_policies=save_policies)
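# eval_params above selects the target network when test_with_polyak is set; the target is a
# Polyak (exponential moving) average of the online network. A minimal sketch of that update
# rule, assuming plain lists of numpy parameter arrays rather than TF variables:
def polyak_update(target_params, main_params, polyak=0.95):
    # target <- polyak * target + (1 - polyak) * main, element-wise per parameter tensor.
    return [polyak * t + (1.0 - polyak) * m for t, m in zip(target_params, main_params)]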
def launch(env, trial_id, n_epochs, num_cpu, seed, policy_save_interval, clip_return, normalize_obs, structure, task_selection, goal_selection, goal_replay, task_replay, perturb, save_policies=True): # Fork for multi-CPU MPI implementation. if num_cpu > 1: try: whoami = mpi_fork(num_cpu, ['--bind-to', 'core']) except CalledProcessError: # fancy version of mpi call failed, try simple version whoami = mpi_fork(num_cpu) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: save_dir = find_save_path('./save/' + env + "/", trial_id) logger.configure(dir=save_dir) else: save_dir = None # Seed everything. rank_seed = seed + 1000000 * rank set_global_seeds(rank_seed) # Prepare params, add main function arguments and log all parameters if structure == 'curious' or structure == 'task_experts': params = config.MULTI_TASK_PARAMS else: params = config.DEFAULT_PARAMS time = str(datetime.datetime.now()) params['time'] = time params['env_name'] = env params['task_selection'] = task_selection params['goal_selection'] = goal_selection params['task_replay'] = task_replay params['goal_replay'] = goal_replay params['structure'] = structure params['normalize_obs'] = normalize_obs params['num_cpu'] = num_cpu params['clip_return'] = clip_return params['trial_id'] = trial_id params['seed'] = seed if rank == 0: with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) params['ddpg_params']['normalize_obs'] = normalize_obs if rank == 0: config.log_params(params, logger=logger) if num_cpu != 1: logger.warn() logger.warn('*** Warning ***') logger.warn( 'You are running HER with just a single MPI worker. This will work, but the ' + 'experiments that we report in Colas et al. (2018, https://arxiv.org/abs/1810.06284) ' + 'were obtained with --num_cpu 19. 
This makes a significant difference and if you ' + 'are looking to reproduce those results, be aware of this.') logger.warn('****************') logger.warn() dims = config.configure_dims(params) buffers = config.configure_buffer(dims=dims, params=params) # creates several policies with shared buffers in the task-experts structure, otherwise use just one policy if structure == 'task_experts': policy = [config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return, t_id=i) for i in range(params['nb_tasks'])] else: policy = config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], 'structure': structure, 'task_selection': task_selection, 'goal_selection': goal_selection, 'queue_length': params['queue_length'], 'eval': False, 'eps_task': params['eps_task'] } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], 'structure' : structure, 'task_selection': task_selection, 'goal_selection' : goal_selection, 'queue_length': params['queue_length'], 'eval': True, } for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: rollout_params[name] = params[name] eval_params[name] = params[name] if structure == 'task_experts': # create one rollout worker per policy/task rollout_worker = [RolloutWorker(params['make_env'], policy[i], dims, logger, unique_task=i, **rollout_params) for i in range(params['nb_tasks'])] for i in range(params['nb_tasks']): rollout_worker[i].seed(rank_seed + i) else: rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) rollout_worker.seed(rank_seed) evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(rank_seed + 100) train(logdir=save_dir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], perturbation_study=perturb, policy_save_interval=policy_save_interval, save_policies=save_policies, structure=structure, task_selection=task_selection, params=params)
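# With structure == 'task_experts' there is one DDPG policy and one rollout worker per task,
# all sharing the buffers created by config.configure_buffer; the other structures use a
# single policy for every task. A minimal dispatch sketch (collect_rollouts_for_task is a
# hypothetical helper, not part of the RolloutWorker API):
def collect_rollouts_for_task(task_id, rollout_worker):
    # rollout_worker is a list when using task experts, otherwise a single shared worker.
    worker = rollout_worker[task_id] if isinstance(rollout_worker, list) else rollout_worker
    return worker.generate_rollouts()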
def train(sess, env_id, num_timesteps, timesteps_per_actor, autoencoders, seed): from baselines.ppo1 import pposgd_novelty, mlp_policy_novelty, pposgd_novelty_projection, mlp_policy_mirror_novelty, \ pposgd_mirror_novelty, pposgd_mirror_novelty_projection rank = MPI.COMM_WORLD.Get_rank() workerseed = seed * 50 + 10000 * MPI.COMM_WORLD.Get_rank( ) if seed is not None else None set_global_seeds(workerseed) env = gym.make(env_id) def policy_fn(name, ob_space, ac_space): # pylint: disable=W0613 return mlp_policy_mirror_novelty.MlpPolicyMirrorNovelty( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=3, mirror_loss=True, observation_permutation=[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 11, 12, 13, 14, 23, 24, 25, 26, 19, 20, 21, 22 ], action_permutation=[4, 5, 6, 7, 0, 1, 2, 3]) env.env.novel_autoencoders = autoencoders env = bench.Monitor( env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) env.seed(seed + rank) if len(autoencoders) == 0: print("NO AUTOENCODER!!") model = pposgd_novelty.learn( env, policy_fn, max_timesteps=num_timesteps, # max_iters=30, timesteps_per_actorbatch=timesteps_per_actor, clip_param=0.2, entcoeff=0, optim_epochs=3, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, # Following params are kwargs session=sess, gap=5, sym_loss_weight=1, ) else: print("AUTOENCODER LENGTH: ", len(autoencoders)) model = pposgd_novelty_projection.learn( env, policy_fn, max_timesteps=num_timesteps, # max_iters=30, timesteps_per_actorbatch=timesteps_per_actor, clip_param=0.2, entcoeff=0, optim_epochs=3, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, # Following params are kwargs session=sess, gap=5, sym_loss_weight=1, ) env.close() return model
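# pposgd_novelty and pposgd_novelty_projection are not shown in this excerpt. One plausible
# reading of the autoencoder list handed to the env above is a reconstruction-error novelty
# bonus: states that every previously trained autoencoder reconstructs poorly are rewarded.
# The sketch below is an assumption about that idea (each autoencoder taken to be a callable
# returning a reconstruction), not the repository's implementation.
import numpy as np

def novelty_bonus(obs, autoencoders, scale=1.0):
    if len(autoencoders) == 0:
        return 0.0
    errors = [np.mean((ae(obs) - obs) ** 2) for ae in autoencoders]
    # Use the minimum: a state counts as novel only if *all* stored autoencoders miss it.
    return scale * float(np.min(errors))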
def main(): args = setup_and_load() comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() seed = int(time.time()) % 10000 utils.mpi_print(seed * 100 + rank) set_global_seeds(seed * 100 + rank) # For wandb package to visualize results curves config = Config.get_args_dict() config['global_seed'] = seed wandb.init(name=config["run_id"], project="coinrun", notes=" GARL generate seed", tags=["try"], config=config) utils.setup_mpi_gpus() utils.mpi_print('Set up gpu', args) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 eval_limit = Config.EVAL_STEP * 10**6 phase_eval_limit = int(eval_limit // Config.TRAIN_ITER) total_timesteps = int(Config.TOTAL_STEP * 10**6) phase_timesteps = int((total_timesteps - eval_limit) // Config.TRAIN_ITER) with tf.Session(config=config): sess = tf.get_default_session() # init env nenv = Config.NUM_ENVS env = make_general_env(nenv, rand_seed=seed) utils.mpi_print('Set up env') policy = policies_back.get_policy() utils.mpi_print('Set up policy') optimizer = SeedOptimizer(env=env, logdir=Config.LOGDIR, spare_size=Config.SPA_LEVELS, ini_size=Config.INI_LEVELS, eval_limit=phase_eval_limit, train_set_limit=Config.NUM_LEVELS, load_seed=Config.LOAD_SEED, rand_seed=seed, rep=1, log=True) step_elapsed = 0 t = 0 if args.restore_id is not None: datapoints = Config.get_load_data('default')['datapoints'] step_elapsed = datapoints[-1][0] optimizer.load() seed = optimizer.hist[-1] env.set_seed(seed) t = 16 print('loadrestore') Config.RESTORE_ID = Config.get_load_data( 'default')['args']['run_id'] Config.RUN_ID = Config.get_load_data( 'default')['args']['run_id'].replace('-', '_') while (step_elapsed < (Config.TOTAL_STEP - 1) * 10**6): # ============ GARL ================= # optimize policy mean_rewards, datapoints = learn_func( sess=sess, policy=policy, env=env, log_interval=args.log_interval, save_interval=args.save_interval, nsteps=Config.NUM_STEPS, nminibatches=Config.NUM_MINIBATCHES, lam=Config.GAE_LAMBDA, gamma=Config.GAMMA, noptepochs=Config.PPO_EPOCHS, ent_coef=Config.ENTROPY_COEFF, vf_coef=Config.VF_COEFF, max_grad_norm=Config.MAX_GRAD_NORM, lr=lambda f: f * Config.LEARNING_RATE, cliprange=lambda f: f * Config.CLIP_RANGE, start_timesteps=step_elapsed, total_timesteps=phase_timesteps, index=t) # test catastrophic forgetting if 'Forget' in Config.RUN_ID: last_set = list(env.get_seed_set()) if t > 0: curr_set = list(env.get_seed_set()) last_scores, _ = eval_test(sess, nenv, last_set, train=True, idx=None, rep_count=len(last_set)) curr_scores, _ = eval_test(sess, nenv, curr_set, train=True, idx=None, rep_count=len(curr_set)) tmp = set(curr_set).difference(set(last_set)) mpi_print("Forgetting Exp") mpi_print("Last setsize", len(last_set)) mpi_print("Last scores", np.mean(last_scores), "Curr scores", np.mean(curr_scores)) mpi_print("Replace count", len(tmp)) # optimize env step_elapsed = datapoints[-1][0] if t < Config.TRAIN_ITER: best_rew_mean = max(mean_rewards) env, step_elapsed = optimizer.run(sess, env, step_elapsed, best_rew_mean) t += 1 save_final_test = True if save_final_test: final_test = {} final_test['step_elapsed'] = step_elapsed train_set = env.get_seed() final_test['train_set_size'] = len(train_set) eval_log = eval_test(sess, nenv, train_set, train=True, is_high=False, rep_count=1000, log=True) final_test['Train_set'] = eval_log eval_log = eval_test(sess, nenv, None, train=False, is_high=True, rep_count=1000, log=True) final_test['Test_set'] = eval_log joblib.dump(final_test,
setup_utils.file_to_path('final_test')) env.close()
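# The step budget above is carved into Config.TRAIN_ITER phases, with Config.EVAL_STEP million
# steps reserved for the SeedOptimizer's evaluations. A worked example with made-up numbers
# (not the repository defaults):
EVAL_STEP, TOTAL_STEP, TRAIN_ITER = 8, 256, 16
eval_limit = EVAL_STEP * 10**6                                   # 8,000,000 steps for seed evaluation
total_timesteps = TOTAL_STEP * 10**6                             # 256,000,000 steps overall
phase_eval_limit = eval_limit // TRAIN_ITER                      # 500,000 eval steps per phase
phase_timesteps = (total_timesteps - eval_limit) // TRAIN_ITER   # 15,500,000 training steps per phase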
def learn(network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, actor_l2_reg=0.0, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=1000, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, rb_size=1e6, save_interval=5, # reduce memory footprint bc_epochs=0, load_path=None, demos_path=None, bc_teacher_lambda=0.0, use_qfilter=False, **network_kwargs): """Learns policy using imitation (maybe DAgger) w/vectorized environments. If we pass other arguments that aren't specified here, they are considered as network_kwargs. Parameters ---------- noise_type: for noise to be added to the behavior policy. They are NOT using the noise type from the paper but 'AdaptiveParamNoiseSpec'. I _think_ that if one does the OU process, we get action noise, but not parameter noise. Also, be sure to use `name_stdev` in that convention, as the code will split the argument at the underscores. actor_lr: 1e-4 (matches paper) critic_lr: 1e-3 (matches paper) critic_l2: 1e-2 (matches paper) gamma: 0.99 (matches paper) batch_size: 64 (matches paper for lower-dim env obs/states) tau: 0.01 for soft target updates of actor and critic nets. Paper used 0.001. nb_epoch_cycles: number of times we go through this cycle of: (1) get rollouts with noise added to policy and apply to replay buffer, (2) gradient updates for actor/critic, (3) evaluation rollouts (if any). AFTER all of these cycles happen, THEN we log statistics. nb_rollout_steps: number of steps in each parallel env we take with exploration policy without training, so this is just to populate the replay buffer. More parallel envs *should* mean that we get more samples in the buffer between each gradient updates of the network, so this might need to be environment *and* machine (# of CPUs) specific. nb_train_steps: after doing `nb_rollout_steps` in each parallel env, we do this many updates; each involves sampling from the replay buffer and updating the actor and critic (via lagged target updates). nb_eval_steps: 1000, I changed from the 100 as default. Using 1000 ensures that fixed length envs like Ant-v2 can get one full episode (assuming one parallel env) during evaluation stagtes. eval_env: A separate environment for evaluation only, where no noise is applied, similar to how rlkit does it. save_interval: Frequency between saving. """ set_global_seeds(seed) # Daniel: NOTE/TODO testing, if I can. USE_KERAS = False # Daniel: should be False unless I'm doing some testing. do_valid_tests = False # Daniel: this helps to maintain compatibility with PPO2 code. For now # we're ignoring it, but we should check that we're always clipping. I # changed the nb_epochs to match with PPO2 in that we divide by nenvs. if 'limit_act_range' in network_kwargs: network_kwargs.pop('limit_act_range') nenvs = env.num_envs nbatchsize = nenvs * nb_epoch_cycles * nb_rollout_steps if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // nbatchsize else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 # we assume symmetric actions. 
nb_actions = env.action_space.shape[-1] assert (np.abs(env.action_space.low) == env.action_space.high).all() # Form XP (1M steps, same as in paper), and ONLY ACTOR net, no critic. # Daniel: force dtype here so we can use uint8 type images. assert env.observation_space.low.dtype == env.observation_space.high.dtype # Also changing to (100,100) unless we do keras/pretraining, to force smaller images. if USE_KERAS: obs_shape = env.observation_space.shape else: obs_shape = (100,100,4) memory = Memory(limit=int(rb_size), action_shape=env.action_space.shape, observation_shape=obs_shape, dtype=env.observation_space.low.dtype, do_valid=do_valid_tests) actor = Actor(nb_actions, network=network, use_keras=USE_KERAS, **network_kwargs) max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) # The `learn` defaults above have priority over defaults in IMIT class. agent = IMIT(actor=actor, memory=memory, observation_shape=obs_shape, action_shape=env.action_space.shape, batch_size=batch_size, actor_l2_reg=actor_l2_reg, actor_lr=actor_lr, use_keras=USE_KERAS) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Prepare everything. sess = U.get_session() agent.initialize(sess) # -------------------------------------------------------------------------- # Daniel: similar as PPO2 code as `agent` is similar to `model` but has to # be initialized explicitly above. Must call after `agent.load` gets # created. Not sure if this works with parameter space noise or with # normalization, but I don't plan to resume training (for now). It also has # to be *before* the `graph.finalize()` because otherwise we get an error. # -------------------------------------------------------------------------- if load_path is not None: logger.info("\nInside IMIT, loading model from: {}".format(load_path)) agent.load(load_path) # -------------------------------------------------------------------------- sess.graph.finalize() # -------------------------------------------------------------------------- # Daniel: populate replay buffer, followed by behavior cloning stage. # But if load_path is not None, then doesn't make sense -- we want to load. # We also don't need to do this if timesteps is 0 (e.g., for playing policy). # -------------------------------------------------------------------------- # OK now we're assuming we do DAgger by default. If we don't want to do # DAgger (i.e., pure BC) then let's set time_steps=1 for training. # -------------------------------------------------------------------------- if total_timesteps == 0: return agent assert seed == 1500, 'We normally want seed 1500, yet: {}'.format(seed) if (demos_path is not None and load_path is None): _ddpg_demos(demos_path, agent, memory, algo='IMIT') assert memory.nb_entries == memory.nb_teach_entries, memory.nb_entries if do_valid_tests: memory.set_valid_idx() checkdir = osp.join(logger.get_dir(), 'checkpoints') statsdir = osp.join(logger.get_dir(), 'pretrain_stats') os.makedirs(checkdir, exist_ok=True) os.makedirs(statsdir, exist_ok=True) # Pretrain, based on their training code for some # of minibatches. 
pt_actor_losses = [] pt_actor_losses_l2 = [] batches_per_ep = int(memory.nb_entries / batch_size) logger.info('Running BC for {} epochs'.format(bc_epochs)) logger.info(' data size in memory: {}'.format(memory.nb_entries)) logger.info(' each batch: {}, epoch mbs: {}'.format(batch_size, batches_per_ep)) if do_valid_tests: logger.info(' memory valid idx: {}'.format(memory.nb_valid_items)) pt_start = time.time() for epoch in range(1,bc_epochs+1): losses = [] # includes L2 fyi losses_l2 = [] for _ in range(batches_per_ep): al, al_l2 = agent.train(return_l2=True) losses.append(al) losses_l2.append(al_l2) pt_actor_losses.append( np.mean(losses) ) pt_actor_losses_l2.append( np.mean(losses_l2) ) # Check and save model occasionally. if epoch == 1 or epoch % 10 == 0: pt_time = (time.time() - pt_start) / 60. logger.info(' epoch done: {}, loss over past epoch: {:.4f} (l2: {:.4f})'.format( str(epoch).zfill(4), pt_actor_losses[-1], pt_actor_losses_l2[-1])) logger.info(' elapsed time: {:.1f}m'.format(pt_time)) savepath = osp.join(checkdir, 'bc_epoch_{}'.format(str(epoch).zfill(4))) logger.info('Saving model checkpoint to: ', savepath) agent.save(savepath) # Do validation here. if do_valid_tests: num_mbs = int(memory.nb_valid_items / batch_size) l2_errors = [] for mb in range(num_mbs): res = memory.get_valid_obs(mb*batch_size, (mb+1)*batch_size) valid_obs = res['obs0'] valid_act = res['actions'] assert valid_obs.shape == (batch_size,100,100,3), valid_obs.shape acts, _, _, _ = agent.step(obs=valid_obs, apply_noise=False) l2_err_vector = np.mean((valid_act - acts)**2, axis=1) l2_errors.extend(l2_err_vector) # Last minibatch res = memory.get_valid_obs((mb+1)*batch_size, memory.nb_valid_items) valid_obs = res['obs0'] valid_act = res['actions'] acts, _, _, _ = agent.step(obs=valid_obs, apply_noise=False) l2_err_vector = np.mean((valid_act - acts)**2, axis=1) l2_errors.extend(l2_err_vector) l2_err_valid = np.mean(l2_errors) logger.log(' VALIDATION L2 error: {:.4f}'.format(l2_err_valid)) pt_time = (time.time() - pt_start) / 60. logger.info('losses a: {}'.format(np.array(pt_actor_losses))) logger.info('losses a (l2 norm of weights): {}'.format(np.array(pt_actor_losses_l2))) losses_pth = osp.join(statsdir, 'bc_losses.pkl') losses_l2_pth = osp.join(statsdir, 'bc_losses_l2_only.pkl') with open(losses_pth, 'wb') as fh: pickle.dump(pt_actor_losses, fh) with open(losses_l2_pth, 'wb') as fh: pickle.dump(pt_actor_losses_l2, fh) logger.info('Finished BC (no DAgger) in {:.1f}m.\n'.format(pt_time)) # -------------------------------------------------------------------------- # Back to their code. For cloth, `env.reset()` takes a while so we put it here. obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] # Daniel: this is how to get the cloth state for the teacher; depends on num_env. logger.info('nenvs={}, getting cloth points + teacher policy...'.format(nenvs)) if nenvs == 1: # DummyVecEnv has an `envs` "list". Then an extra `.env` to get ClothEnv. cloth_env = (env.envs[0]).env pts = (cloth_env).cloth.pts logger.info('singleton env type: {}'.format(cloth_env)) logger.info('len(points): {}'.format(len(pts))) teacher = OracleCornerPolicy() teacher.set_env_data_dummy(cloth_env) logger.info('teacher attributes: {}'.format(teacher.get_info())) teacher_list = [ teacher ] else: # SubprocVecEnv, not sure if we can obtain envs or if it's safe, so I did this.
env_attr = env.get_cloth_attributes() logger.info('env attributes: {}'.format(env_attr)) teacher_list = [] assert len(env_attr) == nenvs, len(env_attr) for env_a in env_attr: teacher = OracleCornerPolicy() teacher.set_env_data_subproc(env_a[0], env_a[1], env_a[2]) teacher_list.append(teacher) # Daniel: Debugging/sanity checks. _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) U.display_var_info(_variables) logger.info("\nInside IMIT, about to start epochs") logger.info("nbatchsize: {}, get this in buffer before IMIT updates".format(nbatchsize)) logger.info(" i.e.: (nenv {}) * (cycles {}) * (nsteps {})".format( nenvs, nb_epoch_cycles, nb_rollout_steps)) logger.info("nb_epochs: {}, number of cycles to use".format(nb_epochs)) logger.info("eval_env None? {}".format(eval_env is None)) logger.info("(end of debugging messages)\n") # File paths. checkdir = osp.join(logger.get_dir(), 'checkpoints') action_dir = osp.join(logger.get_dir(), 'actions') episode_dir = osp.join(logger.get_dir(), 'ep_all_infos') os.makedirs(checkdir, exist_ok=True) os.makedirs(action_dir, exist_ok=True) os.makedirs(episode_dir, exist_ok=True) # Daniel: use these two to store past 100 episode history. Report these stats! eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) all_eval_episode_rewards = [] # reward/step: cumulative quantities for each episode in vecenv. # epoch_{actions,qs} will grow without bound, fyi. episode_reward = np.zeros(nenvs, dtype = np.float32) #vector episode_step = np.zeros(nenvs, dtype = int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_episodes = 0 for epoch in range(nb_epochs): mb_actions = [] mb_epinfos = [] for cycle in range(nb_epoch_cycles): # Daniel: pure data collection (NO noise added) to populate replay buffer. # No training until after this, and note the parallel stepping (VecEnv). for t_rollout in range(nb_rollout_steps): # Predict next action: (#_parallel_envs, ac_dim). action, _, _, _ = agent.step(obs) if rank == 0 and render: env.render() # Before environment stepping happens, we need to run the teacher # policy here, because it needs the SAME EXACT env state. The policy # does not apply on the obs but the INTERNAL env.cloth.pts. t_actions = [] if nenvs > 1: cloth_objs = env.get_cloth_objs() for teacher, cloth in zip(teacher_list, cloth_objs): t_act = teacher.get_action(cloth=cloth) t_actions.append(t_act) else: for teacher in teacher_list: t_act = teacher.get_action() t_actions.append(t_act) t_actions = np.array(t_actions) t_actions = np.maximum( np.minimum(t_actions,1.0), -1.0) logger.info('agent actions:\n{}'.format(action)) logger.info('teacher actions:\n{}'.format(t_actions)) logger.info('L2 diff: \n{:.4f}'.format( np.mean( np.mean((action-t_actions)**2, axis=1) ) )) # max_action is of dimension A, whereas action is dimension # (nenvs, A) - the multiplication gets broadcasted to the batch # scale for execution in env (as far as DDPG is concerned, # every action is in [-1, 1]) new_obs, r, done, info = env.step(max_action * action) r = r.astype(np.float32) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 epoch_actions.append(action) # Daniel: Same as PPO2/DDPG, just checking for end of episodes. mb_actions.append(action) for inf in info: maybeepinfo = inf.get('episode') if maybeepinfo: mb_epinfos.append(inf) # The batched data will be unrolled in memory.py's append. 
Daniel: # unlike DDPG, we only need obs/act, but act is FROM THE EXPERT. # Unfortunately there will be duplicate obs/act pairs if the student # actions don't touch the cloth, but I'm not sure how to avoid that. agent.store_transition(obs, t_actions) obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) # Entire history episode_rewards_history.append(episode_reward[d]) # Last 100 only epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 # Train. epoch_actor_losses = [] epoch_actor_losses_l2 = [] for t_train in range(nb_train_steps): al, al_l2 = agent.train(return_l2=True) epoch_actor_losses.append(al) epoch_actor_losses_l2.append(al_l2) if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time combined_stats = {} combined_stats['memory/nb_entries'] = memory.nb_entries combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_actor_l2'] = np.mean(epoch_actor_losses_l2) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) # Total statistics. combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps_per_env'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(osp.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) # Daniel: we can use cycle or epoch for this if condition ... kind of annoying but w/e. if cycle % save_interval == 0: logger.info('We are now saving stuff!!') savepath = osp.join(checkdir, '%.5i'%epoch) logger.info('Saving model checkpoint to: ', savepath) agent.save(savepath) # ------------------------------------------------------------------ # Daniel: extra stuff for debugging. mb_actions = _sf01(np.asarray(mb_actions)) act_savepath = osp.join(action_dir, 'actions_%.5i.pkl'%epoch) epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl'%epoch) with open(act_savepath, 'wb') as fh: pickle.dump(mb_actions, fh) with open(epi_savepath, 'wb') as fh: pickle.dump(mb_epinfos, fh) # Daniel: we were not resetting earlier. Actually there are other # epoch_stats which we might consider resetting here? epoch_episodes = 0 return agent
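# _sf01 is used above but not defined in this excerpt; in the PPO2-style code it swaps the
# (time, env) axes and flattens them into a single batch axis. A sketch under that assumption:
import numpy as np

def _sf01(arr):
    # (nsteps, nenvs, ...) -> (nsteps * nenvs, ...), grouping each env's trajectory together.
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])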
def __init__(self, agent, network, nsteps, rho, max_kl, ent_coef, vf_stepsize, vf_iters, cg_damping, cg_iters, seed, load_path, **network_kwargs): super(AgentModel, self).__init__(name='MATRPOModel') self.agent = agent self.nsteps = nsteps self.rho = rho self.max_kl = max_kl self.ent_coef = ent_coef self.cg_damping = cg_damping self.cg_iters = cg_iters self.vf_stepsize = vf_stepsize self.vf_iters = vf_iters set_global_seeds(seed) np.set_printoptions(precision=3) if MPI is not None: self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() else: self.nworkers = 1 self.rank = 0 # Setup losses and stuff # ---------------------------------------- ob_space = agent.observation_space ac_space = agent.action_space with tf.name_scope(agent.name): if isinstance(network, str): network = get_network_builder(network)(**network_kwargs) with tf.name_scope("pi"): pi_policy_network = network(ob_space.shape) pi_value_network = network(ob_space.shape) self.pi = pi = PolicyWithValue(ac_space, pi_policy_network, pi_value_network) with tf.name_scope("oldpi"): old_pi_policy_network = network(ob_space.shape) old_pi_value_network = network(ob_space.shape) self.oldpi = oldpi = PolicyWithValue(ac_space, old_pi_policy_network, old_pi_value_network) self.comm_matrix = agent.comm_matrix.copy() self.estimates = np.zeros([agent.nmates, nsteps], dtype=np.float32) self.multipliers = np.ones([self.agent.nmates, self.nsteps]).astype(np.float32) pi_var_list = pi_policy_network.trainable_variables + list( pi.pdtype.trainable_variables) old_pi_var_list = old_pi_policy_network.trainable_variables + list( oldpi.pdtype.trainable_variables) vf_var_list = pi_value_network.trainable_variables + pi.value_fc.trainable_variables old_vf_var_list = old_pi_value_network.trainable_variables + oldpi.value_fc.trainable_variables self.pi_var_list = pi_var_list self.old_pi_var_list = old_pi_var_list self.vf_var_list = vf_var_list self.old_vf_var_list = old_vf_var_list if load_path is not None: load_path = osp.expanduser(load_path) ckpt = tf.train.Checkpoint(model=pi) load_path = load_path + '/agent_{}'.format(self.agent.id) manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None) ckpt.restore(manager.latest_checkpoint) print( colorize('Agent{}\'s Model restored!'.format(self.agent.id), color='magenta')) self.vfadam = MpiAdam(vf_var_list) self.get_flat = U.GetFlat(pi_var_list) self.set_from_flat = U.SetFromFlat(pi_var_list) self.loss_names = [ "Lagrange", "surrgain", "sync", "meankl", "entloss", "entropy" ] self.shapes = [var.get_shape().as_list() for var in pi_var_list]
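# U.GetFlat and U.SetFromFlat above move between a list of parameter tensors and a single flat
# vector, which is the representation the conjugate-gradient and line-search steps of TRPO-style
# updates work on. A numpy sketch of the same round trip (the TF utilities do this for variables):
import numpy as np

def get_flat(param_list):
    return np.concatenate([np.asarray(p).ravel() for p in param_list])

def set_from_flat(param_list, flat):
    # Slice the flat vector back into arrays with the original shapes, in order.
    new_params, start = [], 0
    for p in param_list:
        size = int(np.prod(np.shape(p)))
        new_params.append(np.asarray(flat[start:start + size]).reshape(np.shape(p)))
        start += size
    return new_params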
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs): set_global_seeds(seed) if network == 'cnn': network_kwargs['one_dim_bias'] = True policy = build_policy(env, network, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) runner = Runner(env, model, nsteps=nsteps, gamma=gamma) nbatch = nenvs * nsteps tstart = time.time() coord = tf.train.Coordinator() enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True) for update in range(1, total_timesteps // nbatch + 1): obs, states, rewards, masks, actions, values = runner.run() policy_loss, value_loss, policy_entropy = model.train( obs, states, rewards, masks, actions, values) model.old_obs = obs nseconds = time.time() - tstart fps = int((update * nbatch) / nseconds) if update % log_interval == 0 or update == 1: ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.dump_tabular() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update) print('Saving to', savepath) model.save(savepath) coord.request_stop() coord.join(enqueue_threads) return model
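# lrschedule='linear' above anneals the learning rate as training progresses. A minimal sketch
# of the two standard schedules (the baselines Scheduler wraps this logic around lr):
def lr_at(progress_remaining, lr=0.25, lrschedule='linear'):
    # progress_remaining goes from 1.0 at the start of training down to 0.0 at the end.
    if lrschedule == 'constant':
        return lr
    if lrschedule == 'linear':
        return lr * progress_remaining
    raise ValueError('unknown lrschedule: {}'.format(lrschedule))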
default='MountainCarContinuous-v0') parser.add_argument('--num_timesteps', dest='num_timesteps', type=int, default=10000) parser.add_argument('--seed', help='RNG seed', type=int, default=0) args = parser.parse_args() sess = U.single_threaded_session() sess.__enter__() rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) # create the environment env = gym.make(str(args.environment)) # initial_observation = env.reset() def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space, hid_size=32, num_hid_layers=2) # env = bench.Monitor(env, logger.get_dir() and
def launch( env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, override_params={}, save_policies=True ): # Fork for multi-CPU MPI implementation. if num_cpu > 1: whoami = mpi_fork(num_cpu) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: if logdir or logger.get_dir() is None: logger.configure(dir=logdir) else: logger.configure() logdir = logger.get_dir() assert logdir is not None os.makedirs(logdir, exist_ok=True) # Seed everything. rank_seed = seed + 1000000 * rank set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS params['env_name'] = env_name params['replay_strategy'] = replay_strategy if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in params.update(**override_params) # makes it possible to override any parameter with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) config.log_params(params, logger=logger) dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: rollout_params[name] = params[name] eval_params[name] = params[name] rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) rollout_worker.seed(rank_seed) evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(rank_seed) train( logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, save_policies=save_policies)
def learn(network, env, seed=None, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100, load_path=None, **network_kwargs): ''' Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm. Parameters: ----------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible) nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int, total number of timesteps to train on (default: 80M) vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5) ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01) max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) alpha: float, RMSProp decay parameter (default: 0.99) gamma: float, reward discounting parameter (default: 0.99) log_interval: int, specifies how frequently the logs are printed out (default: 100) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
''' set_global_seeds(seed) # Get the nb of env nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) # Instantiate the model object (that creates step_model and train_model) model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) # Calculate the batch_size nbatch = nenvs * nsteps # Start total timer tstart = time.time() for update in range(1, total_timesteps // nbatch + 1): env.render() # Get mini batch of experiences obs, states, rewards, masks, actions, values, epinfos = runner.run() epinfobuf.extend(epinfos) policy_loss, value_loss, policy_entropy = model.train( obs, states, rewards, masks, actions, values) nseconds = time.time() - tstart # Calculate the fps (frame per second) fps = int((update * nbatch) / nseconds) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predictor of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular( "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular( "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model
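# explained_variance above measures how well the predicted values track the empirical returns:
# 1 is a perfect fit, 0 is no better than predicting the mean return, and negative values are
# worse than that. A numpy sketch of the same quantity:
import numpy as np

def explained_variance_np(ypred, y):
    # 1 - Var[y - ypred] / Var[y]
    var_y = np.var(y)
    return np.nan if var_y == 0 else 1.0 - np.var(y - ypred) / var_y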
def learn(policy, env, seed, nsteps=5, nstack=1, total_timesteps=int(80e6), vf_coef=0.9, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=20, continuous_actions=True, debug=False, numAgents=2, continueTraining=False, particleEnv=False, model_name='May15_test_model_', communication=False): timesteps = 100000 print('nsteps:', nsteps) # time.sleep(1) tf.reset_default_graph() # if particleEnv == False: set_global_seeds(seed) avg_rwd_data = [] episode_rwd_data = [] timesteps_data = [] policy_loss_data = [[], []] value_loss_data = [[], []] exp_var_data = [[], []] success_data = [] nenvs = env.num_envs print('Number of Environments: ', nenvs) print('Number of Steps', nsteps) nbatch = nenvs * nsteps print('Batch Size: ', nbatch) print('Learning Rate: ', lr) print('debug: ', debug) print('---------------------------------------------') ob_space = env.observation_space ac_space = env.action_space # print(ac_space) # print('action space: ', ac_space) num_procs = len(env.remotes) # HACK if numAgents == 1: model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, continuous_actions=continuous_actions, debug=debug) else: model = [] for i in range(numAgents): if particleEnv == True: model.append( Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, continuous_actions=continuous_actions, debug=debug, itr=i, particleEnv=particleEnv, communication=communication)) else: model.append( Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, continuous_actions=continuous_actions, debug=debug, itr=i, particleEnv=particleEnv)) # print('learn models') # print(model) # print(model[0]) if continueTraining == True: for i in range(numAgents): m_name = model_name + str(i) + '.pkl' # +'_600k.pkl' model[i].load(m_name) print('---------------------------------------------') print('Successfully Loaded ', m_name) print('---------------------------------------------') if numAgents == 1 or particleEnv == True: # print('Model: ', model) runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma, particleEnv=particleEnv) else: runner = [] for i in range(numAgents): if i == 0: # print('env: ', env) runner.append( Runner(env, model[i], nsteps=nsteps, nstack=nstack, gamma=gamma, ind=i, particleEnv=particleEnv)) # print('runner model values') # print(model[i].value) else: runner.append( Runner(env, model[i], nsteps=nsteps, nstack=nstack, gamma=gamma, ind=i, init_obs=runner[0].init_obs, particleEnv=particleEnv)) tstart = time.time() stored_rewards = [[], []] success = [] percent_exp = [] percent_exp_data = [] for update in range(1, total_timesteps // nbatch + 1): if numAgents == 1: obs, states, rewards, masks, actions, values = runner.run() elif particleEnv == True: # rework to pre-sort data by agent obs = [[], []] states = [[], []] rewards = [[], []] masks = [[], []] actions = [[], []] 
values = [[], []] policy_loss = [] value_loss = [] policy_entropy = [] ev = [] for i in range(nsteps): obs_, states_, rewards_, masks_, actions_, values_, success_ = runner.run( ) # print('Runner Shapes') # print(obs_.shape) # print(rewards_.shape) # print('states: ', states_) # print('masks: ', masks_) # print('actions_ ', actions_) # print('values: ', values_[:, 0]) # assert 1==0 # print('masks: ', masks_) # print(rewards_) if success_: success.append(success_) for j in range(numAgents): if runner.env.name == 'simple_reference': obs[j].append(obs_[:, :, j]) states[j].append(states[0]) rewards[j].append(rewards_[:, :, j]) actions[j].append(actions_[:, j]) values[j].append(values_[j]) masks[j].append(masks_[j]) else: # print('values: ', values_[j, :]) obs[j].append(obs_[:, :, j]) states[j].append(states[0]) rewards[j].append(rewards_[:, :, j]) actions[j].append(actions_[j, :]) values[j].append(values_[j, :]) masks[j].append(masks_[j, :]) # print('reward: ', rewards_[:, :, j]) stored_rewards[j].append(rewards_[:, :, j]) # print('rewards: ', rewards_[:, :, j]) else: obs = [[], []] states = [[], []] rewards = [[], []] masks = [[], []] actions = [[], []] values = [[], []] policy_loss = [] value_loss = [] policy_entropy = [] ev = [] # print('nsteps: ', runner[0].nsteps) for j in range(nsteps): for i in range( numAgents ): # Need to rewrite so that agents take turns stepping # obs[i], states[i], rewards[i], masks[i], actions[i], values[i] = runner[i].run() obs_, states_, rewards_, masks_, actions_, values_, = runner[ i].run() obs[i].append(obs_) # (obs_.shape = (6, 3, 3, 84)) states[i].append(states_) rewards[i].append(rewards_) masks[i].append(masks_) actions[i].append(actions_) values[i].append(values_) if runner[i].dones[0] == True: for k in range(nenvs): percent_exp.append( runner[i].env.envs[k].env.percent_explored[i]) stored_rewards[i].append(rewards_) # print(np.asarray(values).shape) if numAgents == 1: policy_loss, value_loss, policy_entropy = model.train( obs, states, rewards, masks, actions, values) elif particleEnv == False: for i in range(numAgents): # print(masks[i]) np.asarray(values[i]).reshape(nbatch) np.asarray(rewards[i]).reshape(nbatch) np.asarray(masks[i]).reshape(nbatch) policy_loss_, value_loss_, policy_entropy_ = model[i].train( np.asarray(obs[i]).reshape(nbatch, 84, 84, 3), states[i], np.asarray(rewards[i]).reshape(nbatch), np.asarray(masks[i]).reshape(nbatch), np.asarray(actions[i]).reshape(nbatch), np.asarray(values[i]).reshape(nbatch)) policy_loss.append(policy_loss_) value_loss.append(value_loss_) policy_entropy.append(policy_entropy_) ev.append( explained_variance( np.asarray(values[i]).reshape(nbatch), np.asarray(rewards[i]).reshape(nbatch))) else: if runner.env.name == 'simple_reference': actions_per_agent = 2 actions = np.asarray(actions).swapaxes(0, 2) for i in range(numAgents): if runner.env.name == 'simple_reference': # print(np.asarray(actions).shape) actions_i = np.asarray(actions[i]) actions_i = actions_i.swapaxes(0, 1).reshape( actions_per_agent, nbatch) action_n = [actions_i[1], actions_i[0]] policy_loss_, value_loss_, policy_entropy_ = model[ i].train( np.asarray(obs[i]).reshape(nbatch, 21), states[i], np.asarray(rewards[i]).reshape(nbatch), np.asarray(masks[i]).reshape(nbatch), action_n, np.asarray(values[i]).reshape(nbatch)) elif runner.env.name == 'simple_speaker_listener' or runner.env.name == 'simple_push': actions_ = np.asarray(actions) actions_i = actions_[i, :, :] # print('obs shape: ', np.asarray(obs).shape) obs_ = np.asarray(obs).swapaxes(1, 2) 
obs_i = obs_[i, 0].flatten() obs_n = [] for n in range(obs_i.shape[0]): for m in range(obs_i[0].shape[0]): obs_n.append(obs_i[n][m]) policy_loss_, value_loss_, policy_entropy_ = model[ i].train( np.asarray(obs_n).reshape( nbatch, runner.batch_obs_shape[i][0]), states[i], np.asarray(rewards[i]).reshape(nbatch), np.asarray(masks[i]).reshape(nbatch), np.asarray(actions_i).reshape(nbatch), np.asarray(values[i]).reshape(nbatch)) else: actions_ = np.asarray(actions) actions_i = actions_[i, :, :] policy_loss_, value_loss_, policy_entropy_ = model[ i].train( np.asarray(obs[i]).reshape( nbatch, runner.batch_obs_shape[i][0]), states[i], np.asarray(rewards[i]).reshape(nbatch), np.asarray(masks[i]).reshape(nbatch), np.asarray(actions_i).reshape(nbatch), np.asarray(values[i]).reshape(nbatch)) policy_loss.append(policy_loss_) value_loss.append(value_loss_) policy_entropy.append(policy_entropy_) ev.append( explained_variance( np.asarray(values[i]).reshape(nbatch), np.asarray(rewards[i]).reshape(nbatch))) # model.step_model.summarize_weights() nseconds = time.time() - tstart fps = float((update * nbatch) / nseconds) if update % log_interval == 0 or update == 1: # print(success) # print(np.sum(np.asarray(success).flatten())) # print(np.size(np.asarray(success).flatten())) success_rate = np.mean(success) success_data.append(success_rate) percent_exp_data.append(np.mean(percent_exp)) sio.savemat('percent_exp.mat', {'percent_exp': np.asarray(percent_exp_data)}) sio.savemat('success.mat', {'success_rate': np.asarray(success_data)}) for i in range(numAgents): logger.record_tabular("*Model Number*", i) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * nbatch) logger.record_tabular("fps", float(fps)) rewards_i = np.asarray(stored_rewards[i]) # print(rewards_i) avg_rwd = np.mean(rewards_i.flatten()) # print('rwd shape: ', rewards_i.shape) episode_reward = [] for j in range(nenvs): if particleEnv == False: episode_reward.append(np.mean(rewards_i[:, j])) else: episode_reward.append(np.mean(rewards_i[:, :, j])) print('min_eps_rwd: ', np.min(episode_reward)) print('max_eps_rwd: ', np.max(episode_reward)) # print('all rewards: ', rewards_i) logger.record_tabular("average agent reward", avg_rwd) avg_rwd_data.append(avg_rwd) episode_rwd_data.append(episode_reward) timesteps_data.append(update * nbatch) value_loss_data[i].append(float(value_loss[i])) exp_var_data[i].append(float(ev[i])) # percent_exp_data.append(np.mean(percent_exp)) sio.savemat('reward_data.mat', {'avg_rwd': np.asarray(avg_rwd_data)}) sio.savemat('episode_reward_data.mat', {'episode_avg_rwd': np.asarray(episode_rwd_data)}) sio.savemat('timesteps.mat', {'timesteps': np.asarray(timesteps_data)}) # logger.record_tabular("average episode reward", np.mean(episode_reward)) if particleEnv == False: policy_loss_data[i].append(float(policy_loss[i])) logger.record_tabular("policy_entropy", float(policy_entropy[i])) logger.record_tabular("value_loss", float(value_loss[i])) logger.record_tabular("policy_loss", float(policy_loss[i])) logger.record_tabular("explained_variance", ev[i]) # logger.record_tabular("success rate", success_rate) logger.record_tabular("percent_explored", float(np.mean(percent_exp))) logger.dump_tabular() elif runner.env.name != 'simple_reference': policy_loss_data[i].append(float(policy_loss[i])) logger.record_tabular("policy_entropy", float(policy_entropy[i])) logger.record_tabular("value_loss", float(value_loss[i])) logger.record_tabular("policy_loss", float(policy_loss[i])) 
logger.record_tabular("explained_variance", ev[i]) logger.record_tabular("success rate", success_rate) # logger.record_tabular("percent_explored", float(np.mean(percent_exp)) logger.dump_tabular() else: policy_loss_data[i].append( [float(policy_loss[i][0]), float(policy_loss[i][0])]) logger.record_tabular("comm_policy_entropy", float(policy_entropy[i][0])) logger.record_tabular("force_policy_entropy", float(policy_entropy[i][1])) logger.record_tabular("value_loss", float(value_loss[i])) logger.record_tabular("comm_policy_loss", float(policy_loss[i][0])) logger.record_tabular("force_policy_loss", float(policy_loss[i][1])) logger.record_tabular("explained_variance", ev[i]) logger.record_tabular("success rate", success_rate) logger.dump_tabular() sio.savemat('policy_loss.mat', {'policy_loss': np.asarray(policy_loss_data)}) sio.savemat('value_loss.mat', {'value_loss': np.asarray(value_loss_data)}) sio.savemat('exp_var.mat', {'exp_var': np.asarray(exp_var_data)}) m_name = model_name + str(i) + '.pkl' model[i].save(m_name) # print('Saving model as ', m_name) stored_rewards = [[], []] success = [] percent_exp = [] if particleEnv == False: if update * nbatch > timesteps: print('Saving ') for i in range(numAgents): m_name = model_name + str(i) + '_' + str( (timesteps + 600000.0) / 1000.0) + 'k.pkl' model[i].save(m_name) timesteps += 100000 else: update_interval = 10000 if update % update_interval == 0: print('Saving for update ', str(update)) for i in range(numAgents): m_name = model_name + str(i) + '_' + str( update / 1000.0) + 'k.pkl' model[i].save(m_name) env.close()
def learn(*, network, env, total_timesteps, dtarg=0.01, adaptive_kl=0, trunc_rho=1.0, useadv=0, vtrace=0, rgae=0, eval_env=None, seed=None, ERlen=1, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=None, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = 1 # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space acdim = ac_space.shape[0] # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, adaptive_kl=adaptive_kl) model = make_model() if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = EvalRunner(env=eval_env, model=model, nsteps=10 * nsteps, gamma=gamma, lam=lam) eval_runner.obfilt = runner.obfilt eval_runner.rewfilt = runner.rewfilt epinfobuf = deque(maxlen=10) if eval_env is not None: eval_epinfobuf = deque(maxlen=10) # Start total timer tfirststart = time.time() nupdates = total_timesteps // nbatch def add_vtarg_and_adv(seg, gamma, value, lam): """ Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) """ done = np.append( seg["done"], 0 ) # last element is only used for last vtarg, but we already zeroed it if last new = 1 T = len(seg["rew"]) gaelam = np.empty(T, 'float32') rew = runner.rewfilt(seg["rew"]) lastgaelam = 0 for t in reversed(range(T)): nonterminal = 1 - done[t + 1] delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t] gaelam[ t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam ret = gaelam + value[:-1] return gaelam, ret def add_vtarg_and_adv_vtrace(seg, gamma, value, rho, trunc_rho, acdim=None): """ Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) """ done = np.append( seg["done"], 0 ) # last element is only used for last vtarg, but we already zeroed it if last new = 1 rho_ = np.append(rho, 1.0) if acdim is not None: rho_ = np.exp(np.log(rho_) / acdim) r = np.minimum(trunc_rho, rho_) c = lam * np.minimum(1.0, rho_) T = len(seg["rew"]) gaelam = np.empty(T, 'float32') gaelam2 = np.empty(T, 'float32') rew = runner.rewfilt(seg["rew"]) lastgaelam = 0 for t in reversed(range(T)): nonterminal = 1 - done[t + 1] delta = (rew[t] + gamma * value[t + 1] * nonterminal - value[t]) gaelam[t] = delta + gamma * lam * nonterminal * lastgaelam lastgaelam = r[t] * gaelam[t] ret = r[:-1] * gaelam + value[:-1] adv = rew + gamma * (1.0 - done[1:]) * np.hstack([ret[1:], value[T] ]) - value[:-1] return adv, ret, r[:-1] * gaelam def add_vtarg_and_adv_vtrace4(seg, gamma, value, rho, trunc_rho, acdim=None): """ Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) """ done = np.append( seg["done"], 0 ) # last element is only used for last vtarg, but we already zeroed it if last new = 1 rho_ = np.append(rho, 1.0) if acdim is not None: rho_ = np.exp(np.log(rho_) / acdim) T = len(seg["rew"]) gaelam = np.zeros(T, 'float32') rew = runner.rewfilt(seg["rew"]) delta = (rew + gamma * value[1:] * (1.0 - done[1:]) - value[:-1]) gamlam = np.zeros(T, 'float32') for i in range(T): gamlam[i] = (gamma * lam)**i idx = T c = np.ones(T) for t in reversed(range(T)): # 
print(delta2) for j in range(t, T): if done[j + 1]: idx = j + 1 break gaelam[t] = np.sum(gamlam[:idx - t] * (np.minimum(1.0, c) * delta)[t:idx]) c[t:] = rho_[t] * c[t:] ret = np.minimum(trunc_rho, rho_[:-1]) * gaelam + value[:-1] adv = rew + gamma * (1.0 - done[1:]) * np.hstack([ret[1:], value[T] ]) - value[:-1] return adv, ret, np.minimum(trunc_rho, rho_[:-1]) * gaelam seg = None cliprangenow = cliprange(1.0) klconst = 1.0 for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange # Get minibatch if seg is None: prev_seg = seg seg = {} else: prev_seg = {} for i in seg: prev_seg[i] = np.copy(seg[i]) seg["ob"], seg["rew"], seg["done"], seg["ac"], seg["neglogp"], seg[ "mean"], seg[ "logstd"], final_obs, final_done, epinfos = runner.run() #pylint: disable=E0632 # print(np.shape(seg["ob"])) if prev_seg is not None: for key in seg: if len(np.shape(seg[key])) == 1: seg[key] = np.hstack([prev_seg[key], seg[key]]) else: seg[key] = np.vstack([prev_seg[key], seg[key]]) if np.shape(seg[key])[0] > ERlen * nsteps: seg[key] = seg[key][-ERlen * nsteps:] ob_stack = np.vstack([seg["ob"], final_obs]) values = model.values(runner.obfilt(ob_stack)) values[:-1] = (1.0 - final_done) * values[:-1] mean_now, logstd_now = model.meanlogstds(runner.obfilt(seg["ob"])) # print(np.shape(seg["ac"])[1]) neglogpnow = 0.5 * np.sum(np.square((seg["ac"] - mean_now) / np.exp(logstd_now)), axis=-1) \ + 0.5 * np.log(2.0 * np.pi) * np.shape(seg["ac"])[1] \ + np.sum(logstd_now, axis=-1) rho = np.exp(-neglogpnow + seg["neglogp"]) # print(len(mean_now)) # print(cliprangenow) # print(rho) if vtrace == 1: adv, ret, gae = add_vtarg_and_adv_vtrace(seg, gamma, values, rho, trunc_rho) if useadv: gae = adv elif vtrace == 4: adv, ret, gae = add_vtarg_and_adv_vtrace4(seg, gamma, values, rho, trunc_rho) if useadv: gae = adv else: gae, ret = add_vtarg_and_adv(seg, gamma, values, lam) r = np.minimum(1.0, rho) r_gae = gae * r print("======") print(gae) print(r_gae) print(gae.mean()) print(r_gae.mean()) print(gae.std()) print(r_gae.std()) print(r.mean()) print("======") if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, _, _, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 prior_row = np.zeros(len(seg["ob"])) temp_prior = [] for i in range(int(len(prior_row) / nsteps)): temp_row = np.mean( np.abs(rho[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0) # local_rho[i + (ERlen-int(len(prior_row)/nsteps))].append(temp_row) print(temp_row) temp_prior.append(temp_row) if temp_row > 1 + 0.2: prior_row[i * nsteps:(i + 1) * nsteps] = 0 else: prior_row[i * nsteps:(i + 1) * nsteps] = 1 prior_row[i * nsteps:(i + 1) * nsteps] = 1 # print(prior_row) # for i in range(len(prior_row)): # if (np.abs(rho[i] - 1.0) + 1.0)>1.05: # prior_row[i]=0 # else: # prior_row[i]=1 # for i in range(len(prior_row)): # if rho[i]>1.1 : # prior_row[i]=0 # else: # prior_row[i]=1 prob = prior_row / np.sum(prior_row) print(np.sum(prior_row)) epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] # Index of each element of batch_size # Create the indices array inds1 = np.arange(len(seg["ob"]) - nsteps) inds2 = np.arange(nsteps) + len(seg["ob"]) - nsteps print(len(seg["ob"])) print(cliprangenow) nbatch_adapt1 = int((len(seg["ob"]) - nsteps) / nsteps * nbatch_train) nbatch_adapt2 = int((nsteps) / nsteps * nbatch_train) print(rho) idx1 = [] idx2 = [] kl_rest = np.ones(len(seg["ob"])) * len(seg["ob"]) / nsteps kl_rest[:-nsteps] = 0 # print(kl_rest) for _ in range(noptepochs): # Randomize the indexes # np.random.shuffle(inds) # 0 to batch_size with batch_train_size step # print(nbatch_adapt) losses_epoch = [] for _ in range(int(nsteps / nbatch_train)): if nbatch_adapt1 > 0: idx1 = np.random.choice(inds1, nbatch_adapt1) idx2 = np.random.choice(inds2, nbatch_adapt2) # print(np.mean(np.abs(rho[mbinds] - 1.0) + 1.0)) idx = np.hstack([idx1, idx2]).astype(int) slices = (arr[idx] for arr in (runner.obfilt(seg["ob"]), ret, gae, seg["done"], seg["ac"], values[:-1], neglogpnow, seg["mean"], seg["logstd"], kl_rest, rho)) loss_epoch = model.train(lrnow, cliprangenow, klconst, rgae, trunc_rho, *slices) mblossvals.append(loss_epoch) losses_epoch.append(loss_epoch) # print(np.mean(losses_epoch, axis=0)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) if adaptive_kl: print("KL avg :", lossvals[3]) if lossvals[3] > dtarg * 1.5: klconst *= 2 print("kl const is increased") elif lossvals[3] < dtarg / 1.5: klconst /= 2 print("kl const is reduced") klconst = np.clip(klconst, 2**(-10), 64) # End timer tnow = time.time() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values[:-1], ret) logger.logkv("batch IS weight", [int(1000 * s) / 1000. for s in np.array(temp_prior)]) logger.logkv("kl const", klconst) logger.logkv("clipping factor", cliprangenow) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfos])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfos])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and ( MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) return model
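# Self-contained sketch of the GAE(lambda) recursion used by add_vtarg_and_adv
# above, on toy arrays. Here `rew`, `value`, and `done` are made up; in learn()
# they come from the runner (with reward filtering) and the value network, and
# `value` carries one extra bootstrap entry at the end.
import numpy as np

def gae(rew, value, done, gamma=0.99, lam=0.95):
    # value has length T+1 (bootstrap value appended); done has length T+1
    # with the final element zeroed, mirroring np.append(seg["done"], 0).
    T = len(rew)
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - done[t + 1]
        delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    ret = adv + value[:-1]  # TD(lambda) return targets used as value-function labels
    return adv, ret

rew = np.ones(4, dtype=np.float32)
value = np.zeros(5, dtype=np.float32)
done = np.zeros(5, dtype=np.float32)
print(gae(rew, value, done))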
def train(): rank = MPI.COMM_WORLD.Get_rank() sess = utils.make_gpu_session(args.num_gpu) sess.__enter__() if rank == 0: logger.configure() else: logger.configure(format_strs=[]) assert args.reload_dir is not None, "reload_dir cannot be None!" param_fname = os.path.join(args.reload_dir, 'param.json') with open(param_fname, 'r') as f: param = json.load(f) workerseed = param["seed"] + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) if param["use_2D_env"]: config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'configs', 'husky_space7_ppo2_2D.yaml') raw_env = Husky2DNavigateEnv(gpu_idx=args.gpu_idx, config=config_file, pos_interval=param["pos_interval"]) else: config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'configs', 'husky_space7_ppo2.yaml') raw_env = Husky1DNavigateEnv(gpu_idx=args.gpu_idx, config=config_file, ob_space_range=[0.0, 40.0]) # configure environment raw_env.reset_state_space(use_goal_info=param["use_goal_info"], use_coords_and_orn=param["use_coords_and_orn"], raycast_num=param["raycast_num"], raycast_range=param["raycast_range"]) raw_env.reset_goal_range(goal_range=param["goal_range"]) env = Monitor( raw_env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) gym.logger.setLevel(logging.WARN) policy_fn = FeedbackPolicy print('here') base_dirname = os.path.join(currentdir, "simulation_and_analysis", "rslts") print(base_dirname) if not os.path.exists(base_dirname): os.mkdir(base_dirname) dir_name = "husky_ppo2_" if param["use_feedback"]: dir_name += "hr" elif param["use_rich_reward"]: dir_name += "rl_rich" else: dir_name += "rl_sparse" dir_name += "_reload" dir_name = addDateTime(dir_name) dir_name = os.path.join(base_dirname, dir_name) if not os.path.exists(dir_name): os.mkdir(dir_name) hyperparams = { "seed": args.seed, "nsteps": param["nsteps"], "total_timesteps": args.total_timesteps, "use_2D_env": param["use_2D_env"], "use_rich_reward": param["use_rich_reward"], "use_multiple_starts": param["use_multiple_starts"], "use_goal_info": param["use_goal_info"], "use_coords_and_orn": param["use_coords_and_orn"], "raycast_num": param["raycast_num"], "raycast_range": param["raycast_range"], "goal_range": param["goal_range"], "use_feedback": args.use_feedback, "use_real_feedback": args.use_real_feedback, "trans_by_interpolate": args.trans_by_interpolate, "only_use_hr_until": args.only_use_hr_until, "trans_to_rl_in": args.trans_to_rl_in, "good_feedback_acc": param["good_feedback_acc"], "bad_feedback_acc": param["bad_feedback_acc"], "ppo_lr": args.ppo_lr, "ppo_batch_size": args.ppo_batch_size, "ppo_minibatch_size": param["ppo_minibatch_size"], "init_rl_importance": args.init_rl_importance, "ent_coef": args.ent_coef, "gamma": args.gamma, "lambda": args.lam, "cliprange": args.cliprange, "max_grad_norm": args.max_grad_norm, "ppo_noptepochs": args.ppo_noptepochs, "feedback_lr": param["feedback_lr"], "feedback_batch_size": param["feedback_batch_size"], "feedback_minibatch_size": param["feedback_minibatch_size"], "feedback_noptepochs": param["feedback_noptepochs"], "min_feedback_buffer_size": param["min_feedback_buffer_size"], "feedback_training_prop": param["feedback_training_prop"], "feedback_training_new_prop": param["feedback_training_new_prop"], "pos_interval": param["pos_interval"], "use_embedding": raw_env._use_embedding, "use_raycast": raw_env._use_raycast, "offline": raw_env.config['offline'], "reload_dir": args.reload_dir, "prev_total_timesteps": param["total_timesteps"] } param_fname = 
os.path.join(dir_name, "param.json") with open(param_fname, "w") as f: json.dump(hyperparams, f, indent=4, sort_keys=True) video_name = os.path.join(dir_name, "video.mp4") p_logging = p.startStateLogging(p.STATE_LOGGING_VIDEO_MP4, video_name) model_dir = os.path.join(args.reload_dir, 'models') max_model_iter = -1 for fname in os.listdir(model_dir): if fname.isdigit(): model_iter = int(fname) if model_iter > max_model_iter: max_model_iter = model_iter reload_name = os.path.join(model_dir, fname) performance = learn( policy=policy_fn, env=env, raw_env=raw_env, use_2D_env=param["use_2D_env"], use_multiple_starts=param["use_multiple_starts"], use_rich_reward=param["use_rich_reward"], use_feedback=args.use_feedback, use_real_feedback=args.use_real_feedback, trans_by_interpolate=args.trans_by_interpolate, only_use_hr_until=args.only_use_hr_until, trans_to_rl_in=args.trans_to_rl_in, nsteps=param["nsteps"], total_timesteps=args.total_timesteps, ppo_lr=args.ppo_lr, cliprange=args.cliprange, max_grad_norm=args.max_grad_norm, ent_coef=args.ent_coef, gamma=args.gamma, lam=args.lam, ppo_noptepochs=args.ppo_noptepochs, ppo_batch_size=args.ppo_batch_size, ppo_minibatch_size=param["ppo_minibatch_size"], init_rl_importance=args.init_rl_importance, feedback_lr=param["feedback_lr"], feedback_noptepochs=param["feedback_noptepochs"], feedback_batch_size=param["feedback_batch_size"], feedback_minibatch_size=param["feedback_minibatch_size"], min_feedback_buffer_size=param["min_feedback_buffer_size"], feedback_training_prop=param["feedback_training_prop"], feedback_training_new_prop=param["feedback_training_new_prop"], good_feedback_acc=param["good_feedback_acc"], bad_feedback_acc=param["bad_feedback_acc"], log_interval=1, save_interval=5, reload_name=reload_name, base_path=dir_name) p.stopStateLogging(p_logging) performance_fname = os.path.join(dir_name, "performance.p") with open(performance_fname, "wb") as f: pickle.dump(performance, f)
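# Small self-contained sketch of the checkpoint-selection step in train() above:
# saved models are named by zero-padded update index (e.g. '00005'), and the
# reload logic picks the numerically largest such name in the models/ directory.
# The file list below is made up for illustration.
def latest_checkpoint(fnames):
    digits = [f for f in fnames if f.isdigit()]
    return max(digits, key=int) if digits else None

print(latest_checkpoint(['00002', '00005', 'checkpoint.meta', '00010']))  # -> '00010'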
parser.add_argument('--n_steps', type=float, default=1e8) boolean_flag(parser, 'dueling', default=True) boolean_flag(parser, 'norm', default=True) boolean_flag(parser, 'double', default=True) boolean_flag(parser, 'render', default=False) args = parser.parse_args() # n_steps = int(1e8) n_steps = int(args.n_steps) train_level = 'level1' test_levels = ['level1', 'level2', 'level3'] # Create the environment. env = GridWorld(train_level) coords_shape = env.unwrapped.coords_shape set_global_seeds(args.seed) env.seed(args.seed) print('~~~~~~~~~~~~~~~~~~~~~~') print(env) print(env.unwrapped.name) print('observations:', env.observation_space.shape) print('coords: ', coords_shape) print('actions: ', env.action_space.n) print('walls: ', env.unwrapped.walls.shape) print('~~~~~~~~~~~~~~~~~~~~~~') # Generate the observations and ground truth Q-frames. test_obs = [] test_qmaps = []
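# Hedged sketch of the flag-parsing pattern above: boolean_flag registers a
# paired --name / --no-name switch on an argparse parser. A minimal stand-in
# definition is included so this example runs on its own; the real helper is
# assumed to come from the baselines utilities imported elsewhere.
import argparse

def boolean_flag(parser, name, default=False, help=None):
    # Stand-in: --name sets the flag True, --no-name sets it False.
    dest = name.replace('-', '_')
    parser.add_argument('--' + name, action='store_true', default=default, dest=dest, help=help)
    parser.add_argument('--no-' + name, action='store_false', dest=dest)

parser = argparse.ArgumentParser()
parser.add_argument('--n_steps', type=float, default=1e8)
boolean_flag(parser, 'dueling', default=True)
args = parser.parse_args(['--no-dueling'])
print(int(args.n_steps), args.dueling)  # -> 100000000 False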
def learn( network, env, seed=None, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100, load_path=None, **network_kwargs): ''' Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm. Parameters: ----------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible) nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int, total number of timesteps to train on (default: 80M) vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5) ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01) max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) alpha: float, RMSProp decay parameter (default: 0.99) gamma: float, reward discounting parameter (default: 0.99) log_interval: int, specifies how frequently the logs are printed out (default: 100) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) # Get the nb of env nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) # Instantiate the model object (that creates step_model and train_model) model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) # Calculate the batch_size nbatch = nenvs*nsteps # Start total timer tstart = time.time() for update in range(1, total_timesteps//nbatch+1): # Get mini batch of experiences obs, states, rewards, masks, actions, values, epinfos = runner.run() epinfobuf.extend(epinfos) policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) nseconds = time.time()-tstart # Calculate the fps (frame per second) fps = int((update*nbatch)/nseconds) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update*nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model
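# Hedged usage sketch for the A2C learn() defined above: wrap a single gym
# environment in a DummyVecEnv and train briefly with an 'mlp' policy. The
# environment id and timestep budget are illustrative assumptions; module
# paths follow the baselines layout used elsewhere in this file.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

def make_env():
    return gym.make('CartPole-v1')

env = DummyVecEnv([make_env])
model = learn(network='mlp', env=env, seed=0, nsteps=5,
              total_timesteps=10000, log_interval=100)
env.close()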
def learn( *, network, env, eval_env, make_eval_env, env_id, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, # MBL # For train mbl mbl_train_freq=10, # For eval num_eval_episodes=5, eval_freq=5, vis_eval=False, eval_targs=('mbmf', 'mf'), #eval_targs=('mf',), quant=2, # For mbl.step #num_samples=(1500,), num_samples=(1, ), #horizon=(5,), horizon=(2, 1), #num_elites=(10,), num_elites=(1, ), mbl_lamb=(1.0, ), mbl_gamma=0.99, #mbl_sh=1, # Number of step for stochastic sampling mbl_sh=max((5, )), #vf_lookahead=-1, #use_max_vf=False, reset_per_step=(0, ), # For get_model num_fc=2, num_fwd_hidden=500, use_layer_norm=False, # For MBL num_warm_start=int(1e4), init_epochs=10, update_epochs=5, batch_size=512, update_with_validation=False, use_mean_elites=1, use_ent_adjust=0, adj_std_scale=0.5, # For data loading validation_set_path=None, # For data collect collect_val_data=False, # For traj collect traj_collect='mf', # For profile measure_time=True, eval_val_err=False, measure_rew=True, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. 
See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if not isinstance(num_samples, tuple): num_samples = (num_samples, ) if not isinstance(horizon, tuple): horizon = (horizon, ) if not isinstance(num_elites, tuple): num_elites = (num_elites, ) if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, ) if not isinstance(reset_per_step, tuple): reset_per_step = (reset_per_step, ) if validation_set_path is None: if collect_val_data: validation_set_path = os.path.join(logger.get_dir(), 'val.pkl') else: validation_set_path = os.path.join('dataset', '{}-val.pkl'.format(env_id)) if eval_val_err: eval_val_err_path = os.path.join('dataset', '{}-combine-val.pkl'.format(env_id)) logger.log(locals()) logger.log('MBL_SH', mbl_sh) logger.log('Traj_collect', traj_collect) if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) # MBL # --------------------------------------- viz = Visdom(env=env_id) win = None eval_targs = list(eval_targs) logger.log(eval_targs) make_model = get_make_mlp_model(num_fc=num_fc, num_fwd_hidden=num_fwd_hidden, layer_norm=use_layer_norm) mbl = MBL(env=eval_env, env_id=env_id, make_model=make_model, num_warm_start=num_warm_start, init_epochs=init_epochs, update_epochs=update_epochs, batch_size=batch_size, **network_kwargs) val_dataset = {'ob': None, 'ac': None, 'ob_next': None} if update_with_validation: logger.log('Update with validation') val_dataset = load_val_data(validation_set_path) if eval_val_err: logger.log('Log val error') eval_val_dataset = load_val_data(eval_val_err_path) if collect_val_data: logger.log('Collect validation data') val_dataset_collect = [] def _mf_pi(ob, t=None): stochastic = True ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) return ac, vpred def _mf_det_pi(ob, t=None): #ac, vpred, _, _ = pi.step(ob, stochastic=False) ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob) return ac, vpred def _mf_ent_pi(ob, t=None): mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob) ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape) return ac, vpred ################### use_ent_adjust======> adj_std_scale????????pi action sample def _mbmf_inner_pi(ob, t=0): if use_ent_adjust: return _mf_ent_pi(ob) else: if t < mbl_sh: return _mf_pi(ob) else: return _mf_det_pi(ob) # --------------------------------------- # Run multiple configuration once all_eval_descs = [] def make_mbmf_pi(n, h, e, l): def _mbmf_pi(ob): ac, rew = mbl.step(ob=ob, pi=_mbmf_inner_pi, horizon=h, num_samples=n, num_elites=e, gamma=mbl_gamma, lamb=l, use_mean_elites=use_mean_elites) return ac[None], rew return Policy(step=_mbmf_pi, reset=None) for n in num_samples: for h in horizon: for l in mbl_lamb: for e in num_elites: if 'mbmf' in eval_targs: all_eval_descs.append( ('MeanRewMBMF-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'. 
format(n, h, e, l, mbl_sh, use_mean_elites), 'MBMF-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format( n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l))) if 'mf' in eval_targs: all_eval_descs.append( ('MeanRewMF', 'MF', Policy(step=_mf_pi, reset=None))) logger.log('List of evaluation targets') for it in all_eval_descs: logger.log(it[0]) pool = Pool(mp.cpu_count()) warm_start_done = False # ---------------------------------------- atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- if traj_collect == 'mf': seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 
'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() if traj_collect == 'mf-random' or traj_collect == 'mf-mb': seg_mbl = seg_gen_mbl.__next__() else: seg_mbl = seg add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] # Val data collection if collect_val_data: for ob_, ac_, ob_next_ in zip(ob[:-1, 0, ...], ac[:-1, ...], ob[1:, 0, ...]): val_dataset_collect.append( (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_))) # ----------------------------- # MBL update else: ob_mbl, ac_mbl = seg_mbl["ob"], seg_mbl["ac"] mbl.add_data_batch(ob_mbl[:-1, 0, ...], ac_mbl[:-1, ...], ob_mbl[1:, 0, ...]) mbl.update_forward_dynamic(require_update=iters_so_far % mbl_train_freq == 0, ob_val=val_dataset['ob'], ac_val=val_dataset['ac'], ob_next_val=val_dataset['ob_next']) # ----------------------------- if traj_collect == 'mf': #if traj_collect == 'mf' or traj_collect == 'mf-random' or traj_collect == 'mf-mb': vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: # MBL evaluation if not collect_val_data: set_global_seeds(seed) default_sess = tf.get_default_session() def multithread_eval_policy(env_, pi_, num_episodes_, vis_eval_, seed): with default_sess.as_default(): if hasattr(env, 'ob_rms') and hasattr(env_, 'ob_rms'): env_.ob_rms = env.ob_rms res = eval_policy(env_, pi_, num_episodes_, vis_eval_, seed, measure_time, measure_rew) try: env_.close() except: pass return res if mbl.is_warm_start_done() and iters_so_far % eval_freq == 0: warm_start_done = mbl.is_warm_start_done() if num_eval_episodes > 0: targs_names = {} with timed('eval'): num_descs = len(all_eval_descs) list_field_names = [e[0] for e in all_eval_descs] list_legend_names = [e[1] for e in all_eval_descs] list_pis = [e[2] for e in all_eval_descs] list_eval_envs = [ make_eval_env() for _ in range(num_descs) ] list_seed = [seed for _ in range(num_descs)] list_num_eval_episodes = [ num_eval_episodes for _ in range(num_descs) ] print(list_field_names) print(list_legend_names) list_vis_eval = [ vis_eval for _ in range(num_descs) ] for i in range(num_descs): field_name, legend_name = list_field_names[ i], list_legend_names[i], res = multithread_eval_policy( list_eval_envs[i], list_pis[i], list_num_eval_episodes[i], list_vis_eval[i], seed) #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed)) #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results): perf, elapsed_time, eval_rew = res logger.record_tabular(field_name, perf) if measure_time: logger.record_tabular( 'Time-%s' % (field_name), elapsed_time) if measure_rew: logger.record_tabular( 'SimRew-%s' % (field_name), eval_rew) targs_names[field_name] = legend_name if eval_val_err: fwd_dynamics_err = mbl.eval_forward_dynamic( obs=eval_val_dataset['ob'], acs=eval_val_dataset['ac'], obs_next=eval_val_dataset['ob_next']) logger.record_tabular('FwdValError', fwd_dynamics_err) logger.dump_tabular() #print(logger.get_dir()) #print(targs_names) if num_eval_episodes > 0: win = plot(viz, win, 
logger.get_dir(), targs_names=targs_names, quant=quant, opt='best') # ----------- yield pi if collect_val_data: with open(validation_set_path, 'wb') as f: pickle.dump(val_dataset_collect, f) logger.log('Save {} validation data'.format(len(val_dataset_collect)))
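# Hedged usage sketch: the MBL/TRPO learn() above is a generator -- it yields
# the current policy `pi` once per outer iteration -- so a caller drives
# training by iterating over it. The dummy generator below only stands in for
# learn(...) so this sketch runs on its own; the commented call shows the
# intended use with placeholder arguments.
def drive_training(learn_gen, max_iters):
    pi = None
    for i, pi in enumerate(learn_gen):
        if i + 1 >= max_iters:
            break
    return pi  # last policy yielded before stopping

def _dummy_learn():
    for it in range(1000):
        yield ('policy-after-iter', it)

print(drive_training(_dummy_learn(), max_iters=3))
# gen = learn(network='mlp', env=env, eval_env=eval_env, make_eval_env=make_eval_env,
#             env_id='Hopper-v2', total_timesteps=int(1e6))
# final_pi = drive_training(gen, max_iters=100)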