def algorithm_inner_loop(name, model):
    """
    Generate a function which runs a round of training
    on a batch of rollouts.
    """
    if name == 'a2c':
        a2c = A2C(model)
        optimizer = a2c.optimize(learning_rate=1e-2, max_grad_norm=None, rms_decay=0.9)
        return lambda rollouts: model.session.run(optimizer, a2c.feed_dict(rollouts))
    elif name == 'ppo':
        ppo = PPO(model)
        optimizer = ppo.optimize(learning_rate=1e-3)
        return lambda rollouts: ppo.run_optimize(optimizer, rollouts, log_fn=print)

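# Usage sketch (hypothetical): the returned closure is called once per batch
# of rollouts. The `model` and `roller` names are assumed to be built as in
# the examples below, e.g. an MLP model paired with a BasicRoller.
train_step = algorithm_inner_loop('ppo', model)
for _ in range(50):
    train_step(roller.rollouts())
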
def run_ppo():
    """
    Run a training worker.
    """
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)
    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])
        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)
        roller = BasicRoller(env, model, min_episodes=30)
        ppo = PPO(model)
        optimizer = MPIOptimizer(tf.train.AdamOptimizer(learning_rate=1e-3), -ppo.objective)
        sess.run(tf.global_variables_initializer())
        optimizer.sync_from_root(sess)
        for i in range(50):
            rollouts = roller.rollouts()
            # pylint: disable=E1101
            print('batch %d: rank=%d mean=%f' %
                  (i, MPI.COMM_WORLD.Get_rank(), mean_total_reward(rollouts)))
            mpi_ppo(ppo, optimizer, rollouts, log_fn=print)

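# This worker is intended to run under MPI so that MPIOptimizer can average
# gradients across ranks. A typical launch (the script name is hypothetical)
# might look like:
#
#   mpirun -n 4 python run_ppo.py
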
def main():
    args = arg_parser().parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    conn = Conn(args.redis_host, args.redis_port, args.channel, obs_size=args.obs_size)
    with tf.Session() as sess:
        model = create_model(args, sess)
        roller = RemoteRoller(model, conn,
                              min_timesteps=args.min_timesteps,
                              min_horizon=args.min_horizon,
                              min_step_batch=args.min_step_batch,
                              timeout=args.env_timeout)
        ppo = PPO(model, epsilon=args.ppo_epsilon, entropy_reg=args.ppo_entropy)
        optimize = ppo.optimize(learning_rate=args.ppo_lr)
        saver = tf.train.Saver()
        ckpt_file = os.path.join(args.checkpoint, 'model.ckpt')
        sess.run(tf.global_variables_initializer())
        if os.path.exists(args.checkpoint):
            saver.restore(sess, ckpt_file)
        for outer_iter in itertools.count():
            rollouts = roller.rollouts()
            logging.info('mean cumulative reward: %f',
                         sum(r.total_reward for r in rollouts) / len(rollouts))
            ppo.run_optimize(optimize, rollouts,
                             batch_size=args.ppo_batch,
                             num_iter=args.ppo_iter,
                             log_fn=lambda x: logging.info('%s', x))
            if outer_iter % args.save_interval == 0:
                saver.save(sess, ckpt_file)

def main():
    args = arg_parser().parse_args()
    env = make_env(args)
    with tf.Session() as sess:
        model = make_model(args, sess, env)
        ppo = PPO(model, **ppo_kwargs(args))
        print('Initializing model variables...')
        sess.run(tf.global_variables_initializer())
        mpi_ppo_loop(ppo, env, **ppo_loop_kwargs(args),
                     rollout_fn=lambda _: sess.run(model.reptile.apply_updates))

def main():
    with tf.Session() as sess:
        print('Creating environment...')
        env = TFBatchedEnv(sess, Pong(), 8)
        env = BatchedFrameStack(env)
        print('Creating model...')
        model = CNN(sess,
                    gym_space_distribution(env.action_space),
                    gym_space_vectorizer(env.observation_space))
        print('Creating roller...')
        roller = TruncatedRoller(env, model, 128)
        print('Creating PPO graph...')
        ppo = PPO(model)
        optimize = ppo.optimize(learning_rate=3e-4)
        print('Initializing variables...')
        sess.run(tf.global_variables_initializer())
        print('Training agent...')
        for i in count():
            rollouts = roller.rollouts()
            for rollout in rollouts:
                if not rollout.trunc_end:
                    print('reward=%f steps=%d' % (rollout.total_reward, rollout.total_steps))
            total_steps = sum(r.num_steps for r in rollouts)
            ppo.run_optimize(optimize, rollouts,
                             batch_size=total_steps // 4,
                             num_iter=12,
                             log_fn=print)
            if i % 5 == 0:
                print('Saving...')
                parameters = sess.run(tf.trainable_variables())
                with open('params.pkl', 'wb+') as out_file:
                    pickle.dump(parameters, out_file)

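# Loading sketch (hypothetical): restoring the parameters saved above into a
# freshly built copy of the same graph by assigning each trainable variable
# its pickled value. Assumes `sess` holds the rebuilt graph.
with open('params.pkl', 'rb') as in_file:
    parameters = pickle.load(in_file)
sess.run([tf.assign(var, val)
          for var, val in zip(tf.trainable_variables(), parameters)])
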
def main():
    args = arg_parser().parse_args()
    print('Creating environments...')
    env = create_env(args.env, args.num_envs, args.num_sub_batches,
                     args.fps, args.max_timesteps)
    env = wrap_env(env)
    try:
        print('Creating session...')
        with tf.Session() as sess:
            print('Creating PPO graph...')
            model = IMPALAModel(sess, *gym_spaces(env))
            ppo = PPO(model, **ppo_kwargs(args))
            print('Initializing model variables...')
            sess.run(tf.global_variables_initializer())
            mpi_ppo_loop(ppo, env, **ppo_loop_kwargs(args))
    finally:
        env.close()