Example #1
def algorithm_inner_loop(name, model):
    """
    Generate a function which runs a round of training on
    a batch of rollouts.
    """
    if name == 'a2c':
        a2c = A2C(model)
        optimizer = a2c.optimize(learning_rate=1e-2, max_grad_norm=None,
                                 rms_decay=0.9)
        return lambda rollouts: model.session.run(optimizer,
                                                  a2c.feed_dict(rollouts))
    elif name == 'ppo':
        ppo = PPO(model)
        optimizer = ppo.optimize(learning_rate=1e-3)
        return lambda rollouts: ppo.run_optimize(optimizer, rollouts, log_fn=print)
    else:
        raise ValueError('unknown algorithm name: ' + name)
Example #2
def run_ppo():
    """
    Run a training worker.
    """
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)

    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])

        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)

        roller = BasicRoller(env, model, min_episodes=30)
        ppo = PPO(model)
        optimizer = MPIOptimizer(tf.train.AdamOptimizer(learning_rate=1e-3),
                                 -ppo.objective)

        sess.run(tf.global_variables_initializer())
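        # Make sure every MPI worker starts from the root worker's parameters.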
        optimizer.sync_from_root(sess)
        for i in range(50):
            rollouts = roller.rollouts()
            # pylint: disable=E1101
            print('batch %d: rank=%d mean=%f' %
                  (i, MPI.COMM_WORLD.Get_rank(), mean_total_reward(rollouts)))
            mpi_ppo(ppo, optimizer, rollouts, log_fn=print)
Example #3
File: __main__.py Project: jakeszler/qwop
def main():
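    """
    Run a PPO training worker that pulls rollouts from a remote
    environment over Redis and periodically checkpoints the model.
    """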
    args = arg_parser().parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    conn = Conn(args.redis_host,
                args.redis_port,
                args.channel,
                obs_size=args.obs_size)

    with tf.Session() as sess:
        model = create_model(args, sess)
        roller = RemoteRoller(model,
                              conn,
                              min_timesteps=args.min_timesteps,
                              min_horizon=args.min_horizon,
                              min_step_batch=args.min_step_batch,
                              timeout=args.env_timeout)
        ppo = PPO(model,
                  epsilon=args.ppo_epsilon,
                  entropy_reg=args.ppo_entropy)
        optimize = ppo.optimize(learning_rate=args.ppo_lr)

        saver = tf.train.Saver()
        ckpt_file = os.path.join(args.checkpoint, 'model.ckpt')

        sess.run(tf.global_variables_initializer())
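        # Resume from an existing checkpoint if one is available.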
        if os.path.exists(args.checkpoint):
            saver.restore(sess, ckpt_file)

        for outer_iter in itertools.count():
            rollouts = roller.rollouts()
            logging.info('mean cumulative reward: %f',
                         sum(r.total_reward for r in rollouts) / len(rollouts))
            ppo.run_optimize(optimize,
                             rollouts,
                             batch_size=args.ppo_batch,
                             num_iter=args.ppo_iter,
                             log_fn=lambda x: logging.info('%s', x))
            if outer_iter % args.save_interval == 0:
                saver.save(sess, ckpt_file)
Example #4
def main():
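    """
    Train a model with PPO over MPI, using a rollout_fn hook to apply
    the model's Reptile meta-update.
    """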
    args = arg_parser().parse_args()
    env = make_env(args)
    with tf.Session() as sess:
        model = make_model(args, sess, env)
        ppo = PPO(model, **ppo_kwargs(args))
        print('Initializing model variables...')
        sess.run(tf.global_variables_initializer())
        mpi_ppo_loop(ppo, env, **ppo_loop_kwargs(args),
                     rollout_fn=lambda _: sess.run(model.reptile.apply_updates))
Example #5
def main():
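    """
    Train a CNN agent on batched Pong with PPO, pickling the trained
    variables to params.pkl every 5 batches.
    """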
    with tf.Session() as sess:
        print('Creating environment...')
        env = TFBatchedEnv(sess, Pong(), 8)
        env = BatchedFrameStack(env)

        print('Creating model...')
        model = CNN(sess, gym_space_distribution(env.action_space),
                    gym_space_vectorizer(env.observation_space))

        print('Creating roller...')
        roller = TruncatedRoller(env, model, 128)

        print('Creating PPO graph...')
        ppo = PPO(model)
        optimize = ppo.optimize(learning_rate=3e-4)

        print('Initializing variables...')
        sess.run(tf.global_variables_initializer())

        print('Training agent...')
        for i in count():
            rollouts = roller.rollouts()
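            # Only report rollouts that ended with a real episode
            # termination, not ones truncated mid-episode by the roller.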
            for rollout in rollouts:
                if not rollout.trunc_end:
                    print('reward=%f steps=%d' %
                          (rollout.total_reward, rollout.total_steps))
            total_steps = sum(r.num_steps for r in rollouts)
            ppo.run_optimize(optimize,
                             rollouts,
                             batch_size=total_steps // 4,
                             num_iter=12,
                             log_fn=print)
            if i % 5 == 0:
                print('Saving...')
                parameters = sess.run(tf.trainable_variables())
                with open('params.pkl', 'wb+') as out_file:
                    pickle.dump(parameters, out_file)
Example #6
def main():
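    """
    Train an IMPALA-style model with PPO over MPI, making sure the
    environment is closed when training stops.
    """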
    args = arg_parser().parse_args()
    print('Creating environments...')
    env = create_env(args.env, args.num_envs, args.num_sub_batches, args.fps,
                     args.max_timesteps)
    env = wrap_env(env)
    try:
        print('Creating session...')
        with tf.Session() as sess:
            print('Creating PPO graph...')
            model = IMPALAModel(sess, *gym_spaces(env))
            ppo = PPO(model, **ppo_kwargs(args))
            print('Initializing model variables...')
            sess.run(tf.global_variables_initializer())
            mpi_ppo_loop(ppo, env, **ppo_loop_kwargs(args))
    finally:
        env.close()