Example #1
# Excerpt: imports and the helpers used below (logger, make_atari,
# create_session, Model, Runner, the config object C, time_stamp)
# are defined elsewhere in the source file.

def evaluate(env, policy, n_episodes=10):
    # The header, rewards initialization, and outer episode loop are
    # reconstructions: the snippet starts mid-function in the source.
    # The inner loop below is the original code.
    rewards = [0]
    for _ in range(n_episodes):
        s = env.reset()
        while True:
            a = policy.get_best_action(s)
            s, r, d, info = env.step(a)
            rewards[-1] += r
            # True only on a real game over (set by EpisodicLifeEnv),
            # not on a single life loss.
            if env.env.env.env.env.was_real_done:
                rewards.append(0)  # open a slot for the next episode
                break
            if d:
                s = env.reset()  # life lost: continue the same episode
    return rewards[:-1]  # drop the trailing empty slot
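
The chained env.env.env.env.env access reaches the EpisodicLifeEnv wrapper
buried inside the Atari preprocessing stack; its was_real_done attribute
distinguishes a real game over from a lost life. For context, here is a
minimal sketch of that wrapper, modeled on the OpenAI Baselines version
(this reimplementation is illustrative, not part of the snippet):

import gym

class EpisodicLifeEnv(gym.Wrapper):
    """Report a life loss as episode end, but remember the real game over."""

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        self.was_real_done = True

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        lives = self.env.unwrapped.ale.lives()
        if 0 < lives < self.lives:
            done = True  # a life was lost: end the (training) episode
        self.lives = lives
        return obs, reward, done, info

    def reset(self, **kwargs):
        if self.was_real_done:
            obs = self.env.reset(**kwargs)
        else:
            obs, _, _, _ = self.env.step(0)  # no-op past the life loss
        self.lives = self.env.unwrapped.ale.lives()
        return obs

Ending training episodes on each life loss keeps value bootstrapping honest,
while the evaluation loop above counts rewards per full game by checking
was_real_done directly.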


if __name__ == '__main__':
    logger.configure(f'{C.env_id}/logs_{time_stamp}')
    # Dump the full hyperparameter configuration once at startup.
    for k, v in C._asdict().items():
        logger.record_tabular(k, v)
    logger.dump_tabular()
    # TF1-style summaries: these placeholders are fed with evaluation
    # statistics later, and the resulting summaries go to TensorBoard.
    max_reward = tf.placeholder(tf.float32, name='max_reward')
    mean_reward = tf.placeholder(tf.float32, name='mean_reward')
    max_summary = tf.summary.scalar('max_rew', max_reward)
    mean_summary = tf.summary.scalar('mean_rew', mean_reward)

    with create_session(0) as sess:
        # One plain env for evaluation, plus four subprocess workers
        # for collecting training rollouts in parallel.
        eval_env = make_atari(C.env_id, 113, 'eval')()
        envs = SubprocVecEnv(
            [make_atari(C.env_id, r + 1, 'train') for r in range(4)])
        model = Model(eval_env.observation_space.shape,
                      eval_env.action_space.n)
        runner = Runner(envs, model.policy, nb_rollout=C.nb_rollout)
        sess.run(tf.global_variables_initializer())
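
The listing is truncated here; the training loop that uses runner and feeds
the summaries does not appear on this page. As a rough sketch of how the
pieces would connect (summary_writer and the call sites are assumptions,
not the snippet's code; evaluate is the reconstructed name of the function
above):

        summary_writer = tf.summary.FileWriter(
            f'{C.env_id}/logs_{time_stamp}', sess.graph)
        rewards = evaluate(eval_env, model.policy)
        s_max, s_mean = sess.run(
            [max_summary, mean_summary],
            feed_dict={max_reward: max(rewards),
                       mean_reward: sum(rewards) / len(rewards)})
        summary_writer.add_summary(s_max, 0)
        summary_writer.add_summary(s_mean, 0)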