Ejemplo n.º 1
0
    def test_estimate():
        """Estimate rewards with MaxEntIRL from a planner's demonstrations.

        Builds a 4x4 ``GridWorldEnv``, plans with ``PolicyIterationPlanner``
        (the "teacher"), records the states visited over 20 episodes driven
        by the teacher's actions, and passes those state sequences to
        ``MaxEntIRL.estimate``. The estimated rewards are printed and then
        handed to ``env.plot_on_grid``.
        """
        from environment import GridWorldEnv

        # Cell values are passed through to GridWorldEnv unchanged.
        # NOTE(review): 1 / -1 presumably mark reward / penalty cells --
        # confirm against GridWorldEnv.
        layout = [
            [0, 0, 0, 1],
            [0, 0, 0, 0],
            [0, -1, 0, 0],
            [0, 0, 0, 0],
        ]
        env = GridWorldEnv(grid=layout)

        # Train the teacher whose behaviour is used as demonstrations.
        expert = PolicyIterationPlanner(env)
        expert.plan()

        print("Gather demonstrations of teacher.")
        demonstrations = []
        for _ in range(20):
            state = env.reset()
            visited = [state]  # each episode starts from the reset state
            finished = False
            while not finished:
                action = expert.act(state)
                # Only the next state and the done flag are used here.
                state, _reward, finished, _info = env.step(action)
                visited.append(state)
            demonstrations.append(visited)

        print("Estimate reward.")
        estimator = MaxEntIRL(env)
        estimated = estimator.estimate(demonstrations, epoch=100)
        print(estimated)
        env.plot_on_grid(estimated)
Ejemplo n.º 2
0
    def test_plan():
        """Run both planners on a 4x4 grid and print their results.

        First plans with ``ValuteIterationPlanner`` and prints the result
        reshaped to ``env.shape``. Then does the same with
        ``PolicyIterationPlanner``, converts its result with
        ``policy_to_q(v, 0.9)``, and prints the per-row sum of that output
        reshaped to ``env.shape``.
        """
        from environment import GridWorldEnv

        layout = [
            [0, 0, 0, 1],
            [0, 0, 0, 0],
            [0, -1, 0, 0],
            [0, 0, 0, 0],
        ]
        env = GridWorldEnv(grid=layout)

        print("Value Iteration.")
        # NOTE(review): "Valute" spelling is kept because the class is
        # defined outside this view -- confirm the actual class name.
        value_planner = ValuteIterationPlanner(env)
        value_result = value_planner.plan()
        value_grid = value_result.reshape(env.shape)
        print(value_grid)

        print("Policy Iteration.")
        policy_planner = PolicyIterationPlanner(env)
        policy_result = policy_planner.plan()
        policy_grid = policy_result.reshape(env.shape)
        print(policy_grid)

        # NOTE(review): 0.9 is presumably the discount factor -- confirm
        # against policy_to_q's signature.
        q_table = policy_planner.policy_to_q(policy_result, 0.9)
        summed = np.sum(q_table, axis=1)
        print(summed.reshape(env.shape))