Code example #1

# Imports assumed by this example (the irl.* module paths follow the layout used
# in code example #4; the value_iteration import location is an assumption):
import numpy as np
import matplotlib.pyplot as plt

import irl.deep_maxent as deep_maxent
import irl.mdp.objectworld as objectworld
from irl.value_iteration import find_policy


def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, structure):
    """
    Run deep maximum entropy inverse reinforcement learning on the objectworld
    MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions, e.g.,
        () is no neural network (linear maximum entropy) and (3, 4) is two
        hidden layers with dimensions 3 and 4.
    """

    wind = 0.3
    trajectory_length = 8
    l1 = l2 = 0

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    feature_matrix = ow.feature_matrix(discrete=False)
    r = deep_maxent.irl((feature_matrix.shape[1], ) + structure,
                        feature_matrix,
                        ow.n_actions,
                        discount,
                        ow.transition_probability,
                        trajectories,
                        epochs,
                        learning_rate,
                        l1=l1,
                        l2=l2)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
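
A minimal invocation sketch for this example; the parameter values below are illustrative assumptions, not values taken from the original source:

if __name__ == '__main__':
    # 10x10 grid, discount 0.9, 15 objects, 2 colours, 20 trajectories,
    # 200 epochs, learning rate 0.01, two hidden layers of width 3.
    main(10, 0.9, 15, 2, 20, 200, 0.01, (3, 3))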
Code example #2

# Imports assumed by this example (the irl.* module paths follow the layout used
# in code example #4; the value_iteration import location is an assumption):
import numpy as np
import matplotlib.pyplot as plt

import irl.maxent as maxent
import irl.mdp.objectworld as objectworld
from irl.value_iteration import find_policy


def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
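    # Print the MDP components to sanity-check their shapes before running IRL.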
    print("ow.n_states", ow.n_states)
    print("ow.n_actions", ow.n_actions)
    print("ow.transition_probability", ow.transition_probability,
          len(ow.transition_probability), len(ow.transition_probability[0]),
          len(ow.transition_probability[0][0]))
    print("ground_r", ground_r, len(ground_r))
    print("ow.discount", ow.discount)

    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print(trajectories)
    feature_matrix = ow.feature_matrix(discrete=False)
    print("feature_matrix", feature_matrix, len(feature_matrix),
          len(feature_matrix[0]))

    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
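
A minimal invocation sketch; parameter values are illustrative assumptions only:

if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 200, 0.01)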
Code example #3

# Imports assumed by this example (the irl.* module paths follow the layout used
# in code example #4; the value_iteration import location is an assumption):
import numpy as np
import matplotlib.pyplot as plt

import irl.maxent as maxent
import irl.mdp.objectworld as objectworld
from irl.value_iteration import find_policy


def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: (x, y) start location for the trajectory sampled under the
        recovered policy. Tuple of ints.
    """

    sx, sy = start_state
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)

    ow.plot_grid()

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)

    print("Policy = ", policy.shape)
    #    print ("policy - {}".format(policy))
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])

    print("trajectories = ", trajectories.shape)
    #    for t in trajectories:
    #        ow.plot_grid("trajectory_{}.png".format(t), t)
    #    for t in trajectories:
    #        for s, a, r in t:
    #            print (ow.int_to_point(s), ow.actions[a], r)
    #        print ("---------")

    feature_matrix = ow.feature_matrix(discrete=False)

    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    recovered_policy = find_policy(ow.n_states,
                                   ow.n_actions,
                                   ow.transition_probability,
                                   r,
                                   ow.discount,
                                   stochastic=False)

    new_trajectory = ow.generate_trajectories(1, trajectory_length,
                                              lambda s: recovered_policy[s],
                                              False, (sx, sy))
    print("new trajectory")
    for t in new_trajectory:
        ow.plot_grid("new_trajectory.png", t)
        for s, a, rw in t:
            print(ow.int_to_point(s), ow.actions[a], rw)
        print("---------")
    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.savefig("reward.png", format="png", dpi=150)
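
A minimal invocation sketch; parameter values (including the (0, 0) start state) are illustrative assumptions only:

if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 200, 0.01, (0, 0))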
Code example #4

# Imports assumed by this example (the value_iteration import location is an
# assumption; objectworld/gridworld/maxent/deep_maxent are imported inside main):
import numpy as np
import matplotlib.pyplot as plt

from irl.value_iteration import find_policy, optimal_value, value


def normalize(vals):
    """Assumed helper, not part of the original snippet: rescale an array of
    values to the [0, 1] range."""
    min_val, max_val = np.min(vals), np.max(vals)
    return (vals - min_val) / (max_val - min_val)


def main(grid_size,
         discount,
         n_objects,
         n_colours,
         n_trajectories,
         epochs,
         learning_rate,
         start_state,
         wind=0.0,
         algo="maxent",
         mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld or gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: (x, y) start location used when sampling trajectories under
        the recovered policy. Tuple of ints.
    wind: Probability of a random (wind) transition. float.
    algo: IRL algorithm to run ("maxent" or "deep_maxent"). str.
    mdp: MDP to run on ("objectworld" or "gridworld"). str.
    """

    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        # Note: the gridworld MDP ignores n_objects and n_colours.
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)
    else:
        raise ValueError("Unknown MDP: {}".format(mdp))

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)
    optimal_v = optimal_value(ow.n_states,
                              ow.n_actions, ow.transition_probability,
                              normalize(ground_r), ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)

    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)
    #    ow.plot_grid("value_{}_t{}_e{}_w{}.png".format(algo,
    #                                n_trajectories, epochs, wind), value=optimal_v)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy,
                 value=optimal_v)

    # maxent.find_svf is needed for both algorithms (for the ground-truth SVF
    # here and the recovered SVF below), so import maxent unconditionally.
    import irl.maxent as maxent

    ground_svf = maxent.find_svf(ow.n_states, trajectories)
    r = []
    if algo == "maxent":
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)
    elif algo == "deep_maxent":
        import irl.deep_maxent as deep_maxent
        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1], ) + structure,
                            feature_matrix,
                            ow.n_actions,
                            discount,
                            ow.transition_probability,
                            trajectories,
                            epochs,
                            learning_rate,
                            l1=l1,
                            l2=l2)
    else:
        raise ValueError("Unknown IRL algorithm: {}".format(algo))

    recovered_policy = find_policy(ow.n_states,
                                   ow.n_actions,
                                   ow.transition_probability,
                                   normalize(r),
                                   ow.discount,
                                   stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)

    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    #    ow.plot_grid("recovered_value_{}_t{}_e{}_w{}.png".format(algo,
    #                                n_trajectories, epochs, wind),
    #                                value=recovered_v)
    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                 policy=recovered_policy,
                 value=recovered_v)

    #    print("new trajectory")
    #    for t in new_trajectory:
    #        for s, a, rw in t:
    #            print (ow.int_to_point(s), ow.actions[a], rw)
    #        print ("---------")
    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    plt.subplot(111)

    plt.pcolor(x, y, ground_svf.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth SVF")
    plt.savefig("ground_svf_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                       epochs, wind),
                format="png",
                dpi=150)

    plt.pcolor(x, y, recovered_svf.reshape((grid_size, grid_size)))
    plt.title("Recovered SVF")
    plt.savefig("recovered_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                format="png",
                dpi=150)

    plt.pcolor(x, y, normalize(ground_r).reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.savefig("ground_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                format="png",
                dpi=150)

    plt.pcolor(x, y, normalize(r).reshape((grid_size, grid_size)))
    plt.title("Recovered reward")
    plt.savefig("recovered_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                format="png",
                dpi=150)
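
A minimal invocation sketch; all parameter values are illustrative assumptions only:

if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 200, 0.01, (0, 0),
         wind=0.3, algo="maxent", mdp="gridworld")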