# Shared imports for the example variants below. The irl.* module paths follow
# the in-function imports used in the last variant; find_policy, optimal_value
# and value are assumed to live in the package's value_iteration module.
import numpy as np
import matplotlib.pyplot as plt

import irl.maxent as maxent
import irl.deep_maxent as deep_maxent
import irl.mdp.objectworld as objectworld
from irl.value_iteration import find_policy, optimal_value, value


def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, structure):
    """
    Run deep maximum entropy inverse reinforcement learning on the objectworld
    MDP. Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions,
        e.g., () is no neural network (linear maximum entropy) and (3, 4) is
        two hidden layers with dimensions 3 and 4.
    """

    wind = 0.3
    trajectory_length = 8
    l1 = l2 = 0

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    feature_matrix = ow.feature_matrix(discrete=False)
    r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix,
                        ow.n_actions, discount, ow.transition_probability,
                        trajectories, epochs, learning_rate, l1=l1, l2=l2)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
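
# Illustrative invocation of the deep maximum entropy variant above, assuming
# this variant is saved as its own script. The argument values (grid size,
# discount, network structure, etc.) are assumptions chosen for a quick run,
# not values taken from the source.
if __name__ == '__main__':
    main(grid_size=10, discount=0.9, n_objects=15, n_colours=2,
         n_trajectories=20, epochs=50, learning_rate=0.01, structure=(3, 3))
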
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])

    print("ow.n_states", ow.n_states)
    print("ow.n_actions", ow.n_actions)
    print("ow.transition_probability", ow.transition_probability,
          len(ow.transition_probability), len(ow.transition_probability[0]),
          len(ow.transition_probability[0][0]))
    print("ground_r", ground_r, len(ground_r))
    print("ow.discount", ow.discount)

    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print(trajectories)

    feature_matrix = ow.feature_matrix(discrete=False)
    print("feature_matrix", feature_matrix, len(feature_matrix),
          len(feature_matrix[0]))

    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
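
# Minimal sketch of how this (debug-print) variant could be invoked, assuming
# it is saved as its own script; the argument values are illustrative, not
# taken from the source.
if __name__ == '__main__':
    main(grid_size=10, discount=0.9, n_objects=15, n_colours=2,
         n_trajectories=20, epochs=200, learning_rate=0.01)
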
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Start location (x, y) to generate a trajectory from. tuple.
    """

    sx, sy = start_state
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ow.plot_grid()
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    print("Policy = ", policy.shape)
    # print("policy - {}".format(policy))

    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print("trajectories = ", trajectories.shape)
    # for t in trajectories:
    #     ow.plot_grid("trajectory_{}.png".format(t), t)
    # for t in trajectories:
    #     for s, a, r in t:
    #         print(ow.int_to_point(s), ow.actions[a], r)
    #     print("---------")

    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, r, ow.discount,
                                   stochastic=False)
    new_trajectory = ow.generate_trajectories(1, trajectory_length,
                                              lambda s: recovered_policy[s],
                                              False, (sx, sy))
    print("new trajectory")
    for t in new_trajectory:
        ow.plot_grid("new_trajectory.png", t)
        for s, a, rw in t:
            print(ow.int_to_point(s), ow.actions[a], rw)
        print("---------")

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.savefig("reward.png", format="png", dpi=150)
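
# Illustrative invocation of the variant above, assuming it is saved as its
# own script; the values, including the (0, 0) start state, are assumptions
# chosen for demonstration.
if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 200, 0.01, start_state=(0, 0))
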
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state, wind=0.0, algo="maxent",
         mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld or gridworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Start location (x, y) to generate trajectories from. tuple.
    wind: Probability of taking a random action. float.
    algo: IRL algorithm to run. Currently supports "maxent" and "deep_maxent".
    mdp: MDP to use, either "objectworld" or "gridworld".
    """

    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    # normalize is assumed to be a project helper that rescales rewards
    # before value iteration; it is not defined in this file.
    optimal_v = optimal_value(ow.n_states, ow.n_actions,
                              ow.transition_probability, normalize(ground_r),
                              ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)
    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)

    # ow.plot_grid("value_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
    #                                                epochs, wind),
    #              value=optimal_v)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy, value=optimal_v)

    # find_svf is needed for both algorithms, so import maxent unconditionally.
    import irl.maxent as maxent

    ground_svf = maxent.find_svf(ow.n_states, trajectories)
    if algo == "maxent":
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)
    elif algo == "deep_maxent":
        import irl.deep_maxent as deep_maxent

        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1],) + structure,
                            feature_matrix, ow.n_actions, discount,
                            ow.transition_probability, trajectories, epochs,
                            learning_rate, l1=l1, l2=l2)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, normalize(r),
                                   ow.discount, stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)
    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    # ow.plot_grid("recovered_value_{}_t{}_e{}_w{}.png".format(
    #     algo, n_trajectories, epochs, wind), value=recovered_v)
    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
        policy=recovered_policy, value=recovered_v)

    # print("new trajectory")
    # for t in new_trajectory:
    #     for s, a, rw in t:
    #         print(ow.int_to_point(s), ow.actions[a], rw)
    #     print("---------")

    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    plt.subplot(111)
    plt.pcolor(x, y, ground_svf.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth SVF")
    plt.savefig("ground_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, recovered_svf.reshape((grid_size, grid_size)))
    plt.title("Recovered SVF")
    plt.savefig("recovered_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, normalize(ground_r).reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.savefig("ground_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, normalize(r).reshape((grid_size, grid_size)))
    plt.title("Recovered reward")
    plt.savefig("recovered_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)
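
# Illustrative invocation of the combined maxent/deep_maxent variant above,
# assuming it is saved as its own script; all argument values here are
# assumptions. With mdp="gridworld", n_objects and n_colours are ignored by
# the environment constructor.
if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 200, 0.01, start_state=(0, 0),
         wind=0.1, algo="maxent", mdp="gridworld")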