Example #1
def expected_value_difference(n_states, n_actions, transition_probability,
                              reward, discount, p_start_state, optimal_value,
                              true_reward):
    """
    Calculate the expected value difference, a proxy for how good a recovered
    reward function is.

    n_states: Number of states. int.
    n_actions: Number of actions. int.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    reward: Reward vector mapping state int to reward. Shape (N,).
    discount: Discount factor. float.
    p_start_state: Probability vector with the ith component as the probability
        that the ith state is the start state. Shape (N,).
    optimal_value: Value vector for the ground reward with optimal policy.
        The ith component is the value of the ith state. Shape (N,).
    true_reward: True reward vector. Shape (N,).
    -> Expected value difference. float.
    """

    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, reward,
                                         discount)
    value = value_iteration.value(policy.argmax(axis=1), n_states,
                                  transition_probability, true_reward,
                                  discount)

    evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
    return evd
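The returned EVD is the gap between the expected return of the optimal policy and that of the policy found from the recovered reward, both evaluated under the true reward and weighted by the start-state distribution. A minimal standalone sketch with made-up value vectors (not the library calls above):

import numpy as np

# Hypothetical value vectors over 4 states; the numbers are illustrative only.
optimal_value = np.array([1.0, 0.8, 0.5, 0.2])      # V* under the true reward
recovered_value = np.array([0.9, 0.8, 0.4, 0.2])    # V of the policy found from the recovered reward
p_start_state = np.array([0.25, 0.25, 0.25, 0.25])  # uniform start-state distribution

# EVD = p0 . V* - p0 . V_policy; 0 means the recovered reward induces an equally good policy.
evd = optimal_value.dot(p_start_state) - recovered_value.dot(p_start_state)
print(evd)  # ~0.05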
Example #2
def main(grid_size, discount, L):
    wind = 0.3
    trajectory_length = 3 * grid_size
    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    #policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]  # recover R from a hand-specified deterministic optimal policy
    # Find the optimal policy via reinforcement learning instead.
    policy = find_policy(
        gw.n_states,
        gw.n_actions,
        gw.transition_probability,
        ground_r,
        discount,
    )
    # Need a value function for each basis function.
    feature_matrix = gw.feature_matrix()
    values = []
    for dim in range(feature_matrix.shape[1]):
        reward = feature_matrix[:, dim]
        values.append(
            value(policy, gw.n_states, gw.transition_probability, reward,
                  gw.discount))
    values = np.array(values).T

    rl1, rl2, rl1l2 = linear_irl.large_irl(values, gw.transition_probability,
                                           feature_matrix, gw.n_states,
                                           gw.n_actions, policy, L)
    return ground_r, rl1, rl2, rl1l2
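The loop above evaluates the same policy once per basis function: each column of the feature matrix is used as a reward vector and converted into a value vector, and the columns are stacked into a value matrix. Assuming value() performs standard policy evaluation, each column can equivalently be obtained from the linear system V = (I - discount * P_policy)^{-1} r; a self-contained sketch on a tiny random MDP (all names and sizes here are illustrative):

import numpy as np

n_states, n_actions, discount = 4, 2, 0.9
rng = np.random.default_rng(0)

# Random transition model P[s, a, s'] and a random deterministic policy.
P = rng.random((n_states, n_actions, n_states))
P /= P.sum(axis=2, keepdims=True)
policy = rng.integers(n_actions, size=n_states)
P_policy = P[np.arange(n_states), policy]      # (n_states, n_states), rows picked by the chosen actions

feature_matrix = np.eye(n_states)              # e.g. one-hot state features

# One value function per basis reward column: V_d = (I - discount * P_policy)^{-1} f_d.
values = np.stack([
    np.linalg.solve(np.eye(n_states) - discount * P_policy, feature_matrix[:, d])
    for d in range(feature_matrix.shape[1])
], axis=1)                                     # shape (n_states, n_features), like values.T above
print(values.shape)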
Example #3
def main(grid_size, discount):
    """
    Run large state space linear programming inverse reinforcement learning on
    the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    """

    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]

    # Need a value function for each basis function.
    feature_matrix = gw.feature_matrix()
    values = []
    for dim in range(feature_matrix.shape[1]):
        reward = feature_matrix[:, dim]
        values.append(
            value(policy, gw.n_states, gw.transition_probability, reward,
                  gw.discount))
    values = np.array(values)

    r = linear_irl.large_irl(values, gw.transition_probability, feature_matrix,
                             gw.n_states, gw.n_actions, policy)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
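An illustrative way to run the script above (the argument values are placeholders, not necessarily the original defaults):

if __name__ == '__main__':
    main(10, 0.9)  # 10x10 gridworld with discount factor 0.9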
Example #5
    r = maxent.irl(feature_matrix, env.n_actions, discount,
                   env.transition_probability, trajectories, epochs,
                   learning_rate, "alpha_%d.pkl", "alpha_205.pkl", 205)

    pkl.dump(r, open("maxent_reward.pkl", 'wb'))

    return r


if __name__ == '__main__':
    train(0.01, 1, 400, 0.01)
    rewards = pkl.load(open("maxent_reward.pkl", 'rb'))

    env = Env(prepare_tp=True)

    value = vi.value(env.get_policy(), env.n_states,
                     env.transition_probability, rewards, 0.3)
    opt_value = vi.optimal_value(env.n_states, env.n_actions,
                                 env.transition_probability, rewards, 0.3)
    pkl.dump(value, open("maxent_value.pkl", 'wb'))
    pkl.dump(opt_value, open("maxent_opt_value.pkl", 'wb'))

    value = pkl.load(open("maxent_value.pkl", 'rb'))
    opt_value = pkl.load(open("maxent_opt_value.pkl", 'rb'))

    status = validate(value)
    print(status)
    pkl.dump(status, open("maxent_status.pkl", 'wb'))
    status = validate(opt_value)
    print(status)
    pkl.dump(status, open("maxent_opt_status.pkl", 'wb'))
    status = validate(rewards)
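vi.value and vi.optimal_value above evaluate the recovered reward under the environment's transition model. A minimal standalone sketch of the Bellman-optimality iteration that optimal_value is assumed to implement (toy random MDP, illustrative names only):

import numpy as np

def optimal_value_sketch(P, reward, discount, tol=1e-6):
    # Value iteration: V(s) <- max_a sum_s' P[s, a, s'] * (reward[s'] + discount * V(s')).
    V = np.zeros(P.shape[0])
    while True:
        Q = np.einsum('san,n->sa', P, reward + discount * V)
        V_new = Q.max(axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new
        V = V_new

rng = np.random.default_rng(0)
P = rng.random((5, 4, 5))
P /= P.sum(axis=2, keepdims=True)
print(optimal_value_sketch(P, rng.random(5), 0.3))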
Example #6
def test_ow_once(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples,
                 epochs, structure):
    """
    Test MaxEnt and DeepMaxEnt on an objectworld of size grid_size, using
    n_samples sampled paths.

    grid_size: Grid size. int.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    discrete: Whether the features should be discrete. bool.
    l1: L1 regularisation. float.
    l2: L2 regularisation. float.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network hidden-layer sizes as a tuple, e.g. (3, 3)
        gives two hidden layers of width 3; the input width (the feature
        dimension) is prepended when deep_maxent.irl is called below.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Basic gist of what we're doing here: Get the reward function using our
    # different IRL methods, use those to get a policy, evaluate that policy
    # using the true reward, and then return the difference in expected values.

    # Setup parameters.
    wind = 0.3
    discount = 0.9
    learning_rate = 0.01
    trajectory_length = 3 * grid_size

    # Make the objectworld and associated data.
    ow = Objectworld(grid_size, n_objects, n_colours, wind, discount)
    feature_matrix = ow.feature_matrix(discrete)
    ground_reward = np.array([ow.reward(i) for i in range(ow.n_states)])
    optimal_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                 ow.transition_probability,
                                                 ground_reward,
                                                 discount).argmax(axis=1)
    trajectories = ow.generate_trajectories(n_samples, trajectory_length,
                                            optimal_policy.take)
    p_start_state = (
        np.bincount(trajectories[:, 0, 0], minlength=ow.n_states) /
        trajectories.shape[0])

    # True value.
    optimal_V = value_iteration.optimal_value(ow.n_states, ow.n_actions,
                                              ow.transition_probability,
                                              ground_reward, ow.discount)

    # MaxEnt reward; policy; value.
    maxent_reward = deep_maxent.irl((feature_matrix.shape[1], ),
                                    feature_matrix,
                                    ow.n_actions,
                                    ow.discount,
                                    ow.transition_probability,
                                    trajectories,
                                    epochs,
                                    learning_rate,
                                    l1=l1,
                                    l2=l2)

    maxent_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                ow.transition_probability,
                                                maxent_reward,
                                                discount).argmax(axis=1)
    maxent_V = value_iteration.value(maxent_policy, ow.n_states,
                                     ow.transition_probability, ground_reward,
                                     ow.discount)
    maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)

    # DeepMaxEnt reward; policy; value.
    deep_learning_rate = 0.005  # For the 32 x 32 experiments.
    deep_maxent_reward = deep_maxent.irl(
        (feature_matrix.shape[1], ) + structure,
        feature_matrix,
        ow.n_actions,
        ow.discount,
        ow.transition_probability,
        trajectories,
        epochs,
        deep_learning_rate,
        l1=l1,
        l2=l2)

    deep_maxent_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                     ow.transition_probability,
                                                     deep_maxent_reward,
                                                     discount).argmax(axis=1)
    deep_maxent_V = value_iteration.value(deep_maxent_policy, ow.n_states,
                                          ow.transition_probability,
                                          ground_reward, ow.discount)

    deep_maxent_EVD = (optimal_V.dot(p_start_state) -
                       deep_maxent_V.dot(p_start_state))

    plt.subplot(3, 3, 1)
    plt.pcolor(ground_reward.reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.tick_params(labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    bottom=False,
                    top=False,
                    left=False,
                    right=False,
                    labelright=False)
    plt.subplot(3, 3, 2)
    plt.pcolor(maxent_reward.reshape((grid_size, grid_size)))
    plt.title("MaxEnt reward")
    plt.tick_params(labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    bottom=False,
                    top=False,
                    left=False,
                    right=False,
                    labelright=False)
    plt.subplot(3, 3, 3)
    plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size)))
    plt.title("DeepMaxEnt reward")
    plt.tick_params(labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    bottom=False,
                    top=False,
                    left=False,
                    right=False,
                    labelright=False)

    plt.subplot(3, 3, 4)
    plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3)
    plt.title("Optimal policy")
    plt.tick_params(labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    bottom=False,
                    top=False,
                    left=False,
                    right=False,
                    labelright=False)
    plt.subplot(3, 3, 5)
    plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3)
    plt.title("MaxEnt policy")
    plt.tick_params(labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    bottom=False,
                    top=False,
                    left=False,
                    right=False,
                    labelright=False)
    plt.subplot(3, 3, 6)
    plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)),
               vmin=0,
               vmax=3)
    plt.title("DeepMaxEnt policy")
    plt.tick_params(labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    bottom=False,
                    top=False,
                    left=False,
                    right=False,
                    labelright=False)

    plt.subplot(3, 3, 7)
    plt.pcolor(optimal_V.reshape((grid_size, grid_size)))
    plt.title("Optimal value")
    plt.tick_params(labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    bottom=False,
                    top=False,
                    left=False,
                    right=False,
                    labelright=False)
    plt.subplot(3, 3, 8)
    plt.pcolor(maxent_V.reshape((grid_size, grid_size)))
    plt.title("MaxEnt value")
    plt.tick_params(labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    bottom=False,
                    top=False,
                    left=False,
                    right=False,
                    labelright=False)
    plt.subplot(3, 3, 9)
    plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size)))
    plt.title("DeepMaxEnt value")
    plt.tick_params(labeltop=False,
                    labelbottom=False,
                    labelleft=False,
                    bottom=False,
                    top=False,
                    left=False,
                    right=False,
                    labelright=False)
    plt.savefig("{}_{}_{}_{}_{}_{}_{}_{}_{}_objectworld_{}.png".format(
        grid_size, n_objects, n_colours, discrete, n_samples, epochs,
        structure, l1, l2, np.random.randint(10000000)))

    return maxent_EVD, deep_maxent_EVD
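p_start_state above is simply the empirical distribution of each trajectory's first state. A standalone sketch with a fake trajectories array (its shape follows the (state, action, reward) layout used by generate_trajectories in these examples; the values are made up):

import numpy as np

n_states = 9
trajectories = np.zeros((4, 5, 3), dtype=int)   # (n_trajectories, trajectory_length, 3)
trajectories[:, 0, 0] = [0, 0, 3, 8]            # start state of each trajectory

p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=n_states) /
                 trajectories.shape[0])
print(p_start_state)  # [0.5, 0, 0, 0.25, 0, 0, 0, 0, 0.25]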
Example #7
def test_gw_once(grid_size, feature_map, n_samples, epochs, structure):
    """
    Test MaxEnt and DeepMaxEnt on a gridworld of size grid_size with the
    feature map feature_map, using n_samples sampled paths.

    grid_size: Grid size. int.
    feature_map: Which feature map to use. String in {ident, coord, proxi}.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network hidden-layer sizes as a tuple, e.g. (3, 3)
        gives two hidden layers of width 3; the input width (the feature
        dimension) is prepended when deep_maxent.irl is called below.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Basic gist of what we're doing here: Get the reward function using our
    # different IRL methods, use those to get a policy, evaluate that policy
    # using the true reward, and then return the difference in expected values.

    # Setup parameters.
    wind = 0.3
    discount = 0.9
    learning_rate = 0.01
    trajectory_length = 3*grid_size

    # Make the gridworld and associated data.
    gw = Gridworld(grid_size, wind, discount)
    feature_matrix = gw.feature_matrix(feature_map)
    ground_reward = np.array([gw.reward(i) for i in range(gw.n_states)])
    optimal_policy = value_iteration.find_policy(gw.n_states,
                                                 gw.n_actions,
                                                 gw.transition_probability,
                                                 ground_reward,
                                                 discount).argmax(axis=1)
    trajectories = gw.generate_trajectories(n_samples,
                                            trajectory_length,
                                            optimal_policy.take)
    p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=gw.n_states) /
                     trajectories.shape[0])

    # True value.
    optimal_V = value_iteration.optimal_value(gw.n_states,
                                              gw.n_actions,
                                              gw.transition_probability,
                                              ground_reward, gw.discount)

    # MaxEnt reward; policy; value.
    maxent_reward = deep_maxent.irl((feature_matrix.shape[1],),
                                    feature_matrix,
                                    gw.n_actions,
                                    gw.discount,
                                    gw.transition_probability,
                                    trajectories, epochs, learning_rate)

    maxent_policy = value_iteration.find_policy(gw.n_states,
                                                gw.n_actions,
                                                gw.transition_probability,
                                                maxent_reward,
                                                discount).argmax(axis=1)
    maxent_V = value_iteration.value(maxent_policy,
                                     gw.n_states,
                                     gw.transition_probability,
                                     ground_reward,
                                     gw.discount)
    maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)

    # DeepMaxEnt reward; policy; value.
    deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],)+structure,
                                         feature_matrix,
                                         gw.n_actions,
                                         gw.discount,
                                         gw.transition_probability,
                                         trajectories, epochs, learning_rate)
    deep_maxent_policy = value_iteration.find_policy(gw.n_states,
                                                     gw.n_actions,
                                                     gw.transition_probability,
                                                     deep_maxent_reward,
                                                     discount).argmax(axis=1)
    deep_maxent_V = value_iteration.value(deep_maxent_policy,
                                          gw.n_states,
                                          gw.transition_probability,
                                          ground_reward,
                                          gw.discount)
    deep_maxent_EVD = (optimal_V.dot(p_start_state) -
                       deep_maxent_V.dot(p_start_state))

    plt.subplot(3, 3, 1)
    plt.pcolor(ground_reward.reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 2)
    plt.pcolor(maxent_reward.reshape((grid_size, grid_size)))
    plt.title("MaxEnt reward")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 3)
    plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size)))
    plt.title("DeepMaxEnt reward")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)

    plt.subplot(3, 3, 4)
    plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3)
    plt.title("Optimal policy")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 5)
    plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3)
    plt.title("MaxEnt policy")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 6)
    plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)),
               vmin=0, vmax=3)
    plt.title("DeepMaxEnt policy")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)

    plt.subplot(3, 3, 7)
    plt.pcolor(optimal_V.reshape((grid_size, grid_size)))
    plt.title("Optimal value")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 8)
    plt.pcolor(maxent_V.reshape((grid_size, grid_size)))
    plt.title("MaxEnt value")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 9)
    plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size)))
    plt.title("DeepMaxEnt value")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.savefig("{}_{}_{}_{}gridworld{}.png".format(grid_size, feature_map,
        n_samples, epochs, structure, np.random.randint(10000000)))


    return maxent_EVD, deep_maxent_EVD
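An illustrative call (the parameter values are only examples; feature_map is one of the options documented in the docstring):

maxent_evd, deep_maxent_evd = test_gw_once(
    grid_size=5,          # 5x5 gridworld
    feature_map="ident",  # one of {ident, coord, proxi}
    n_samples=20,         # number of demonstration paths
    epochs=200,           # MaxEnt training epochs
    structure=(3, 3))     # hidden-layer sizes for DeepMaxEnt
print(maxent_evd, deep_maxent_evd)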
Example #8
def main(grid_size,
         discount,
         n_objects,
         n_colours,
         n_trajectories,
         epochs,
         learning_rate,
         start_state,
         wind=0.0,
         algo="maxnet",
         mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Start location to generate trajectories from. (int, int) tuple.
    wind: Probability of moving in a random direction instead of the intended
        one. float.
    algo: IRL algorithm to run; "maxent" and "deep_maxnet" are currently
        supported. str.
    mdp: Which MDP to use, "gridworld" or "objectworld". str.
    """

    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)
    optimal_v = optimal_value(ow.n_states,
                              ow.n_actions, ow.transition_probability,
                              normalize(ground_r), ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)

    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)
    #    ow.plot_grid("value_{}_t{}_e{}_w{}.png".format(algo,
    #                                n_trajectories, epochs, wind), value=optimal_v)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy,
                 value=optimal_v)

    r = []
    ground_svf = []
    if algo == "maxent":
        import irl.maxent as maxent
        ground_svf = maxent.find_svf(ow.n_states, trajectories)
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)
    elif algo == "deep_maxnet":
        import irl.deep_maxent as deep_maxent
        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1], ) + structure,
                            feature_matrix,
                            ow.n_actions,
                            discount,
                            ow.transition_probability,
                            trajectories,
                            epochs,
                            learning_rate,
                            l1=l1,
                            l2=l2)

    recovered_policy = find_policy(ow.n_states,
                                   ow.n_actions,
                                   ow.transition_probability,
                                   normalize(r),
                                   ow.discount,
                                   stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)

    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    #    ow.plot_grid("recovered_value_{}_t{}_e{}_w{}.png".format(algo,
    #                                n_trajectories, epochs, wind),
    #                                value=recovered_v)
    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                 policy=recovered_policy,
                 value=recovered_v)

    #    print("new trajectory")
    #    for t in new_trajectory:
    #        for s, a, rw in t:
    #            print (ow.int_to_point(s), ow.actions[a], rw)
    #        print ("---------")
    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    plt.subplot(111)

    plt.pcolor(x, y, ground_svf.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth SVF")
    plt.savefig("ground_svf_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                       epochs, wind),
                format="png",
                dpi=150)

    plt.pcolor(x, y, recovered_svf.reshape((grid_size, grid_size)))
    plt.title("Recovered SVF")
    plt.savefig("recovered_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                format="png",
                dpi=150)

    plt.pcolor(x, y, normalize(ground_r).reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.savefig("ground_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                format="png",
                dpi=150)

    plt.pcolor(x, y, normalize(r).reshape((grid_size, grid_size)))
    plt.title("Recovered reward")
    plt.savefig("recovered_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                format="png",
                dpi=150)
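An illustrative entry point for the script above (every value here is a placeholder rather than an original default):

if __name__ == '__main__':
    main(grid_size=10,
         discount=0.9,
         n_objects=15,
         n_colours=2,
         n_trajectories=20,
         epochs=200,
         learning_rate=0.01,
         start_state=(0, 0),
         wind=0.1,
         algo="maxent",
         mdp="objectworld")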