Example #1
def train(discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the Flappy Bird game
    MDP and save the recovered reward function to disk.

    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    trajectory_length = 276

    env = Game.GameState()
    # Sample expert demonstrations by following the deterministic optimal policy.
    trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
                                             env.optimal_policy_deterministic)
    feature_matrix = env.feature_matrix()

    # Recover the reward with maximum entropy IRL. The last three arguments
    # appear to be a project-specific extension of maxent.irl: a checkpoint
    # filename template, a checkpoint to resume from, and its epoch index.
    r = maxent.irl(feature_matrix, env.n_actions, discount,
                   env.transition_probability, trajectories, epochs,
                   learning_rate, "flappy_alpha_%d.pkl", "flappy_alpha_96.pkl",
                   96)

    pkl.dump(r, open("flappy_maxent_reward.pkl", 'wb'))

    return r
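
A minimal driver sketch for this example, assuming it lives in the same module as train() above so that Game and maxent are importable; the hyperparameter values are purely illustrative:

import pickle as pkl

if __name__ == '__main__':
    # Recover a reward from a handful of sampled trajectories (illustrative values).
    r = train(discount=0.9, n_trajectories=20, epochs=200, learning_rate=0.01)

    # train() also persists the result, so it can be reloaded later:
    r = pkl.load(open("flappy_maxent_reward.pkl", 'rb'))
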
Example #2
def train(discount):
    """
    Run linear programming inverse reinforcement learning on the Flappy Bird
    game MDP and save the recovered reward function to disk.

    discount: MDP discount factor. float.
    """

    env = Game.GameState()

    # The two trailing constants are presumably the Rmax bound and the L1
    # penalty weight expected by linear_irl.irl.
    r = linear_irl.irl(env.n_states, env.n_actions, env.transition_probability,
                       env.get_policy(), discount, 1, 5)

    pkl.dump(r, open("flappy_lp_reward.pkl", 'wb'))
Example #3
def main(discount, n_objects, n_colours, n_trajectories, epochs, learning_rate, structure):
    # n_objects and n_colours are given arbitrarily (they are not used below)
    """
    Run deep maximum entropy inverse reinforcement learning on the Flappy Bird
    game MDP and save the recovered reward function to disk.

    discount: MDP discount factor. float.
    n_objects: Number of objects (unused here). int.
    n_colours: Number of colours (unused here). int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions, e.g.,
        () is no neural network (linear maximum entropy) and (3, 4) is two
        hidden layers with dimensions 3 and 4.
    """

    trajectory_length = 268
    l1 = l2 = 0  # no L1/L2 regularisation on the network weights

    # env = Env(n_objects, n_colours)
    env = Game.GameState()

    # ground_r = np.array([env.reward_deep_maxent(s) for s in range(env.n_states)])
    # policy = find_policy(env.n_states, env.n_actions, env.transition_probability,
    #                      ground_r, discount, stochastic=False)
    # trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
    trajectories = env.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            env.optimal_policy_deterministic)
    # feature_matrix = env.feature_matrix_deep_maxent(discrete=False)

    feature_matrix = env.feature_matrix()


    r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix,
        env.n_actions, discount, env.transition_probability, trajectories, epochs,
        learning_rate, l1=l1, l2=l2)

    pkl.dump(r, open('flappy_deep_maxent_reward.pkl', 'wb'))
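
A hypothetical invocation of main, using the structure convention from the docstring; n_objects and n_colours are placeholders since they are unused, and the other values are illustrative:

if __name__ == '__main__':
    # Linear maximum entropy (no hidden layers):
    main(0.9, 0, 0, 20, 200, 0.01, ())

    # Deep variant with two hidden layers of 3 and 4 units, per the docstring:
    main(0.9, 0, 0, 20, 200, 0.01, (3, 4))
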
Example #4
def train(discount, n_trajectories, epochs, learning_rate):
    """
    Run gradient-based inverse reinforcement learning (LargeGradientIRL) on the
    Flappy Bird game MDP and save the recovered reward function to disk.

    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    trajectory_length = 276

    env = Game.GameState()
    trajectories = env.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            env.optimal_policy_deterministic)

    def feature_function(state):
        # One-hot indicator feature vector for the given state.
        feature = np.zeros(env.n_states)
        feature[state] = 1
        return feature

    def transitionProbability(state_code, action):
        # Map each successor state i to its transition probability from
        # state_code under the given action.
        res = {}
        for i in range(env.n_states):
            res[i] = env._transition_probability(state_code, action, i)
        return res

    irl = LargeGradientIRL(env.n_actions, env.n_states, transitionProbability,
                           feature_function, discount, learning_rate, trajectories, epochs)
    result = irl.gradientIterationIRL()

    reward = result[-1][0].reshape(env.n_states,)
    pkl.dump(result, open("lg_result.pkl", 'wb'))
    pkl.dump(reward, open("lg_reward.pkl", 'wb'))

    return reward
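
For reference, the one-hot feature_function above yields the identity matrix when stacked over all states; a standalone illustration with a toy state count:

import numpy as np

n_states = 4                          # toy value for illustration

def one_hot(state, n_states):
    feature = np.zeros(n_states)
    feature[state] = 1                # indicator for the given state
    return feature

features = np.vstack([one_hot(s, n_states) for s in range(n_states)])
assert (features == np.eye(n_states)).all()   # stacking all states gives the identity
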
Example #5
    irl = LargeGradientIRL(env.n_actions, env.n_states, transitionProbability,
                           feature_function, discount, learning_rate, trajectories, epochs)
    result = irl.gradientIterationIRL()

    reward = result[-1][0].reshape(env.n_states,)
    pkl.dump(result, open("lg_result.pkl", 'wb'))
    pkl.dump(reward, open("lg_reward.pkl", 'wb'))

    return reward

if __name__ == '__main__':
    with tf.device('/cpu:0'):
        train(0.01, 1, 400, 0.01)
    # Note: this loads the reward produced by the maxent run
    # ("flappy_maxent_reward.pkl"), not the "lg_reward.pkl" written by train().
    rewards = pkl.load(open("flappy_maxent_reward.pkl", 'rb'))

    env = Game.GameState(prepare_tp=True)

    # Evaluate the demonstrated policy and compute the optimal value function
    # under the recovered reward, both with discount 0.3.
    value = vi.value(env.get_policy(), env.n_states, env.transition_probability, rewards, 0.3)
    opt_value = vi.optimal_value(env.n_states, env.n_actions, env.transition_probability, rewards, 0.3)
    pkl.dump(value, open("flappy_maxent_value.pkl", 'wb'))
    pkl.dump(opt_value, open("flappy_maxent_opt_value.pkl", 'wb'))


    value = pkl.load(open("flappy_maxent_value.pkl", 'rb'))
    opt_value = pkl.load(open("flappy_maxent_opt_value.pkl", 'rb'))

    status = validate(value)
    print(status)
    pkl.dump(status, open("flappy_maxent_status.pkl", 'wb'))
    status = validate(opt_value)
    print(status)
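
validate is a project-specific helper not shown in these snippets; a rough way to compare the two value functions directly, assuming vi.value and vi.optimal_value return per-state numpy arrays, would be:

import numpy as np

gap = np.asarray(opt_value) - np.asarray(value)   # optimal minus evaluated, per state
print("mean value gap: %.4f, max value gap: %.4f" % (gap.mean(), gap.max()))
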