Example 1
def train(discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the environment MDP.

    Saves the recovered reward to maxent_reward.pkl.

    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    # wind = 0.3
    trajectory_length = 268

    # gw = gridworld.Gridworld(grid_size, wind, discount)
    env = Env()
    trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
                                             env.optimal_policy_deterministic)
    feature_matrix = env.feature_matrix()

    r = maxent.irl(feature_matrix, env.n_actions, discount,
                   env.transition_probability, trajectories, epochs,
                   learning_rate, "alpha_%d.pkl", "alpha_205.pkl", 205)

    pkl.dump(r, open("maxent_reward.pkl", 'wb'))

    return r
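A minimal sketch of the weight update inside maximum entropy IRL, assuming a linear reward r = feature_matrix @ alpha; this is not the project's maxent.irl, and every name and shape below is illustrative only.

import numpy as np

def maxent_weight_update(feature_matrix, trajectories, expected_svf, alpha,
                         learning_rate):
    # feature_matrix: (n_states, d) array with one feature row per state.
    # trajectories:   list of trajectories, each a list of (state, action) pairs.
    # expected_svf:   (n_states,) expected state visitation frequencies under
    #                 the current reward (normally produced by a forward pass).
    # Empirical expert feature expectations, averaged over trajectories.
    expert_fe = np.zeros(feature_matrix.shape[1])
    for trajectory in trajectories:
        for state, _action in trajectory:
            expert_fe += feature_matrix[state]
    expert_fe /= len(trajectories)
    # Feature counts the current reward's policy is expected to accumulate.
    model_fe = feature_matrix.T @ expected_svf
    # Gradient ascent step on the log-likelihood of the expert trajectories.
    return alpha + learning_rate * (expert_fe - model_fe)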
Example 2
def main(discount, n_objects, n_colours, n_trajectories, epochs, learning_rate,
         structure):
    # n_objects and n_colours are chosen arbitrarily here.
    """
    Run deep maximum entropy inverse reinforcement learning on the environment
    MDP.

    Saves the recovered reward to deep_maxent_reward.pkl.

    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions, e.g.,
        () is no neural network (linear maximum entropy) and (3, 4) is two
        hidden layers with dimensions 3 and 4.
    """

    trajectory_length = 268
    l1 = l2 = 0

    env = Env(n_objects, n_colours)

    # ground_r = np.array([env.reward_deep_maxent(s) for s in range(env.n_states)])
    # policy = find_policy(env.n_states, env.n_actions, env.transition_probability,
    #                      ground_r, discount, stochastic=False)
    # trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
    policy = env.get_policy()
    # NOTE: only 2 trajectories are sampled here, so the n_trajectories
    # argument is effectively unused in this version.
    trajectories = env.generate_trajectories(2, trajectory_length,
                                             lambda s: policy[s])
    # feature_matrix = env.feature_matrix_deep_maxent(discrete=False)

    feature_matrix = env.feature_matrix()

    r = deep_maxent.irl((feature_matrix.shape[1], ) + structure,
                        feature_matrix,
                        env.n_actions,
                        discount,
                        env.transition_probability,
                        trajectories,
                        epochs,
                        learning_rate,
                        l1=l1,
                        l2=l2)

    pkl.dump(r, open('deep_maxent_reward.pkl', 'wb'))
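A possible invocation of main, illustrating the structure argument from the docstring; every value below is a placeholder rather than something taken from the original source.

if __name__ == '__main__':
    # structure=() would reduce to linear maximum entropy; (3, 3) stacks two
    # hidden layers of width 3 on top of the state features.
    main(discount=0.9, n_objects=10, n_colours=2, n_trajectories=20,
         epochs=200, learning_rate=0.01, structure=(3, 3))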
Example 3
def train(grid_size, discount):
    """
    Run linear programming inverse reinforcement learning on the environment MDP.

    Saves the recovered reward to lp_reward.pkl.

    grid_size: Grid size. int. (Unused in this version.)
    discount: MDP discount factor. float.
    """

    env = Env(prepare_tp=True)

    r = linear_irl.irl(env.n_states, env.n_actions, env.transition_probability,
                       env.get_policy(), discount, 1, 5)

    pkl.dump(r, open("lp_reward.pkl", 'wb'))
Example 4
def train(discount, n_trajectories, epochs, learning_rate):
    """
    Run gradient-based inverse reinforcement learning (LargeGradientIRL) on the
    environment MDP.

    Saves the recovered reward to lg_reward.pkl and the raw optimisation result
    to lg_result.pkl.

    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    trajectory_length = 268

    env = Env()
    trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
                                             env.optimal_policy_deterministic)

    def feature_function(state):
        # One-hot indicator feature for each state.
        feature = np.zeros(env.n_states)
        feature[state] = 1
        return feature

    def transitionProbability(state_code, action):
        # Map every possible next state to its transition probability.
        # (The original keyed each entry on state_code, which overwrote the
        # same dictionary slot on every iteration.)
        res = {}
        for i in range(env.n_states):
            res[i] = env._transition_probability(state_code, action, i)
        return res

    irl = LargeGradientIRL(env.n_actions, env.n_states, transitionProbability,
                           feature_function, discount, learning_rate,
                           trajectories, epochs)
    result = irl.gradientIterationIRL()

    reward = result[-1][0].reshape(env.n_states, )
    pkl.dump(result, open("lg_result.pkl", 'wb'))
    pkl.dump(reward, open("lg_reward.pkl", 'wb'))

    return reward
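A possible driver block, mirroring the __main__ sections of the other examples; the hyperparameter values are placeholders, not taken from the original source.

if __name__ == '__main__':
    lg_reward = train(discount=0.9, n_trajectories=20, epochs=200,
                      learning_rate=0.01)
    print(lg_reward.shape)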
Example 5
    feature_matrix = env.feature_matrix()

    r = maxent.irl(feature_matrix, env.n_actions, discount,
                   env.transition_probability, trajectories, epochs,
                   learning_rate, "alpha_%d.pkl", "alpha_205.pkl", 205)

    pkl.dump(r, open("maxent_reward.pkl", 'wb'))

    return r


if __name__ == '__main__':
    train(0.01, 1, 400, 0.01)
    rewards = pkl.load(open("maxent_reward.pkl", 'rb'))

    env = Env(prepare_tp=True)

    value = vi.value(env.get_policy(), env.n_states,
                     env.transition_probability, rewards, 0.3)
    opt_value = vi.optimal_value(env.n_states, env.n_actions,
                                 env.transition_probability, rewards, 0.3)
    pkl.dump(value, open("maxent_value.pkl", 'wb'))
    pkl.dump(opt_value, open("maxent_opt_value.pkl", 'wb'))

    value = pkl.load(open("maxent_value.pkl", 'rb'))
    opt_value = pkl.load(open("maxent_opt_value.pkl", 'rb'))

    status = validate(value)
    print(status)
    pkl.dump(status, open("maxent_status.pkl", 'wb'))
    status = validate(opt_value)
Example 6
    discount: MDP discount factor. float.
    """

    env = Env(prepare_tp=True)

    r = linear_irl.irl(env.n_states, env.n_actions, env.transition_probability,
                       env.get_policy(), discount, 1, 5)

    pkl.dump(r, open("lp_reward.pkl", 'wb'))


if __name__ == '__main__':
    # train(5, 0.2)
    rewards = pkl.load(open("lp_reward.pkl", 'rb'))

    env = Env(prepare_tp=True)
    value = vi.value(env.get_policy(), env.n_states,
                     env.transition_probability, rewards, 0.3)
    opt_value = vi.optimal_value(env.n_states, env.n_actions,
                                 env.transition_probability, rewards, 0.3)
    pkl.dump(value, open("lp_value.pkl", 'wb'))
    pkl.dump(opt_value, open("lp_opt_value.pkl", 'wb'))

    # value = pkl.load(open("lp_value.pkl", 'rb'))
    # opt_value = pkl.load(open("lp_opt_value.pkl", 'rb'))

    status = validate(value)
    print(status)
    pkl.dump(status, open("lp_status.pkl", 'wb'))
    status = validate(opt_value)
    print(status)
Example 7
def validate(reward):
    """
    Roll out a greedy policy with respect to the learned reward and compare the
    visited states against the expert trajectory, returning error statistics.
    """
    env = Env(prepare_tp=False)

    def get_next_state_code_by_state(state):
        states = []
        next_time_code = env._next_time_hash_code(state[1])
        for i in range(env.n_actions):
            states.append((env._next_position(state[0], i), next_time_code))
        print(state, "next_state", states)
        return states

    def choose_action(ob):
        states_code = [
            env._get_state_code(s) for s in get_next_state_code_by_state(ob)
        ]
        rewards = np.array([reward[sc] for sc in states_code])
        max_actions = np.where(rewards == np.max(rewards))

        return np.random.choice(max_actions[0])

    def run(env):
        env.reset()

        print("start running")
        i = 0
        img, observation, terminal = env.frame_step(env.do_nothing)
        states = [env._get_ob_str(observation, 0)]

        while not terminal:
            i += 1
            ob = env._get_ob_str(observation, i)
            action = choose_action(ob)

            img, observation_, terminal = env.frame_step(action)

            observation = observation_
            states.append(env._get_ob_str(observation, i + 1))

            if i >= len(env.expert_states):
                print("Done")
                break
            if terminal:
                print("Terminal: your airplane died")
                break
        return states

    length = len(env.expert_states)
    states = run(env)

    E_time = 0
    for rob, exp in zip(states, env.expert_states):
        if not (rob[0] == exp[0] and rob[1] == exp[1]):
            E_time += 1
    print("E_time", len(env.expert_states), len(states))
    E_time += len(env.expert_states) - len(states)

    # self.status["KLnorm"].append(KL_norm)
    # self.status["KLAbsSum"].append(KL_abs_sum)
    # self.status["Error"].append(E)
    status = {
        "ErrorRate": E_time / length,
        "ErrorTime": E_time,
        "ResultLength": len(states),
        "ExpertLength": len(env.expert_states),
    }
    return status
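A possible way to drive validate, following the pattern of the __main__ blocks in Examples 5 and 6; the pickle filename is the one written by Example 1 and may differ in practice.

if __name__ == '__main__':
    import pickle as pkl
    learned_reward = pkl.load(open("maxent_reward.pkl", 'rb'))
    print(validate(learned_reward))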
Example 8
from PIL import Image
import pickle
import sys

from airplane import Env

gs = Env()
terminal = False

while not terminal:
    action = input()

    if action == "0":
        img, [x, y], terminal = gs.frame_step(0)
    elif action == '1':
        img, [x, y], terminal = gs.frame_step(1)
    elif action == 's':
        break
    elif action == 'r':
        gs.reset(hard=True)
        continue  # no new frame on a reset, so skip the print below
    else:  # elif action=='2':
        img, [x, y], terminal = gs.frame_step(2)
    print([x, y], terminal)

    if terminal:
        break

# graph={}
# prev=None
# for i in st:
#     # cnt=0
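A non-interactive variant of the loop above, for illustration only: it rolls the environment forward with random actions instead of keyboard input, using the same frame_step interface and the Env already imported at the top of this example.

import random

env = Env()
terminal = False
steps = 0
while not terminal and steps < 50:
    # Actions 0, 1 and 2 are the same ones accepted by the keyboard loop above.
    img, [x, y], terminal = env.frame_step(random.randrange(3))
    steps += 1
print("stopped after", steps, "steps at", [x, y])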