Example #1
def main():
    # common style arguments for plotting
    style = {
        'border': {
            'color': 'red',
            'linewidth': 0.5
        },
    }

    # set up the MDP
    world, reward, terminal = setup_mdp()
    start = [0]

    # generate "expert" trajectories
    policy = lambda x: 1
    trajectories = list(
        T.generate_trajectories(5, world, policy, start, terminal))

    # report progress
    print('Starting MaxEnt')

    # maximum entropy inverse reinforcement learning (non-causal)
    reward_maxent = maxent(world, terminal, trajectories)

    # maximum causal entropy inverse reinforcement learning (causal)
    # reward_maxcausal = maxent_causal(world, terminal, trajectories)

    print(reward_maxent)
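
Several of these examples rely on helpers that are not shown: module aliases such as S (a solver module) and T (a trajectory module), a setup_mdp() factory, and a maxent() wrapper around the IRL routine. The sketch below is one plausible set of assumptions, loosely following the irl-maxent package layout; none of these names are guaranteed by the examples themselves, and the maxent() wrapper is not sketched here.

# Hypothetical scaffolding for the examples on this page; the module names
# and the GridWorld constructor are assumptions, not taken from the originals.
import numpy as np

import gridworld as W    # grid-world MDP definitions
import solver as S       # value iteration and policy construction
import trajectory as T   # trajectory sampling utilities


def setup_mdp():
    # a small grid world with a single rewarding terminal state
    world = W.GridWorld(size=5)

    reward = np.zeros(world.n_states)
    reward[-1] = 1.0

    terminal = [world.n_states - 1]
    return world, reward, terminal
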
Example #2
def generate_trajectories_from_policy_list(world,
                                           policy_list,
                                           n_trajectories_per_policy=100):
    start = [0]  # every trajectory starts in state 0
    terminal = [world.size - 1]  # the last state of the world is terminal

    trajectories_list = []
    for policy in policy_list:
        trajectories = list(
            generate_trajectories(n_trajectories_per_policy, world, policy,
                                  start, terminal))
        # keep only the raw transition record of each trajectory
        trajectories = [t._t for t in trajectories]
        trajectories_list.append(trajectories)
    return trajectories_list
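
The helper above strips each trajectory down to its private _t attribute, i.e. the raw transition record. Below is a hedged usage sketch: the world constructor is borrowed from Example #5 with made-up parameter values, and the two policies are toy stand-ins invented for illustration.

# Illustrative usage only; the constructor arguments and both toy policies
# are assumptions made for this sketch.
world = SnakeLadderWorld(size=100, shortcut_density=0.1)

always_small_step = lambda state: 0   # toy policy: always choose action 0
always_big_step = lambda state: 1     # toy policy: always choose action 1

trajectory_sets = generate_trajectories_from_policy_list(
    world,
    [always_small_step, always_big_step],
    n_trajectories_per_policy=10)

# trajectory_sets[i] holds the raw transition tuples recorded under policy i
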
Example #3
def generate_expert_trajectories(world, reward, terminal):
    n_trajectories = 200  # the number of "expert" trajectories
    discount = 0.9  # discount for constructing an "expert" policy
    weighting = lambda x: x**50  # down-weight less optimal actions
    start = [0]  # starting states for the expert

    # compute the value-function
    value = S.value_iteration(world.p_transition, reward, discount)

    # create our stochastic policy using the value function
    policy = S.stochastic_policy_from_value(world, value, w=weighting)

    # a function that executes our stochastic policy by choosing actions according to it
    policy_exec = T.stochastic_policy_adapter(policy)

    # generate trajectories
    tjs = list(
        T.generate_trajectories(n_trajectories, world, policy_exec, start,
                                terminal))

    return tjs, policy
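
A brief usage sketch for the helper above; it reuses the hypothetical setup_mdp() scaffolding sketched after Example #1, so the world, reward and terminal names are assumptions rather than part of this example.

# Illustrative call, reusing the hypothetical setup_mdp() from above.
world, reward, terminal = setup_mdp()
tjs, policy = generate_expert_trajectories(world, reward, terminal)

print(len(tjs))    # 200 sampled expert trajectories
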
Example #4
def generate_trajectories(world, reward, terminal):
    """
    Generate some "expert" trajectories.
    """
    # parameters
    n_trajectories = 5
    print("\nNumber of experts: %d\n" %(n_trajectories))
    discount = 0.7
    weighting = lambda x: x**5

    # set up initial probabilities for trajectory generation
    initial = np.zeros(world.n_states)
    initial[12] = 1.0

    # generate trajectories
    value = S.value_iteration(world.p_transition, reward, discount)
    policy = S.stochastic_policy_from_value(world, value, w=weighting)
    policy_exec = T.stochastic_policy_adapter(policy)
    tjs = list(T.generate_trajectories(n_trajectories, world, policy_exec, initial, terminal))

    return tjs, policy
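
Unlike Examples #1 and #3, which pass a list of start states, this variant hands generate_trajectories a full probability distribution over the initial state (all mass on state 12), so the generator evidently accepts either form. The snippet below contrasts the two start specifications, assuming they are indeed interchangeable.

# Two ways of saying "always start in state 12", assuming the trajectory
# generator accepts both a distribution and a list of states:
initial = np.zeros(world.n_states)   # distribution form, as in this example
initial[12] = 1.0

start = [12]                         # list form, as in Examples #1 and #3
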
Example #5

world = SnakeLadderWorld(size=BOARD_SIZE, shortcut_density=SHORTCUT_DENSITY)

policy_1 = world.oso_policy
policy_2 = world._smartish_policy

policies = [policy_1, policy_2]



start = [0]
terminal = [BOARD_SIZE - 1]

trajectories_exact_1 = \
    list(trajectory.generate_trajectories(NUM_EXACT_TRAJECTORIES, world, policy_1, start, terminal))

trajectories_exact_2 = \
    list(trajectory.generate_trajectories(NUM_EXACT_TRAJECTORIES, world, policy_2, start, terminal))

# raise on floating-point errors instead of silently warning
np.seterr(all='raise')

reward_exact_1 = execute_maxent(world, terminal, trajectories_exact_1)
reward_exact_2 = execute_maxent(world, terminal, trajectories_exact_2)

results = []
for i in range(NUM_PLAYER_TESTS):
    
    true_policy_num = 1 if np.random.uniform(0, 1) > 0.5 else 0