Example No. 1
def train_sagil(X: np.ndarray, y: np.ndarray, original_policy: policy.TabularPolicy, num_epochs: int = 100,
                batch_size: int = 100, learning_rate: float = 0.001,
                approximate_weights: bool = False) -> policy.TabularPolicy:
    sagil_policy = policy.TabularPolicy(env.num_states, env.num_actions)
    # Stationary state distribution induced by the data-collecting (advice) policy.
    advice_state_dist = grid.tabular.stationary_state_distribution(original_policy.matrix)
    for _ in tqdm.trange(num_epochs):
        indices = np.arange(len(X))
        np.random.shuffle(indices)
        for batch in range(int(np.ceil(len(X)/batch_size))):
            batch_indices = indices[batch*batch_size:(batch + 1)*batch_size]
            # Stationary state distribution of the current learner policy and the derivative
            # of its log with respect to the policy parameters.
            state_distribution = grid.tabular.stationary_state_distribution(sagil_policy.matrix)
            gradient_function = grid.tabular.log_stationary_derivative(
                sagil_policy.matrix, sagil_policy.log_gradient_matrix, state_distribution
            )

            rows = np.arange(len(batch_indices))
            if approximate_weights:
                # Approximate the state-distribution ratio by the ratio of the two policies'
                # probabilities for the demonstrated action.
                weights = (sagil_policy.probabilities(X[batch_indices])[rows, y[batch_indices]] /
                           original_policy.probabilities(X[batch_indices])[rows, y[batch_indices]])
            else:
                # Exact importance weights: learner vs. demonstrator stationary state distribution.
                weights = state_distribution[X[batch_indices]]/advice_state_dist[X[batch_indices]]
            weights /= len(weights)
            pi_theta = sagil_policy.probabilities(X[batch_indices])[rows, y[batch_indices]]
            # Weighted log-likelihood gradient plus a correction term that accounts for how the
            # stationary state distribution itself changes with the policy parameters.
            gradient = sagil_policy.log_gradient(X[batch_indices], y[batch_indices], weights)
            lsd = gradient_function[X[batch_indices]]
            gradient += np.sum((weights * np.log(pi_theta))[:, np.newaxis, np.newaxis] * lsd, axis=0)
            sagil_policy.parameters += learning_rate * gradient

    return sagil_policy
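
The helper grid.tabular.stationary_state_distribution is not part of these excerpts. As a rough illustration of what it presumably computes, here is a minimal, self-contained sketch that finds the stationary state distribution of the Markov chain induced by a tabular policy; the transition tensor argument and the function name are illustrative assumptions, not the project's actual API.

import numpy as np

def stationary_state_distribution_sketch(policy_matrix: np.ndarray,
                                         transitions: np.ndarray,
                                         num_iterations: int = 1000) -> np.ndarray:
    """Stationary distribution of the state chain induced by a policy.

    policy_matrix: (num_states, num_actions) action probabilities per state.
    transitions:   (num_states, num_actions, num_states) dynamics P(s' | s, a).
    """
    # State-to-state transition matrix under the policy: T[s, s'] = sum_a pi(a|s) P(s'|s, a).
    transition_matrix = np.einsum('sa,sat->st', policy_matrix, transitions)
    # Power iteration: repeatedly push a distribution through the chain until it stops changing.
    distribution = np.full(policy_matrix.shape[0], 1.0/policy_matrix.shape[0])
    for _ in range(num_iterations):
        distribution = distribution @ transition_matrix
    return distribution

For small, well-mixing chains a few hundred power-iteration steps suffice; the real helper may instead solve for the leading left eigenvector directly.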
Example No. 2
def _run():
    initial_policy = policy.TabularPolicy(env.num_states, env.num_actions)
    # Bias the initial policy towards UP and RIGHT (RIGHT gets half the weight of UP).
    initial_policy.parameters[policy.BIAS, grid.RIGHT] = 5 + np.log(0.5)
    initial_policy.parameters[policy.BIAS, grid.UP] = 5

    # d = grid.tabular.stationary_state_distribution(initial_policy.matrix)
    print()

    # Roll out the initial policy and label every visited state with the advice action.
    states = []
    actions = []
    for episode in range(1000):
        is_terminal = False
        state = env.reset()
        while not is_terminal:
            action = np.random.choice(env.num_actions, p=initial_policy.probabilities(state)[0])
            advice_action = advice(state)
            states.append(state)
            actions.append(advice_action)
            state, _, is_terminal, _ = env.step(action)
    states = np.array(states)
    actions = np.array(actions)

    # Baseline 1: a scikit-learn classifier fitted to the advice labels.
    sklearn_policy = SklearnModel(states, actions)
    sklearn_length = test_policy(sklearn_policy)
    print(f"sklearn length: {sklearn_length}")

    # Baseline 2: plain behavioural cloning on the same data.
    # noinspection PyUnusedLocal
    supervised_policy = train_supervised(states, actions)
    supervised_length = test_policy(supervised_policy)
    print(f"supervised length: {supervised_length}")

    # SAGIL training, reweighting samples by the stationary-state-distribution ratio.
    advice_policy = train_sagil(states, actions, initial_policy)
    advice_length = test_policy(advice_policy)
    print(f"advice length: {advice_length}")
Example No. 3
def train_supervised(X: np.ndarray, y: np.ndarray, num_epochs: int = 100, batch_size: int = 100,
                     learning_rate: float = 0.001) -> policy.TabularPolicy:
    # Plain behavioural cloning: mini-batch gradient ascent on the log-likelihood of the advice actions.
    supervised_policy = policy.TabularPolicy(env.num_states, env.num_actions)
    for _ in tqdm.trange(num_epochs):
        indices = np.arange(len(X))
        np.random.shuffle(indices)
        for batch in range(int(np.ceil(len(X)/batch_size))):
            batch_indices = indices[batch*batch_size:(batch + 1)*batch_size]
            gradient = supervised_policy.log_gradient(X[batch_indices], y[batch_indices])
            supervised_policy.parameters += learning_rate * gradient
    return supervised_policy
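
TabularPolicy.log_gradient is not shown in these excerpts. Assuming a softmax parameterization theta[s, a] (and ignoring the extra BIAS row used in Example No. 2), the gradient of the (optionally weighted) batch log-likelihood would look roughly like the following sketch; the function name is illustrative only.

import numpy as np

def softmax_log_gradient_sketch(parameters: np.ndarray, states: np.ndarray, actions: np.ndarray,
                                weights=None) -> np.ndarray:
    """Gradient of sum_i w_i * log pi(actions[i] | states[i]) for pi(a|s) = softmax(parameters[s, :]).

    For a softmax policy, d log pi(a|s) / d theta[s, b] = 1{b == a} - pi(b|s);
    rows of theta belonging to states not in the batch receive zero gradient.
    """
    logits = parameters[states]                            # (batch, num_actions)
    probs = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)              # pi(.|s) per sample
    if weights is None:
        weights = np.ones(len(states))
    per_sample = -probs                                    # -pi(b|s) term
    per_sample[np.arange(len(states)), actions] += 1.0     # +1 for the taken action
    gradient = np.zeros_like(parameters, dtype=float)
    np.add.at(gradient, states, weights[:, np.newaxis] * per_sample)  # accumulate per visited state
    return gradient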
Example No. 4
def _run():
    chain = build_chain()
    current_policy = policy.TabularPolicy(num_states, chain.num_actions)
    current_policy.parameters = np.array([[1, 1], [3, 1], [1, 1], [3, 2], [2, 2.5], [0, 0]])
    # Analytic stationary state distribution of the chain under the current policy.
    stationary_distribution = chain.stationary_state_distribution(current_policy.matrix)
    print(stationary_distribution)
    env = environments.tabular.TabularEnv(chain)
    actions = []
    state = env.reset()
    states = [state]
    for _ in range(10000):
        action = np.random.choice(2, p=current_policy.probabilities([state])[0])
        actions.append(action)
        state, _, _, _ = env.step(action)
        states.append(state)
    # Empirical visit frequencies from the rollout should approximate the analytic distribution.
    print(np.bincount(states)/len(states))
    # Derivative of the log stationary distribution with respect to the policy parameters.
    lsd = chain.log_stationary_derivative(current_policy.matrix, current_policy.log_gradient_matrix,
                                          stationary_distribution)
    print(lsd[:, 0, 1])


    # Estimate state values on the time-reversed chain, using log pi(a|s) as the per-step reward
    # and an average-adjusted TD(0)-style update (mu tracks the running average of log pi).
    values = np.zeros((num_states,))
    mu = 0
    lr = 0.01
    log_probabilities = np.log(current_policy.probabilities(np.arange(num_states)))
    for state, action, next_state in zip(states[:-1], actions[:-1], states[1:]):
        logpi = log_probabilities[state][action]
        mu += lr * (logpi - mu)
        values[next_state] += lr * (logpi - mu + values[state] - values[next_state])

    # Recover the implied reverse rewards from the value estimates (deterministic-transition case).
    reverse_rewards = np.zeros((num_states, num_actions))
    for state, action, next_state in zip(states[:-1], actions[:-1], states[1:]):
        reverse_rewards[state, action] = values[next_state] - values[state]
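
As a sanity check on quantities like lsd above, the derivative of the log stationary distribution can be compared against a finite-difference estimate on a small, self-contained example; everything below (random chain, softmax policy, function name) is illustrative rather than part of the project.

import numpy as np

def finite_difference_lsd_check(num_states: int = 4, num_actions: int = 2, eps: float = 1e-5) -> None:
    rng = np.random.default_rng(0)
    transitions = rng.random((num_states, num_actions, num_states))
    transitions /= transitions.sum(axis=2, keepdims=True)        # P(s'|s, a)
    theta = rng.normal(size=(num_states, num_actions))            # softmax policy parameters

    def stationary(params: np.ndarray) -> np.ndarray:
        pi = np.exp(params - params.max(axis=1, keepdims=True))
        pi /= pi.sum(axis=1, keepdims=True)
        chain_matrix = np.einsum('sa,sat->st', pi, transitions)
        d = np.full(num_states, 1.0/num_states)
        for _ in range(5000):
            d = d @ chain_matrix
        return d

    base = np.log(stationary(theta))
    # Finite-difference estimate of d log d(s) / d theta[0, 1], analogous to lsd[:, 0, 1] above.
    perturbed = theta.copy()
    perturbed[0, 1] += eps
    print((np.log(stationary(perturbed)) - base) / eps)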