コード例 #1
0
def run_value_iteration(transition_probabilities=np.ones((5, 2)) * 0.5,
                        rewards=[1, 0, 0, 0, 10],
                        horizon=10,
                        gamma=0.9):
    """Step a greedy agent through a MarsRover episode while updating a value table.

    At every step the two neighbouring states of the current state are scored
    as ``rewards[neighbour] + gamma * v[neighbour]``; the action with the
    higher score is taken (ties are broken uniformly at random) and the value
    table is handed to ``update_value_function`` together with the observed
    transition. The loop ends when the environment reports ``done`` or the
    update reports convergence.

    Args:
        transition_probabilities: Forwarded unchanged to ``MarsRover``.
        rewards: Per-state rewards; its length determines the number of
            states tracked in the value table.
        horizon: Forwarded unchanged to ``MarsRover``.
        gamma: Discount factor applied to the neighbour's current value.

    Returns:
        Tuple ``(v, i, final_reward)``: the value table, the number of loop
        iterations executed, and the result of ``evaluate_agent(v, env)``.

    Note:
        The default arguments are created once at definition time and are
        shared between calls; they are only read here, never mutated.
    """
    n_states = len(rewards)
    v = np.zeros(n_states)
    env = MarsRover(transition_probabilities, rewards, horizon)
    done = False
    state = env.reset()
    i = 0
    while not done:
        i += 1
        print(f"This is step {i}")
        # Clamp the neighbour indices to the valid state range. Without this,
        # state 0 would read index -1 (Python wraps to the LAST state) and the
        # last state would raise IndexError on state + 1.
        # NOTE(review): assumes the rover stays in place when it would move
        # past either end - confirm against MarsRover.step.
        left = max(state - 1, 0)
        right = min(state + 1, n_states - 1)
        r1 = rewards[left] + gamma * v[left]
        r2 = rewards[right] + gamma * v[right]
        if r1 == r2:
            # No preference between the two neighbours: pick at random.
            action = np.random.randint(2)
        else:
            # Index 0 scores the state - 1 neighbour, index 1 the state + 1 one.
            action = np.argmax([r1, r2])
        new_state, reward, done = env.step(action)
        v, converged = update_value_function(v, state, new_state, reward)
        if converged:
            break
        state = new_state

    final_reward = evaluate_agent(v, env)

    print(
        f"Your agent achieved a final accumulated reward of {final_reward} after {i} update steps."
    )

    return v, i, final_reward
コード例 #2
0
def run_policy_iteration(transition_probabilities=np.ones((5, 2)),
                         rewards=[1, 0, 0, 0, 10],
                         horizon=10):
    """Follow and refine a tabular policy on a MarsRover episode.

    A random 0/1 action is drawn for each of the 5 states, then the policy is
    followed step by step. After every environment step the tables are passed
    to ``update_policy``; the loop stops as soon as that call reports
    convergence or the environment reports the episode as done.

    Args:
        transition_probabilities: Forwarded unchanged to ``MarsRover``.
        rewards: Forwarded unchanged to ``MarsRover``.
        horizon: Forwarded unchanged to ``MarsRover``.

    Returns:
        Tuple ``(pi, i, final_reward)``: the final policy, the number of loop
        iterations executed, and the result of ``evaluate_policy(pi, env)``.
    """
    env = MarsRover(transition_probabilities, rewards, horizon)
    # A 5x2 table (presumably Q-values per state/action) and one action per state.
    qs = np.zeros((5, 2))
    pi = np.random.randint(0, 2, size=5)

    state = env.reset()
    i = 0
    while True:
        i += 1
        print(f"This is step {i}")
        chosen_action = pi[state]
        next_state, reward, episode_over = env.step(chosen_action)
        qs, pi, has_converged = update_policy(
            qs, pi, state, next_state, chosen_action, reward)
        # Convergence is checked first, exactly as before; the episode-end
        # flag only matters when the update has not converged.
        if has_converged or episode_over:
            break
        state = next_state

    final_reward = evaluate_policy(pi, env)
    print(
        f"Your policy achieved a final accumulated reward of {final_reward} after {i} update steps."
    )
    return pi, i, final_reward
コード例 #3
0
def run_value_iteration(transition_probabilities=np.ones((5, 2)) * 0.5,
                        rewards=[1, 0, 0, 0, 10],
                        horizon=10):
    """Skeleton of a value-iteration run on the MarsRover environment.

    Builds the environment, steps it until it reports ``done`` while counting
    and printing the steps, then evaluates and reports the result.

    NOTE(review): this block looks like an unfinished template. Neither
    ``action`` nor ``v`` is assigned anywhere in it, so unless both exist as
    module-level names the first ``env.step(action)`` raises ``NameError``.
    Action selection and the value-table update still need to be written.

    Args:
        transition_probabilities: Forwarded unchanged to ``MarsRover``.
        rewards: Forwarded unchanged to ``MarsRover``.
        horizon: Forwarded unchanged to ``MarsRover``.

    Returns:
        Tuple ``(v, i, final_reward)``: ``v`` (see note above), the number of
        loop iterations executed, and the result of ``evaluate_agent(v, env)``.
    """
    env = MarsRover(transition_probabilities, rewards, horizon)
    done = False
    state = env.reset()  # NOTE(review): `state` is never read again in this block.
    i = 0
    while not done:
        i += 1
        print(f"This is step {i}")
        # TODO: choose `action` before stepping; it is undefined at this point.
        # `new_state` and `reward` are unpacked but not used afterwards.
        new_state, reward, done = env.step(action)

    # TODO: `v` must be created and updated above; it is undefined in this block.
    final_reward = evaluate_agent(v, env)

    print(
        f"Your agent achieved a final accumulated reward of {final_reward} after {i} update steps."
    )

    return v, i, final_reward
def evaluate_policy_dp(pi=np.random.randint(2, size=5),
                       transition_probabilities=np.ones((5, 2)),
                       rewards=[1, 0, 0, 0, 10]):
    """Skeleton for evaluating a policy on the MarsRover environment.

    NOTE(review): this block looks like an unfinished template. The ``while``
    loop below only increments the counter and has no exit, so as written the
    function never reaches its ``print``/``return``. ``v`` is not assigned in
    this block, and ``env`` is built but not used. The update rule and the
    stopping criterion still need to be written.

    Args:
        pi: Policy to evaluate. The default is sampled once, when the
            function is defined, so every call relying on the default shares
            the same array.
        transition_probabilities: Forwarded to ``MarsRover``.
        rewards: Forwarded to ``MarsRover``.

    Returns:
        Tuple ``(v, i)``: ``v`` (see note above) and the iteration counter.
    """
    env = MarsRover(transition_probabilities=transition_probabilities,
                    rewards=rewards)
    i = 0
    # TODO: update `v` inside the loop and break once a stopping criterion holds.
    while True:
        i += 1
    print(f"Policy was evaluated in {i} steps with resulting v {v}")
    return v, i