Exemple #1
def learn_with_VI(P, nS, nA, terminal_states):
    """Derive a greedy policy from the state values given by value iteration.

    For every state, the action with the largest expected successor value
    ``sum(prob * V[next_state])`` is selected. Ties keep the lowest action
    index. Immediate rewards and discounting are not part of this
    comparison; only the successor values returned by ``value_iteration``
    are used.

    Args:
        P: Transition model. ``P[state][action]`` is a sequence of outcomes
            whose element 0 is the probability and element 1 is the next
            state.
        nS: Number of states.
        nA: Number of actions.
        terminal_states: Forwarded unchanged to ``value_iteration``.

    Returns:
        Integer array of length ``nS`` with the selected action per state.
    """
    V = value_iteration(P, nS, nA, terminal_states)
    policy = np.zeros(nS, dtype=int)

    for state in range(nS):
        # Start below every reachable value: with a starting point of 0, a
        # state whose actions all have negative expected value would never
        # update and would silently fall back to action 0.
        best_value = float("-inf")
        best_action = 0
        for action in range(nA):
            expected_value = sum(
                outcome[0] * V[outcome[1]] for outcome in P[state][action])
            # Strict comparison keeps the first (lowest-index) maximiser.
            if expected_value > best_value:
                best_value = expected_value
                best_action = action
        policy[state] = best_action

    return policy
Exemple #2
    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    policy = np.zeros((env.nS))
    rs = []
    for ep in range(num_episodes):
        terminal = False
        s = 0
        history = []
        while not terminal:
            #sample an action
            u = np.random.rand(1)
            if u > e:
                a = policy[s]
                a = np.random.randint(env.nA)

            #sample new state
            u = np.random.rand(1)
            for tup in env.P[s][a]:
                u = u - tup[0]
                if u <= 0:
                    t = tup
            history.append((s, a, t[1], t[2], t[3]))

            s = t[1]
            if t[3]:
                terminal = True

        counts, rewards = update_mdp_model_with_history(
            counts, rewards, history)
        P = counts_and_rewards_to_P(counts, rewards)
        value, policy = value_iteration(P, env.nS, env.nA, gamma, 200, 1e-3)
        e = e * decay_rate
    np.save('model_based_rewards.npy', rs)
    return policy
  P = initialize_P(env.nS, env.nA)
  counts = initialize_counts(env.nS, env.nA)
  rewards = initialize_rewards(env.nS, env.nA)


  average_rewards = []

  history = []
  policy = np.zeros(env.nS, dtype=int)
  epsilon = e

  for i in range(num_episodes):
    # _, policy = value_iteration(P, env.nS, env.nA)
    state = env.reset()
    history[:] = []
    done = False
    reward = 0

    while not done:
      eps_check = random.random()
      action = 0

      if eps_check < epsilon:
        action = random.randint(0, env.nA - 1)
        action = policy[state]

      new_state, reward, done, _ = env.step(action)

      new_history = [state, action, reward, new_state, done]
      state = new_state

    if i < 1000:
      prev_reward = average_rewards[i] * i
      new_reward = (prev_reward + reward) / (i + 1)

    counts, rewards = update_mdp_model_with_history(counts, rewards, history)
    epsilon *= decay_rate
    P = counts_and_rewards_to_P(counts, rewards)
    _, policy = value_iteration(P, env.nS, env.nA)

  plt.ylabel('Average reward')

  return policy
Exemple #4
    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    policy = np.zeros(env.nS, dtype=int)
    for i in range(num_episodes):
        s = env.reset()
        done = False
        history = []
        while not done:
            # greedy
            if np.random.random() < e:
                a = np.random.randint(env.nA)
                a = policy[s]
            # update
            next_state, reward, done, _ = env.step(a)
            history.append([s, a, reward, next_state, done])
            s = next_state

        counts, rewards = update_mdp_model_with_history(
            counts, rewards, history)
        P = counts_and_rewards_to_P(counts, rewards)
        V, policy = value_iteration(P, env.nS, env.nA)

        if i % 10 == 0:
            e *= decay_rate

    return policy
    # Model-based learning loop: each episode plans a policy from the current
    # model estimate P, plays one episode with epsilon-greedy action
    # selection, then folds the episode into counts / rewards and rebuilds P.
    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    for episode_idx in tqdm.tqdm(range(num_episodes)):
        # Reset the environment to obtain this episode's starting state
        cur_state = env.reset()

        # Calculate a policy for this episode using what we know about the env so far (P)
        # NOTE(review): the call below is cut off after its first argument in
        # this view; the remaining arguments and the closing parenthesis must
        # be restored before this code can run.
        _, policy = value_iteration(P,

        # Start the episode. The episode ends when we reach a terminal state (i.e. "done is True")
        done = False
        history = []
        episode_reward = 0.0
        while not done:
            # Choose an action "epsilon-greedily" (where epsilon is the var "e")
            action = _choose_egreedy_action(env, cur_state, policy, e)

            # Advance the environment by one step with the chosen action
            next_state, reward, done, _ = env.step(action)

            # Record this step in the history
            history.append((cur_state, action, reward, next_state, done))

            # Move to next state
            cur_state = next_state

            # Accumulate the undiscounted reward of this episode
            episode_reward += reward

        # Fold the finished episode into the empirical model and rebuild P
        counts, rewards = update_mdp_model_with_history(
            counts, rewards, history)
        P = counts_and_rewards_to_P(counts, rewards)

        # If we're running this as part of 5d, then record the scores
        # (episode_scores is not defined in this view; presumably an optional
        # argument of the enclosing function -- TODO confirm)
        if episode_scores is not None:
            # NOTE: Here I am simply recording the undiscounted episode reward.
            episode_scores[episode_idx] = episode_reward

        # Decay the randomness of our action selection (i.e. increase greediness)
        e *= decay_rate

    # Final planning pass on the model learned over all episodes.
    # NOTE(review): this call is also cut off after its first argument.
    _, policy = value_iteration(P,

    return policy
    P = initialize_P(env.nS, env.nA)
    policy = np.zeros((env.nS)).astype(int)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    for k in range(num_episodes):
        print k
        history = render(env, policy, e)
        counts, rewards = update_mdp_model_with_history(
            counts, rewards, history)
        P = counts_and_rewards_to_P(
            counts, rewards
        )  #P[state][action] is a list of (prob, next_state, reward, done) tuples.
        V, policy = value_iteration(P, env.nS, env.nA, gamma)
        e = e**decay_rate


    return policy