Example #1
                    stored_actions[(t + 1) % (n + 1)] = action
            tau = t - n + 1

            if tau >= 0:
                # calculate G(tau:tau+n)
                G = np.sum([
                    discount_factor**(i - tau - 1) * stored_rewards[i % (n + 1)]
                    for i in range(tau + 1, min(tau + n, T) + 1)
                ])

                # bootstrap with Q(S_{tau+n}, A_{tau+n}) if the episode has not ended yet
                if tau + n < T:
                    s_n = stored_states[(tau + n) % (n + 1)]
                    a_n = stored_actions[(tau + n) % (n + 1)]
                    G += discount_factor**n * Q[s_n][a_n]

                tau_s = stored_states[tau % (n + 1)]
                tau_a = stored_actions[tau % (n + 1)]

                # update Q value with n step return
                Q[tau_s][tau_a] += alpha * (G - Q[tau_s][tau_a])

    return Q, stats


if __name__ == '__main__':
    Q, stats = n_step_sarsa(env, num_episodes=300, n=10)
    plots.plot_episode_stats(stats, file='results/n_step_sarsa/')
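
For reference, the target formed above is the n-step return: the discounted sum of the next n rewards, plus gamma**n times Q(S_{tau+n}, A_{tau+n}) when the episode has not yet terminated. Below is a minimal standalone sketch of that computation; the function name and the toy values are mine, not from the example.

import numpy as np

def n_step_return(stored_rewards, bootstrap_q, tau, n, T, gamma):
    """n-step SARSA target: discounted rewards plus an optional bootstrap value."""
    G = np.sum([gamma**(i - tau - 1) * stored_rewards[i % (n + 1)]
                for i in range(tau + 1, min(tau + n, T) + 1)])
    if tau + n < T:
        G += gamma**n * bootstrap_q  # stands in for Q[S_{tau+n}][A_{tau+n}]
    return G

# toy check: three rewards of 1.0, bootstrap value 5.0
print(n_step_return([0.0, 1.0, 1.0, 1.0], 5.0, tau=0, n=3, T=10, gamma=0.9))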
Example #2
                    action_probs = behavior_policy(state)
                    action = np.random.choice(np.arange(nA), p=action_probs)
                    stored_actions[(t+1) % (n+1)] = action
            tau = t - n + 1
            if tau >= 0:
                # calculate the importance-sampling ratio rho
                rho = np.prod([
                    target_policy(stored_states[i % (n + 1)])[stored_actions[i % (n + 1)]] /
                    behavior_policy(stored_states[i % (n + 1)])[stored_actions[i % (n + 1)]]
                    for i in range(tau + 1, min(tau + n - 1, T - 1) + 1)
                ])

                # calculate the n-step return
                G = np.sum([
                    (gamma**(i - tau - 1)) * stored_rewards[i % (n + 1)]
                    for i in range(tau + 1, min(tau + n, T) + 1)
                ])

                # bootstrap with the expected value under the target policy
                if tau + n < T:
                    s_n = stored_states[(tau + n) % (n + 1)]
                    expected_sarsa_update = np.sum(
                        [target_policy(s_n)[a] * Q[s_n][a] for a in range(nA)])
                    G += (gamma**n) * expected_sarsa_update
                    
                s_tau, a_tau = stored_states[tau % (n+1)], stored_actions[tau % (n+1)]
                
                td_error = G - Q[s_tau][a_tau]
                Q[s_tau][a_tau] += alpha * rho * td_error
    return Q, stats


if __name__=='__main__':
    Q, stats = n_step_expected_sarsa(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/n_step_off_policy_expected_sarsa/')
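
The excerpt samples actions from behavior_policy and corrects the update with the ratio rho of target to behavior probabilities. Below is a minimal sketch of the kind of policy pair such an off-policy agent typically uses (an epsilon-greedy behavior policy and a greedy target policy). The helper names are illustrative and not necessarily those used in the full script.

import numpy as np

def make_epsilon_greedy(Q, epsilon, nA):
    """Behavior policy: epsilon-greedy action probabilities over Q."""
    def policy(state):
        probs = np.ones(nA) * epsilon / nA
        probs[np.argmax(Q[state])] += 1.0 - epsilon
        return probs
    return policy

def make_greedy(Q, nA):
    """Target policy: all probability mass on the greedy action."""
    def policy(state):
        probs = np.zeros(nA)
        probs[np.argmax(Q[state])] = 1.0
        return probs
    return policy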
Example #3
            next_state, reward, done, _ = env.step(action)
            
            next_action_probs = policy(next_state)
            
            next_action = np.random.choice(env.action_space.n, p=next_action_probs)
            
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            
            td_error = reward + (discount_factor * Q[next_state][next_action]) - Q[state][action]
            E[state][action] += 1
            
            # update all seen state-action pairs and decay their traces
            for s in Q:
                for a_ in range(nA):
                    Q[s][a_] += alpha * td_error * E[s][a_]
                    E[s][a_] *= discount_factor * lambd
            
            if done:
                break
            
            state = next_state
            action = next_action
    
    return Q, stats


if __name__=='__main__':
    Q, stats = sarsa_lambd(env, 300)
    plots.plot_episode_stats(stats, file='results/sarsa_lambda/')
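
The eligibility-trace table E used above is not shown in this excerpt. A minimal sketch of how it is presumably set up next to Q, guessed from the E[s][a] indexing; the action count here is illustrative, and traces are typically reset at the start of each episode.

from collections import defaultdict
import numpy as np

nA = 4  # illustrative; in the script this comes from the environment
Q = defaultdict(lambda: np.zeros(nA))  # action-value estimates
E = defaultdict(lambda: np.zeros(nA))  # accumulating eligibility traces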
Example #4
                state = next_state
                
            for t, transition in enumerate(trajectory):
                # get total reward
                total_return = sum(self.gamma**i * tr.reward for i, tr in enumerate(trajectory[t:]))
                
                # get value_estimate
                value_estimate = self.value_estimator.predict(transition.state).detach()
                
                # advantage = return - baseline; value_estimate is detached, so this
                # only scales the policy gradient and no gradient reaches the baseline
                advantage = torch.FloatTensor([total_return]) - value_estimate
                advantage = torch.FloatTensor([advantage.item()])  # plain 1-element tensor
                
                # update value estimator
                self.value_estimator.update(transition.state, 
                                            torch.FloatTensor([total_return]), 
                                            self.value_optimizer)
                
                # update policy estimator
                action = torch.LongTensor([transition.action])
                self.policy_estimator.update(transition.state, 
                                             advantage, 
                                             action, 
                                             self.policy_optimizer)
                
        return stats

if __name__=="__main__":
    agent = ReinforceBaselineAgent(env.observation_space.n, action_size, 2000)
    stats = agent.train()
    plots.plot_episode_stats(stats, smoothing_window=25, file='results/pytorch_reinforce/')
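
The update() methods called on the two estimators are defined outside this excerpt. The sketch below shows plausible bodies matching the argument lists used above: MSE regression toward the return for the baseline, and a -log pi(a|s) * advantage loss for the policy. This is a guess, not the script's actual implementation.

import torch
import torch.nn.functional as F

def value_update(estimator, state, target, optimizer):
    """Fit the baseline V(s) toward the observed return (MSE loss)."""
    prediction = estimator.predict(state)   # graph-attached V(s)
    loss = F.mse_loss(prediction, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

def policy_update(estimator, state, advantage, action, optimizer):
    """REINFORCE-with-baseline step: minimize -log pi(a|s) * advantage."""
    action_probs = estimator.predict(state)  # assumed: 1-d tensor of action probabilities
    loss = -torch.log(action_probs[action]) * advantage
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()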
Example #5
            # sample action from behavior policy
            action_probs = policy(state)
            action = np.random.choice(env.action_space.n, p=action_probs)

            # take action and observe environment's effects
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # greedy next action (Q-learning's target policy is deterministic)
            next_action = np.argmax(Q[next_state])

            td_target = reward + discount_factor * Q[next_state][next_action]

            # update Q value
            Q[state][action] += alpha * (td_target - Q[state][action])

            if done:
                break

            state = next_state

    return Q, stats


if __name__ == '__main__':
    Q, stats = q_learning(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/Q_learning/')
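
Since the next action is the greedy one, the target above is just the usual Q-learning max. A tiny numeric illustration with made-up values:

import numpy as np

Q_next = np.array([0.2, 1.5, -0.3])   # hypothetical Q[next_state]
reward, discount_factor = 1.0, 0.99

td_target = reward + discount_factor * Q_next[np.argmax(Q_next)]
assert td_target == reward + discount_factor * np.max(Q_next)  # equivalent forms
print(td_target)  # 1.0 + 0.99 * 1.5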
Example #6
                                                                 (n + 1)]]

                for k in range(min(t + 1, T), tau, -1):
                    if k == T:
                        G = stored_rewards[T % (n + 1)]
                    else:
                        s_k = stored_states[k % (n + 1)]
                        a_k = stored_actions[k % (n + 1)]
                        r_k = stored_rewards[k % (n + 1)]
                        sigma_k = stored_sigma[k % (n + 1)]
                        rho_k = stored_rho[k % (n + 1)]
                        # expected Q value at S_k under the target policy
                        v_ = np.sum([target_policy(s_k)[a] * Q[s_k][a]
                                     for a in range(nA)])
                        # blend the sampled (sigma=1) and expected (sigma=0) corrections
                        weight = sigma_k * rho_k + (1 - sigma_k) * target_policy(s_k)[a_k]
                        G = r_k + gamma * weight * (G - Q[s_k][a_k]) + gamma * v_

                s_tau = stored_states[tau % (n + 1)]
                a_tau = stored_actions[tau % (n + 1)]
                td_error = G - Q[s_tau][a_tau]
                Q[s_tau][a_tau] += alpha * td_error

    return Q, stats


if __name__ == '__main__':
    Q, stats = q_sigma(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/n_step_q_sigma/')
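
The inner recursion above is the n-step Q(sigma) backup: each step blends a SARSA-style, importance-sampled correction (sigma=1) with a tree-backup-style expected correction (sigma=0). One backward step, restated in isolation with the quantities spelled out as arguments (the names are mine):

def q_sigma_step(G, r_k, q_sk_ak, pi_ak, sigma_k, rho_k, v_bar, gamma):
    """One backward step of the n-step Q(sigma) return.

    G        -- return accumulated from later time steps
    q_sk_ak  -- Q(S_k, A_k)
    pi_ak    -- target-policy probability of A_k in S_k
    v_bar    -- expected Q value at S_k under the target policy
    """
    weight = sigma_k * rho_k + (1.0 - sigma_k) * pi_ak
    return r_k + gamma * weight * (G - q_sk_ak) + gamma * v_bar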
Example #7
            state = next_state

    return stats


if __name__ == "__main__":

    estimator = Estimator()

    # Note: for Mountain Car we don't actually need an epsilon > 0.0, because the
    # initial value estimates are "optimistic" (higher than the true returns),
    # which by itself drives exploration of all states.
    stats = sarsa(env, estimator, 200, epsilon=0.0)

    plots.plot_cost_to_go_mountain_car(env, estimator, file='results/sarsa/')
    plots.plot_episode_stats(stats, smoothing_window=25, file='results/sarsa/')

    # uncomment to render
    # for i_episode in range(20):
    #     print(i_episode)
    #     policy = make_epsilon_greedy_policy(
    #             estimator, 0.0, env.action_space.n)
    #     observation = env.reset()
    #     for t in itertools.count():
    #         env.render()
    #         action = np.argmax(policy(observation))
    #         observation, reward, done, info = env.step(action)
    #         if done:
    #             print("Episode finished after {} timesteps".format(t+1))
    #             break
    # env.close()
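
The commented-out rendering loop calls make_epsilon_greedy_policy, which is not shown in this excerpt. A minimal sketch of what it presumably looks like, assuming the Estimator exposes a predict(observation) that returns one value per action:

import numpy as np

def make_epsilon_greedy_policy(estimator, epsilon, nA):
    """Return a function mapping an observation to epsilon-greedy action probabilities."""
    def policy_fn(observation):
        q_values = estimator.predict(observation)
        probs = np.ones(nA, dtype=float) * epsilon / nA
        probs[np.argmax(q_values)] += 1.0 - epsilon
        return probs
    return policy_fn

With epsilon=0.0 this is purely greedy, which is enough here because of the optimistic initial estimates noted above.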
Example #8
            state = next_state
    return stats


if __name__=="__main__":
    
    estimator = Estimator()

    # Note: for Mountain Car we don't actually need an epsilon > 0.0, because the
    # initial value estimates are "optimistic" (higher than the true returns),
    # which by itself drives exploration of all states.
    stats = q_learning(env, estimator, 200, epsilon=0.0)


    plots.plot_cost_to_go_mountain_car(env, estimator, file='results/q_learning/')
    plots.plot_episode_stats(stats, smoothing_window=25, file='results/q_learning/')


    # uncomment to render
    # for i_episode in range(20):
    #     print(i_episode)
    #     policy = make_epsilon_greedy_policy(
    #             estimator, 0.0, env.action_space.n)
    #     observation = env.reset()
    #     for t in itertools.count():
    #         env.render()
    #         action = np.argmax(policy(observation))
    #         observation, reward, done, info = env.step(action)
    #         if done:
    #             print("Episode finished after {} timesteps".format(t+1))
    #             break
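
For comparison with the tabular version earlier, here is a sketch of the semi-gradient Q-learning step this script presumably performs inside its (truncated) episode loop. The update(state, action, target) signature is an assumption, not taken from the excerpt.

import numpy as np

def q_learning_step(estimator, state, action, reward, next_state, done,
                    discount_factor=1.0):
    """One semi-gradient Q-learning update with a function approximator."""
    q_next = estimator.predict(next_state)      # values for all actions at s'
    td_target = reward if done else reward + discount_factor * np.max(q_next)
    estimator.update(state, action, td_target)  # assumed signature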
Example #9
                    s_t1 = stored_states[(t + 1) % (n + 1)]
                    # calculate the expected value over the leaf actions
                    leaf_sum = np.sum([(target_policy(s_t1)[a]) * Q[s_t1][a]
                                       for a in range(env.nA)])
                    G = stored_rewards[(t + 1) % (n + 1)] + gamma * leaf_sum

                for k in range(min(t, T - 1), tau, -1):
                    # get kth action and state
                    s_k = stored_states[k % (n + 1)]
                    a_k = stored_actions[k % (n + 1)]
                    # expected value over the actions *not* taken at S_k
                    a_probs = np.sum([
                        target_policy(s_k)[a] * Q[s_k][a]
                        for a in range(nA) if a != a_k
                    ])
                    # off-branch expectation plus pi(A_k|S_k) times the deeper return
                    G = stored_rewards[k % (n + 1)] + gamma * (
                        a_probs + target_policy(s_k)[a_k] * G)

                s_tau = stored_states[tau % (n + 1)]
                a_tau = stored_actions[tau % (n + 1)]
                td_error = G - Q[s_tau][a_tau]
                Q[s_tau][a_tau] += alpha * td_error

    return Q, stats


if __name__ == '__main__':
    Q, stats = n_step_tree_backup(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/n_step_tree_backup/')
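
The backward loop above builds the n-step tree-backup target. One step of it, restated in isolation (argument names are mine):

import numpy as np

def tree_backup_step(G, r_k, pi_sk, q_sk, a_k, gamma):
    """One backward step of the n-step tree-backup return.

    pi_sk, q_sk -- arrays of target-policy probabilities and Q values at S_k
    a_k         -- the action actually taken at S_k
    """
    off_branch = np.sum([pi_sk[a] * q_sk[a]
                         for a in range(len(q_sk)) if a != a_k])
    return r_k + gamma * (off_branch + pi_sk[a_k] * G)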
Example #10
            # take action and observe environment's effects
            next_state, reward, done, _ = env.step(action)
            
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            
            # greedy action w.r.t. Q, used only inside the expectation below
            next_action = np.argmax(Q[next_state])

            # expected SARSA target: expectation of Q under an epsilon-greedy target policy
            td_target = reward + discount_factor * (
                (1 - epsilon) * Q[next_state][next_action] +
                (epsilon / nA) * np.sum([Q[next_state][a] for a in range(nA)]))
            
            # update Q value
            Q[state][action] += alpha * (td_target - Q[state][action])
            
            
            if done: 
                break
            
            state = next_state
    
    return Q, stats


if __name__=='__main__':
    Q, stats = expected_sarsa_off_policy(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/expected_sarsa_off_policy/')
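
The target above is the expectation of Q over an epsilon-greedy target policy: (1 - epsilon) times the greedy value plus (epsilon / nA) times the sum over all actions. A tiny numeric illustration with made-up values:

import numpy as np

Q_next = np.array([0.0, 2.0, 1.0])   # hypothetical Q[next_state]
epsilon, nA = 0.1, 3
reward, discount_factor = 0.0, 1.0

expected_q = (1 - epsilon) * np.max(Q_next) + (epsilon / nA) * np.sum(Q_next)
td_target = reward + discount_factor * expected_q
print(td_target)  # (1 - 0.1) * 2.0 + (0.1 / 3) * 3.0, i.e. about 1.9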