Example No. 1
    def train(self, nb_episode, nb_simulation):
        s = environment.State()
        likelihood_list = np.zeros(nb_episode)
        reward_list = np.zeros(nb_episode)
        if self.model is not None:
            self.model.reset_observation()

        for episode in range(nb_episode):
            old_s1, old_s2 = self.env.reset()
            nb_step = 0
            # old_s1, old_s2 = 0, 0
            done = False
            while not done:
                s, reward, done = self._learn(old_s1, old_s2)
                # print(f'{old_s1}, {old_s2}, {s.s1}, {s.s2}')

                # simulation
                if self.model is not None:
                    for _ in range(nb_simulation):
                        self.simulate()

                real_likelihood = self.env.get_likelihood(
                    s.old_s1, s.old_s2, s.s1, s.s2, s.a)
                l = self.model.likelihood.get_likelihood(s)
                # likelihood_list[episode] += np.abs(real_likelihood - l.detach().numpy())
                likelihood_list[episode] += l.detach().numpy()
                reward_list[episode] += reward
                old_s1, old_s2 = s.s1, s.s2
                nb_step += 1
            likelihood_list[episode] /= nb_step

        return likelihood_list, reward_list
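
A minimal usage sketch for this method; DynaAgent, my_env, and my_model are hypothetical names for the surrounding class and its dependencies, which are not shown in this example.

# Hypothetical driver: DynaAgent, my_env and my_model are stand-ins, not part
# of the original code. nb_simulation controls how many model-based updates
# follow each real environment step.
agent = DynaAgent(env=my_env, model=my_model)
likelihood_per_episode, reward_per_episode = agent.train(nb_episode=500,
                                                         nb_simulation=5)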
Example No. 2
def monte_carlo(iterations=1000000,
                policy=policies.epsilon_greedy,
                n_zero=100):
    """ Performs Monte Carlo control in the Easy21 game.

    :param iterations: number of Monte Carlo iterations
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :return: value function and the plot of the optimal value function
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # number of wins
    wins = 0

    print('Iterations completed:')
    for i in range(iterations):

        if (i % 500000) == 0:
            print(i)

        # create a new random starting state
        state = environment.State()
        # play a round
        observed_keys = []
        while not state.terminal:
            player = state.player_sum
            dealer = state.dealer_first_card

            # find an action defined by the policy
            epsilon = n_zero / float(n_zero + counter_state[(player, dealer)])
            action = policy(epsilon, value_function, state)
            observed_keys.append((player, dealer, action))

            # take a step
            [state, reward] = environment.step(state, action)

        # we have reached an end of episode
        if reward is not None:
            # update over all keys
            for key in observed_keys:
                # update counts
                counter_state[key[:-1]] += 1
                counter_state_action[key] += 1

                # update value function
                alpha = 1.0 / counter_state_action[key]
                value_function[key] += alpha * (reward - value_function[key])

        if reward == 1:
            wins += 1

    print('Wins: %.4f%%' % ((float(wins) / iterations) * 100))
    # plot the optimal value function
    plotting.plot_value_function(value_function)
    return value_function
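
A possible shape for the policies.epsilon_greedy function used above, inferred from how value_function and counter_state are keyed; the 0/1 action encoding is an assumption and the real implementation may differ.

import random

def epsilon_greedy(epsilon, value_function, state):
    """Sketch: with probability epsilon act randomly, otherwise greedily.
    Assumes actions are encoded as 0/1 and value_function is keyed by
    (player_sum, dealer_first_card, action), as in monte_carlo() above."""
    key = (state.player_sum, state.dealer_first_card)
    if random.random() < epsilon:
        return random.choice([0, 1])
    return max((0, 1), key=lambda a: value_function[key + (a,)])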
Example No. 3
def train_with_buffer(env, likelihood, nb_episode):
    s = environment.State()
    nb_total_step = nb_episode * env.max_step
    buffer = []
    l_a2b = np.zeros(nb_total_step)
    l_b2a = np.zeros(nb_total_step)

    # fill buffer
    for episode in range(nb_episode):
        done = False
        old_s1, old_s2 = env.reset()
        while not done:
            a = env.sample_action_uniformly()
            (s1, s2), reward, done, _ = env.step(a, old_s1, old_s2)
            s.set_state(old_s1, old_s2, s1, s2, a)
            buffer.append(copy.copy(s))
            old_s1, old_s2 = s1, s2

    shuffle(buffer)

    # train
    for i in range(nb_total_step):
        s = buffer[i]
        l1, l2 = likelihood.update(s)
        real_likelihood = env.get_likelihood(s.old_s1, s.old_s2, s.s1, s.s2,
                                             s.a)
        l_a2b[i] = np.abs(real_likelihood - l1.detach().numpy())
        l_b2a[i] = np.abs(real_likelihood - l2.detach().numpy())

        # l_a2b[i] = l1.detach().numpy()
        # l_b2a[i] = l2.detach().numpy()

    return l_a2b, l_b2a
Example No. 4
def calc_features_matrix():
    all_features = np.zeros(
        (10, 21, 3, 6))  # dealer card - 1, player's sum - 1, 3x6 features per state
    for i in range(10):
        for j in range(21):
            state = environment.State(i + 1, j + 1)
            all_features[i, j, :, :] = state.get_features()

    return all_features
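
Possible usage, assuming the same environment module as above is importable; the table is indexed by (dealer card - 1, player sum - 1) and yields the 3x6 feature grid for that state.

# Precompute once, then look features up by (dealer card - 1, player sum - 1).
all_features = calc_features_matrix()
print(all_features.shape)        # -> (10, 21, 3, 6)
features = all_features[4, 12]   # 3x6 features for dealer card 5, player sum 13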
Example No. 5
    def _learn(self, old_s1, old_s2):
        s = environment.State()
        a = self.q_learning.sample_action(old_s1, old_s2)
        (s1, s2), reward, done, _ = self.env.step(a, old_s1, old_s2)
        s.set_state(old_s1, old_s2, s1, s2, a)

        self.q_learning.update(s, reward)
        self.model.update(s, reward)
        return s, reward, done
Example No. 6
    def simulate(self):
        s = environment.State()
        old_s1, old_s2, a = self.model.sample_observation()
        s1, s2, reward, confidence_level = self.model.simulate(
            old_s1, old_s2, a, self.env)
        # TODO: remove self.env
        s.set_state(old_s1, old_s2, s1, s2, a)
        if confidence_level > self.confidence_threshold:
            self.q_learning.update(s, reward)
Example No. 7
def q_test() -> bool:
    environment_ = environment.Environment(grid_=data.GRID_1, rng=rng)
    q = StateActionFunction(environment_)

    state_ = environment.State(common.XY(x=4, y=2))
    action_ = environment.Action(common.XY(x=1, y=0))
    print(q[state_, action_])
    q[state_, action_] = 2.0
    q[state_, action_] += 0.5
    print(q[state_, action_])

    return True
Example No. 8
    def plot_q(self):
        (x, y, z) = ([], [], [])
        for d in range(1, 10):
            for p in range(1, 21):
                state = environment.State(
                    environment.Card(environment.COLOR_BLACK, d), p)
                x.append(float(d))
                y.append(float(p))
                value = max(
                    self.evaluate_model(state, environment.ACTION_HIT),
                    self.evaluate_model(state, environment.ACTION_STICK))
                z.append(value)
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) no longer works in Matplotlib >= 3.6
        ax.scatter(np.array(x),
                   np.array(y),
                   np.array(z),
                   linewidth=1,
                   antialiased=False)
        plt.show()
Example No. 9
def environment_test() -> bool:
    environment_ = environment.Environment(grid_=data.GRID_1, rng=rng)

    for state_ in environment_.states():
        print(state_)

    print()

    for action_ in environment_.actions():
        print(action_)

    print()

    state_ = environment.State(common.XY(x=4, y=2))
    action_ = environment.Action(common.XY(x=1, y=0))
    response_ = environment_.from_state_perform_action(state_, action_)
    print(state_, action_)
    print(response_)

    return True
Example No. 10
def train(env, likelihood, nb_episode):
    s = environment.State()
    l_a2b = np.zeros(nb_episode)
    l_b2a = np.zeros(nb_episode)

    for episode in range(nb_episode):
        done = False
        old_s1, old_s2 = env.reset()
        while not done:
            a = env.sample_action_uniformly()
            (s1, s2), reward, done, _ = env.step(a, old_s1, old_s2)
            s.set_state(old_s1, old_s2, s1, s2, a)
            l1, l2 = likelihood.update(s)
            real_likelihood = env.get_likelihood(old_s1, old_s2, s1, s2, a)
            l_a2b[episode] += np.abs(real_likelihood - l1.detach().numpy())
            l_b2a[episode] += np.abs(real_likelihood - l2.detach().numpy())
            old_s1, old_s2 = s1, s2

        # average the per-step errors over the episode
        # (the environment is assumed to run 100 steps per episode)
        l_a2b[episode] /= 100
        l_b2a[episode] /= 100
    return l_a2b, l_b2a
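
One way to inspect how the two likelihood estimates converge, assuming matplotlib is available; env and likelihood are assumed to be constructed elsewhere in the project and are not shown here.

import matplotlib.pyplot as plt

# Sketch of a convergence plot; env and likelihood come from the surrounding
# project and are not constructed in this snippet.
l_a2b, l_b2a = train(env, likelihood, nb_episode=200)
plt.plot(l_a2b, label='model A -> B')
plt.plot(l_b2a, label='model B -> A')
plt.xlabel('episode')
plt.ylabel('mean absolute likelihood error')
plt.legend()
plt.show()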
Example No. 11
    def train_with_buffer(self, nb_episode, nb_simulation, buffer_size=None):
        s = environment.State()
        if self.model is not None:
            self.model.reset_observation()

        if buffer_size is None:
            buffer_size = int(nb_episode / 10)
            print(f'buffer_size: {buffer_size}')
        buffer = self._fill_buffer(buffer_size)

        likelihood_list = np.zeros(nb_episode)
        reward_list = np.zeros(nb_episode)
        episode = 0

        for _ in range(10):
            for i in range(buffer_size):
                s = buffer[i]
                new = np.random.randint(10)
                if new < 1:
                    old_s1, old_s2 = self.env.reset()
                else:
                    old_s1, old_s2 = s.old_s1, s.old_s2
                s, reward, done = self._learn(old_s1, old_s2)
                buffer[i] = s

                if self.model is not None:
                    for _ in range(nb_simulation):
                        self.simulate()

                real_likelihood = self.env.get_likelihood(
                    s.old_s1, s.old_s2, s.s1, s.s2, s.a)
                l = self.model.likelihood.get_likelihood(s)
                likelihood_list[episode] = np.abs(real_likelihood -
                                                  l.detach().numpy())
                reward_list[episode] = reward
                episode += 1
            shuffle(buffer)

        return likelihood_list, reward_list
Example No. 12
    def simulate(self, old_s1, old_s2, a, env):
        self.simulation_total += 1
        prob = np.zeros((self.state_dim * self.state_dim))
        state = environment.State()
        #TODO: keep values for same step...
        for s1 in range(self.state_dim):
            for s2 in range(self.state_dim):
                state.set_state(old_s1, old_s2, s1, s2, a)
                l = self.likelihood.get_likelihood(state).detach().numpy()
                prob[s1 + s2 * self.state_dim] = np.exp(l)

        prob = prob / np.sum(prob)  # normalise into a proper probability distribution
        s = np.random.choice(np.arange(prob.shape[0]), p=prob)
        s1 = s % self.state_dim
        s2 = s // self.state_dim
        confidence_level = np.amax(prob)
        if confidence_level > 0.5:
            print(f'Prob max:{np.amax(prob)}')
            print(f'simulation result: ({s1}, {s2})')
            print(env.step(a, old_s1, old_s2))
            # __import__('ipdb').set_trace()
        reward = self.sample_reward(s1, s2, a)
        return s1, s2, reward, confidence_level
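
A self-contained sketch of the sampling step performed above: exponentiate the per-successor log-likelihoods, normalise them into a distribution, sample one flattened index, and recover (s1, s2). The log-likelihood values here are random stand-ins.

import numpy as np

state_dim = 3
log_lik = np.random.randn(state_dim * state_dim)  # stand-in for the model's log-likelihoods
prob = np.exp(log_lik)
prob = prob / np.sum(prob)                        # normalise into a distribution
s = np.random.choice(np.arange(prob.shape[0]), p=prob)
s1, s2 = s % state_dim, s // state_dim            # unflatten the sampled successor
confidence_level = np.amax(prob)                  # how peaked the model's prediction is
print(s1, s2, confidence_level)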
Example No. 13
def sarsa_episode(lam):
    # reset all eligibility traces
    state_info[:, :, e_hit_index] = 0
    state_info[:, :, e_stick_index] = 0

    # initialize the state S
    dealer_card = random.randint(1, 10)
    player_card = random.randint(1, 10)
    state = environment.State(dealer_card, player_card)
    features = state.get_features()

    # initialize the action A
    action = environment.Action.HIT
    if random.random() < 0.5:
        action = environment.Action.STICK

    # run one episode
    while not state.terminated:
        # take the action A
        state_new = environment.step(state, action)
        reward = state_new.reward
        features_new = state_new.get_features()

        # pick the next action A' by using epsilon greedy
        action_new = None
        if state_new.terminated:
            action_new = environment.Action.NONE

        else:
            if random.random() < epsilon:
                # exploration, pick a random action
                if random.random() < 0.5:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK

            else:
                # pick the action greedily (largest action value)
                v_hit = np.sum(
                    np.multiply(features_new, state_info[:, :, q_hit_index]))
                v_stick = np.sum(
                    np.multiply(features_new, state_info[:, :, q_stick_index]))
                if v_hit > v_stick:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK

        # calculate delta
        if action == environment.Action.HIT:
            q_value = np.sum(
                np.multiply(features, state_info[:, :, q_hit_index]))
        else:
            q_value = np.sum(
                np.multiply(features, state_info[:, :, q_stick_index]))

        if state_new.terminated:
            q_value_new = 0

        else:
            if action_new == environment.Action.HIT:
                q_value_new = np.sum(
                    np.multiply(features_new, state_info[:, :, q_hit_index]))
            else:
                q_value_new = np.sum(
                    np.multiply(features_new, state_info[:, :, q_stick_index]))

        delta = reward + q_value_new - q_value

        # increment eligibility trace
        if action == environment.Action.HIT:
            state_info[:, :, e_hit_index] += features
        else:
            state_info[:, :, e_stick_index] += features

        # update all values
        state_info[:, :, q_hit_index] += alpha * delta * state_info[:, :, e_hit_index]
        state_info[:, :, q_stick_index] += alpha * delta * state_info[:, :, e_stick_index]

        # update all eligibility traces
        state_info[:, :, e_hit_index] = lam * state_info[:, :, e_hit_index]
        state_info[:, :, e_stick_index] = lam * state_info[:, :, e_stick_index]

        # end this step
        state = state_new
        action = action_new
        features = features_new
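
A self-contained toy version of the update above, with made-up 3x6 feature matrices standing in for the real Easy21 features and the discount fixed to 1, as it is implicitly in the code.

import numpy as np

alpha, lam = 0.01, 0.9
q_weights = np.zeros((3, 6))   # linear weights, one per feature
e_trace = np.zeros((3, 6))     # accumulating eligibility trace

features = (np.random.rand(3, 6) < 0.2).astype(float)      # phi(S, A)
features_new = (np.random.rand(3, 6) < 0.2).astype(float)  # phi(S', A')
reward = 1.0

q_value = np.sum(features * q_weights)
q_value_new = np.sum(features_new * q_weights)
delta = reward + q_value_new - q_value  # TD error (gamma = 1)

e_trace += features                     # accumulate trace for the taken action
q_weights += alpha * delta * e_trace    # move all weights along the trace
e_trace *= lam                          # decay the trace before the next step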
Example No. 14
def mc_episode():
    states = []  # holds all states of one episode
    actions = []  # holds all actions of one episode

    # create the initial state
    dealer_card = random.randint(1, 10)
    player_card = random.randint(1, 10)
    state = environment.State(dealer_card, player_card)

    # play one episode
    while not state.terminated:
        states.append(state)

        # define the indices for the state matrix
        dealer_state_index = state.dealer_card - 1
        player_state_index = state.player_sum - 1

        # pick the action
        epsilon = n0 / (
            n0 + state_info[dealer_state_index, player_state_index, ns_index])
        if random.random() < epsilon:
            # exploration, pick a random action
            if random.random() < 0.5:
                action = environment.Action.HIT
            else:
                action = environment.Action.STICK

        else:
            # pick the action greedily (largest action value)
            if state_info[dealer_state_index, player_state_index,
                          q_hit_index] > state_info[dealer_state_index,
                                                    player_state_index,
                                                    q_stick_index]:
                action = environment.Action.HIT
            else:
                action = environment.Action.STICK

        # increment the counts
        state_info[dealer_state_index, player_state_index, ns_index] += 1

        if action == environment.Action.HIT:
            state_info[dealer_state_index, player_state_index,
                       ns_hit_index] += 1

        if action == environment.Action.STICK:
            state_info[dealer_state_index, player_state_index,
                       ns_stick_index] += 1

        # get a new state
        actions.append(action)
        state = environment.step(state, action)

    # update the action values
    for i in range(0, len(states)):
        s = states[i]
        a = actions[i]
        tot_reward = state.reward

        if not s.is_busted:
            dealer_state_index = s.dealer_card - 1
            player_state_index = s.player_sum - 1

            if a == environment.Action.HIT:
                alpha = 1 / state_info[dealer_state_index, player_state_index,
                                       ns_hit_index]
                value = state_info[dealer_state_index, player_state_index,
                                   q_hit_index]
                state_info[dealer_state_index, player_state_index,
                           q_hit_index] += alpha * (tot_reward - value)
            else:
                alpha = 1 / state_info[dealer_state_index, player_state_index,
                                       ns_stick_index]
                value = state_info[dealer_state_index, player_state_index,
                                   q_stick_index]
                state_info[dealer_state_index, player_state_index,
                           q_stick_index] += alpha * (tot_reward - value)
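
The update above is the incremental-mean form of Monte Carlo: with alpha = 1/N(s, a), the rule Q <- Q + alpha * (G - Q) is exactly the sample average of the returns seen for that state-action pair. A tiny standalone check:

returns = [1, -1, 1, 1]          # made-up returns observed for one (s, a) pair
q, n = 0.0, 0
for g in returns:
    n += 1
    q += (1.0 / n) * (g - q)     # same update rule as above
assert abs(q - sum(returns) / len(returns)) < 1e-12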
Example No. 15
def linear_function_approximation(l=0.9,
                                  max_episodes=1000,
                                  policy=policies.epsilon_greedy_lfa,
                                  n_zero=100,
                                  gamma=1,
                                  plot_learning_curve=True,
                                  multiproc=True):
    """ Value function approximation using coarse coding

    :param l: lambda parameter
    :param gamma: discounting rate
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param plot_learning_curve: whether to turn on plotting of learning curve for lambda = 0 and 1
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if running multiple
        algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # weights vector for the state_action feature vector
    theta = np.random.random(36) * 0.2
    # random move probability
    epsilon = 0.05
    # step-size parameter
    alpha = 0.01

    # learning curve plotting
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            with open("Data/MC_value_function.pickle", "rb") as f:
                mc_values = pickle.load(f)
        except (IOError, pickle.UnpicklingError):
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):

        # key is state_action feature vector
        eligibility_trace = np.zeros(36)

        # initial state, action [SA..], and set of features
        state = environment.State()
        # calculate features for the given state
        state_features_current = utilities.get_state_features(state)
        # get action from this state
        q_a_current, action_current = policy(epsilon, theta,
                                             state_features_current)
        # calculate final state, action feature vector
        features_current = utilities.get_state_action_features(
            state_features_current, action_current)

        while not state.terminal:

            # update eligibility trace (accumulating)
            eligibility_trace = np.add(eligibility_trace, features_current)

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            state_features_next = utilities.get_state_features(state)
            q_a_next, action_next = policy(epsilon, theta, state_features_next)
            features_next = utilities.get_state_action_features(
                state_features_next, action_next)

            # calculate state value difference
            delta = reward + gamma * q_a_next - q_a_current
            # update weights
            theta = np.add(theta, alpha * delta * eligibility_trace)
            # update trace
            eligibility_trace *= gamma * l

            features_current = features_next
            action_current = action_next

        # calculate value function
        value_function = defaultdict(float)
        for player in range(1, 22):
            for dealer in range(1, 11):
                for action in [0, 1]:
                    s = environment.State(dealer, player)
                    phi = utilities.get_state_action_features(
                        utilities.get_state_features(s), action)
                    value_function[(s.player_sum, s.dealer_first_card,
                                    action)] = phi.dot(theta)

        # get the episode MSE for plotting learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curves
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve,
                        args=(
                            learning_curve,
                            l,
                        ))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    return value_function
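
A hedged sketch of feature functions that would produce the 36-dimensional state-action vector assumed by theta above; the overlapping dealer/player intervals are the ones commonly used for Easy21 coarse coding, but utilities.get_state_features and get_state_action_features may differ in detail.

import numpy as np

DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]
PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

def state_features(dealer, player):
    """3x6 binary grid: 1 for every overlapping cuboid containing (dealer, player)."""
    return np.array([[float(d_lo <= dealer <= d_hi and p_lo <= player <= p_hi)
                      for (p_lo, p_hi) in PLAYER_INTERVALS]
                     for (d_lo, d_hi) in DEALER_INTERVALS])

def state_action_features(features, action):
    """Stack the 18 state features into one of two action slots -> 36-vector."""
    phi = np.zeros(36)
    phi[action * 18:(action + 1) * 18] = features.ravel()
    return phi

phi = state_action_features(state_features(dealer=5, player=13), action=1)
print(phi.shape, phi.sum())  # (36,) and the number of active cuboids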
Example No. 16
def sarsa_lambda(l=0.9,
                 max_episodes=1000,
                 policy=policies.epsilon_greedy,
                 n_zero=100,
                 gamma=1,
                 plot_learning_curve=True,
                 multiproc=True):
    """ Applies eligibility trace version of Sarsa to the game Easy21

    :param l: lambda parameter
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param gamma: discounting rate
    :param plot_learning_curve: whether to turn on plotting of learning curve for lambda = 0 and 1
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if running multiple
        algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # no. of wins to calculate the percentage of wins at the end
    wins = 0

    # learning curve plotting
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            with open("Data/MC_value_function.pickle", "rb") as f:
                mc_values = pickle.load(f)
        except (IOError, pickle.UnpicklingError):
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):

        # current (player, dealer, action)
        eligibility_trace = defaultdict(float)

        # initial state, action [SA..]
        state = environment.State()
        player_current = state.player_sum
        dealer_current = state.dealer_first_card
        epsilon = n_zero / float(n_zero + counter_state[
            (player_current, dealer_current)])
        action_current = policy(epsilon, value_function, state)

        while not state.terminal:

            # update counts
            counter_state[(player_current, dealer_current)] += 1
            counter_state_action[(player_current, dealer_current,
                                  action_current)] += 1

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            player_next = state.player_sum
            dealer_next = state.dealer_first_card
            epsilon = n_zero / float(n_zero +
                                     counter_state[(player_next, dealer_next)])
            action_next = policy(epsilon, value_function, state)

            delta = reward + gamma * value_function[(player_next, dealer_next, action_next)] - \
                value_function[(player_current, dealer_current, action_current)]

            alpha = 1.0 / counter_state_action[(player_current, dealer_current,
                                                action_current)]

            eligibility_trace[(player_current, dealer_current,
                               action_current)] += 1

            # update the values
            for key in value_function:
                value_function[key] += alpha * delta * eligibility_trace[key]
                eligibility_trace[key] *= gamma * l

            player_current = player_next
            dealer_current = dealer_next
            action_current = action_next

        # use it later to calculate the percentage of wins
        if reward == 1:
            wins += 1

        # get the episode MSE for plotting learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curve
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve,
                        args=(
                            learning_curve,
                            l,
                        ))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    # get the percentage of wins
    print(float(wins) / max_episodes)
    return value_function
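
A possible driver for the function above: sweep lambda from 0 to 1 in steps of 0.1 and keep the resulting value functions (the MSE-against-Monte-Carlo plotting for lambda 0 and 1 happens inside sarsa_lambda itself).

# Sketch of a lambda sweep; parameters other than l keep their defaults.
results = {}
for lam in [x / 10.0 for x in range(11)]:
    results[lam] = sarsa_lambda(l=lam, max_episodes=1000, multiproc=False)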
Example No. 17
def sarsa_episode(lam):
    """
    executes one sarsa episode
    :param lam:     the lambda parameter
    :return:
    """
    # reset all eligibility traces
    state_info[:, :, e_hit_index] = 0
    state_info[:, :, e_stick_index] = 0

    # initialize the state S
    dealer_card = random.randint(1, 10)
    player_card = random.randint(1, 10)
    state = environment.State(dealer_card, player_card)

    # initialize the action A
    action = environment.Action.HIT
    if random.random() < 0.5:
        action = environment.Action.STICK

    # run one episode
    while not state.terminated:
        # define the starting state indices for the state matrix
        dealer_state_index = state.dealer_card - 1
        player_state_index = state.player_sum - 1

        # take the action A
        state_new = environment.step(state, action)
        reward = state_new.reward

        # define the indices of the new state
        dealer_state_index_new = state_new.dealer_card - 1
        player_state_index_new = state_new.player_sum - 1

        # pick the next action A' by using epsilon greedy
        if state_new.terminated:
            action_new = environment.Action.NONE

        else:
            epsilon = n0 / (n0 + state_info[dealer_state_index_new,
                                            player_state_index_new, ns_index])
            if random.random() < epsilon:
                # exploration, pick a random action
                if random.random() < 0.5:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK

            else:
                # pick the action greedily (largest action value)
                if state_info[dealer_state_index_new, player_state_index_new,
                              q_hit_index] > state_info[dealer_state_index_new,
                                                        player_state_index_new,
                                                        q_stick_index]:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK

        # increment the counts
        state_info[dealer_state_index, player_state_index, ns_index] += 1

        if action == environment.Action.HIT:
            state_info[dealer_state_index, player_state_index,
                       ns_hit_index] += 1

        if action == environment.Action.STICK:
            state_info[dealer_state_index, player_state_index,
                       ns_stick_index] += 1

        # calculate delta
        if action == environment.Action.HIT:
            q_value = state_info[dealer_state_index, player_state_index,
                                 q_hit_index]
        else:
            q_value = state_info[dealer_state_index, player_state_index,
                                 q_stick_index]

        if state_new.terminated:
            q_value_new = 0

        else:
            if action_new == environment.Action.HIT:
                q_value_new = state_info[dealer_state_index_new,
                                         player_state_index_new, q_hit_index]
            else:
                q_value_new = state_info[dealer_state_index_new,
                                         player_state_index_new, q_stick_index]

        delta = reward + q_value_new - q_value

        # increment eligibility trace
        alpha = None
        if action == environment.Action.HIT:
            alpha = 1 / state_info[dealer_state_index, player_state_index,
                                   ns_hit_index]
            state_info[dealer_state_index, player_state_index,
                       e_hit_index] += 1
        else:
            alpha = 1 / state_info[dealer_state_index, player_state_index,
                                   ns_stick_index]
            state_info[dealer_state_index, player_state_index,
                       e_stick_index] += 1

        # update all values
        state_info[:, :, q_hit_index] += alpha * delta * state_info[:, :, e_hit_index]
        state_info[:, :, q_stick_index] += alpha * delta * state_info[:, :, e_stick_index]

        # update all eligibility traces
        state_info[:, :, e_hit_index] = lam * state_info[:, :, e_hit_index]
        state_info[:, :, e_stick_index] = lam * state_info[:, :, e_stick_index]

        # end this step
        state = state_new
        action = action_new