Example #1
def sample_from_buffers(buffer_one, buffer_two=None):
    """
    Pick from one of buffer one and buffer two according to the buffer probability parameter.
    Then samples uniformly at random from the chosen buffer.
    """
    #NOTE: Will have to add a check here to force sampling from non empty buffer if I decided to not wait for nonempty buffers of both types in reward/state tasks
    if buffer_two is None or rand_un(
    ) <= a_globs.BUFFER_SAMPLE_BIAS_PROBABILITY:
        cur_observation = buffer_one[rand_in_range(len(buffer_one))]
    else:
        cur_observation = buffer_two[rand_in_range(len(buffer_two))]

    return cur_observation
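This helper leans on rand_un, rand_in_range, and the a_globs.BUFFER_SAMPLE_BIAS_PROBABILITY constant, none of which are shown in the listing. A minimal sketch of what the two random helpers might look like, assuming NumPy; the bodies below are assumptions inferred from the call sites, not the project's own utility module:

import numpy as np

_rng = np.random.default_rng()

def rand_un():
    """Return a float drawn uniformly from [0, 1). (Assumed behaviour.)"""
    return _rng.random()

def rand_in_range(max_value):
    """Return an int drawn uniformly from {0, ..., max_value - 1}. (Assumed behaviour.)"""
    return int(_rng.integers(max_value))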
Example #2
def agent_step(reward, state):

    next_state = state

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        next_action = get_max_action_tabular(next_state)
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    #Update the state action values
    if not a_globs.is_trial_episode:
        next_state_values = a_globs.state_action_values[next_state[0]][
            next_state[1]]
        cur_state_values = a_globs.state_action_values[a_globs.cur_state[0]][
            a_globs.cur_state[1]]
        next_state_max_action = next_state_values.index(max(next_state_values))
        cur_state_values[a_globs.cur_action] += a_globs.ALPHA * (
            reward + a_globs.GAMMA * next_state_values[next_state_max_action] -
            cur_state_values[a_globs.cur_action])

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return next_action
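The update block above is a standard tabular Q-learning update written against the nested a_globs.state_action_values table. For reference, the same rule isolated into a stand-alone function on a plain nested-list table; the function name and the alpha/gamma defaults are illustrative, not the project's settings:

def q_learning_update(q_table, state, action, reward, next_state,
                      alpha=0.1, gamma=0.9):
    """One tabular Q-learning update (illustrative sketch):
    Q(s, a) += alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a)).
    """
    best_next_value = max(q_table[next_state[0]][next_state[1]])
    td_error = (reward + gamma * best_next_value
                - q_table[state[0]][state[1]][action])
    q_table[state[0]][state[1]][action] += alpha * td_error
    return td_error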
Example #3
def agent_start(state):

    a_globs.cur_state = state

    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        a_globs.cur_action = get_max_action_tabular(a_globs.cur_state)
    else:
        a_globs.cur_action = rand_in_range(a_globs.NUM_ACTIONS)
    return a_globs.cur_action
Example #4
def agent_start(state):

    #Context is a sliding window of the previous n states that gets added to the replay buffer used by auxiliary tasks
    a_globs.cur_context = []
    a_globs.cur_context_actions = []
    a_globs.cur_state = state

    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        a_globs.cur_action = get_max_action(a_globs.cur_state)
    else:
        a_globs.cur_action = rand_in_range(a_globs.NUM_ACTIONS)
    return a_globs.cur_action
Example #5
def get_max_action_tabular(state):
    "Return the maximum action to take given the current state."

    #Need to ensure that an action is picked uniformly at random from among those that tie for maximum
    cur_max = a_globs.state_action_values[state[0]][state[1]][0]
    max_indices = [0]
    for i in range(1, len(a_globs.state_action_values[state[0]][state[1]])):
        if a_globs.state_action_values[state[0]][state[1]][i] > cur_max:
            cur_max = a_globs.state_action_values[state[0]][state[1]][i]
            max_indices = [i]
        elif a_globs.state_action_values[state[0]][state[1]][i] == cur_max:
            max_indices.append(i)
    next_action = max_indices[rand_in_range(len(max_indices))]
    return next_action
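The loop above implements argmax with uniform random tie-breaking. An equivalent NumPy idiom, assuming the state's action values are available as a 1-D sequence; the function name is illustrative and not part of the original code:

import numpy as np

def argmax_random_tiebreak(action_values):
    """Return the index of a maximal entry, chosen uniformly among ties."""
    action_values = np.asarray(action_values)
    return int(np.random.choice(np.flatnonzero(action_values == action_values.max())))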
Example #6
def agent_start(state):
    a_globs.cur_state = state

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - a_globs.cur_epsilon:
        actions = [
            approx_value(a_globs.cur_state, action, a_globs.weights)[0]
            for action in range(a_globs.NUM_ACTIONS)
        ]
        a_globs.cur_action = actions.index(max(actions))
    else:
        a_globs.cur_action = rand_in_range(a_globs.NUM_ACTIONS)

    return a_globs.cur_action
Example #7
def agent_step(reward, state):

    next_state = state

    #Update delta and the eligibility trace
    delta = reward
    _, a_globs.cur_state_feature_indices = approx_value(
        a_globs.cur_state, a_globs.cur_action, a_globs.weights)
    for index in a_globs.cur_state_feature_indices:
        delta = delta - a_globs.weights[0][index]
        a_globs.e_trace[0][index] = 1

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - a_globs.cur_epsilon:
        actions = [
            approx_value(a_globs.cur_state, action, a_globs.weights)[0]
            for action in range(a_globs.NUM_ACTIONS)
        ]
        next_action = actions.index(max(actions))
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    #Update the weights
    _, next_state_feature_indices = approx_value(next_state, next_action,
                                                 a_globs.weights)
    for index in next_state_feature_indices:
        delta = delta + a_globs.GAMMA * a_globs.weights[0][index]
    a_globs.weights += (a_globs.ALPHA /
                        a_globs.NUM_TILINGS) * delta * a_globs.e_trace
    a_globs.e_trace = a_globs.GAMMA * a_globs.TRACE * a_globs.e_trace

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return a_globs.cur_action
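This agent is linear Sarsa(lambda) over binary features. approx_value is not shown in the listing; the call sites imply it returns a pair of the approximate action value and the list of active feature indices for a state-action pair. A rough sketch consistent with that interface; the hash-based feature mapping and default sizes are assumptions, and the project presumably uses a tile coder with a_globs.NUM_TILINGS tilings instead:

def approx_value(state, action, weights, num_features=1024, num_active=8):
    """Return (Q(state, action), active_feature_indices) for a binary linear
    approximator. The hash-based feature mapping below is purely illustrative;
    the original presumably derives the indices from tile coding.
    """
    active_indices = [
        hash((tuple(state), action, k)) % num_features for k in range(num_active)
    ]
    value = sum(weights[0][index] for index in active_indices)
    return value, active_indices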
Example #8
def agent_step(reward, state):

    next_state = state

    update_replay_buffer(a_globs.cur_state, a_globs.cur_action, reward,
                         next_state)
    next_state_formatted = format_states([next_state])

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        #Get the best action over all actions possible in the next state, i.e. max_a Q(s', a)
        q_vals = get_q_vals_aux(next_state, False)
        next_action = np.argmax(q_vals)
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    do_auxiliary_learning(a_globs.cur_state, next_state, reward)

    if RL_num_steps() % a_globs.NUM_STEPS_TO_UPDATE == 0:
        update_target_network()

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return next_action
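Every a_globs.NUM_STEPS_TO_UPDATE steps the online network is synced into a separate target network via update_target_network. A plausible one-line sketch for Keras models, assuming a_globs.model and a_globs.target_network share the same architecture; the body is an assumption, not the project's implementation:

def update_target_network():
    """Copy the online network's weights into the target network (assumed wiring)."""
    a_globs.target_network.set_weights(a_globs.model.get_weights())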
Example #9
def agent_step(reward, state):

    return rand_in_range(a_globs.NUM_ACTIONS)
Example #10
def agent_start(state):

    return rand_in_range(a_globs.NUM_ACTIONS)
Example #11
def agent_step(reward, state):

    next_state = state
    next_state_formatted = format_states([next_state])
    if not a_globs.is_trial_episode:
        update_replay_buffer(a_globs.cur_state, a_globs.cur_action, reward,
                             next_state)

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        #Get the best action over all actions possible in the next state, i.e. max_a Q(s', a)
        q_vals = a_globs.model.predict(next_state_formatted, batch_size=1)
        next_action = np.argmax(q_vals)
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    #Get the target value for the update from the target network
    q_vals = a_globs.target_network.predict(next_state_formatted, batch_size=1)
    cur_action_target = reward + a_globs.GAMMA * np.max(q_vals)

    #Get the value in the current state of the action just taken, i.e. Q(S, A),
    #and set the target for that specific action (we need to pass in the
    #whole vector of q-values, since our network takes only the state as input)
    cur_state_formatted = format_states([a_globs.cur_state])
    q_vals = a_globs.model.predict(cur_state_formatted, batch_size=1)

    q_vals[0][a_globs.cur_action] = cur_action_target

    #Check and see if the relevant buffer is non-empty
    if buffers_are_ready(a_globs.buffer_container,
                         a_globs.BUFFER_SIZE) and not a_globs.is_trial_episode:

        if (a_globs.is_trial_episode):
            exit("BAD!")
        buffer_states = [
            observation.states for observation in a_globs.buffer_container[0]
        ]

        #Create the target training batch
        batch_inputs = np.empty(shape=(
            a_globs.BATCH_SIZE,
            a_globs.FEATURE_VECTOR_SIZE,
        ))
        batch_targets = np.empty(shape=(a_globs.BATCH_SIZE,
                                        a_globs.NUM_ACTIONS))

        #Add the current observation to the mini-batch
        batch_inputs[0] = cur_state_formatted
        batch_targets[0] = q_vals

        #Use the replay buffer to learn from previously visited states
        for i in range(1, a_globs.BATCH_SIZE):
            cur_observation = do_buffer_sampling()

            #NOTE: For now, if N > 1 we only use the most recent state associated with the
            #reward and next state (so setting N > 1 currently changes nothing, since we
            #want the same input type as in the regular single-task case)

            most_recent_obs_state = cur_observation.states[-1]
            sampled_state_formatted = format_states([most_recent_obs_state])
            sampled_next_state_formatted = format_states(
                [cur_observation.next_state])

            #Get the maximum action value over the sampled next state, i.e. max_a Q(s', a),
            #from the target network
            q_vals = a_globs.target_network.predict(
                sampled_next_state_formatted, batch_size=1)
            cur_action_target = reward + (a_globs.GAMMA * np.max(q_vals))

            #Get the q_vals to adjust the learning target for the current action taken
            q_vals = a_globs.model.predict(sampled_state_formatted,
                                           batch_size=1)
            q_vals[0][a_globs.cur_action] = cur_action_target

            batch_inputs[i] = sampled_state_formatted
            batch_targets[i] = q_vals

        #Update the weights using the sampled batch
        if not a_globs.is_trial_episode:
            a_globs.model.fit(batch_inputs,
                              batch_targets,
                              batch_size=a_globs.BATCH_SIZE,
                              epochs=1,
                              verbose=0)

    if (RL_num_steps() % a_globs.NUM_STEPS_TO_UPDATE == 0
            and not a_globs.is_trial_episode):
        update_target_network()

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return next_action
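The replay machinery used here (update_replay_buffer, buffers_are_ready, format_states, do_buffer_sampling) is defined elsewhere in the project. One plausible reading is that do_buffer_sampling simply delegates to the sample_from_buffers helper from Example #1 using the agent's buffer container; a sketch under that assumption, with the wiring inferred rather than taken from the original:

def do_buffer_sampling():
    """Draw one stored observation, optionally biased toward the first buffer.

    Assumes a_globs.buffer_container holds one or two replay buffers; this
    delegation to sample_from_buffers is an assumption, not the original code.
    """
    buffer_one = a_globs.buffer_container[0]
    buffer_two = (a_globs.buffer_container[1]
                  if len(a_globs.buffer_container) > 1 else None)
    return sample_from_buffers(buffer_one, buffer_two)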