Example #1
def agent_step(reward, state):

    next_state = state

    #Choose the next action, epsilon greedy style
    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        next_action = get_max_action_tabular(next_state)
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    #Update the state action values
    if not a_globs.is_trial_episode:
        next_state_max_action = a_globs.state_action_values[next_state[0]][
            next_state[1]].index(
                max(a_globs.state_action_values[next_state[0]][next_state[1]]))
        a_globs.state_action_values[a_globs.cur_state[0]][
            a_globs.cur_state[1]][a_globs.cur_action] += a_globs.ALPHA * (
                reward + a_globs.GAMMA * a_globs.state_action_values[
                    next_state[0]][next_state[1]][next_state_max_action] -
                a_globs.state_action_values[a_globs.cur_state[0]][
                    a_globs.cur_state[1]][a_globs.cur_action])

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return next_action
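
Example #1 relies on get_max_action_tabular, which is not shown in this listing. A minimal sketch, assuming a_globs.state_action_values is a nested list indexed by the two state coordinates and then the action (as the update above implies), might look like this:

def get_max_action_tabular(state):
    #Hypothetical sketch of the greedy-action helper used above: return the
    #index of the largest action value for the given (x, y) state.
    action_values = a_globs.state_action_values[state[0]][state[1]]
    return action_values.index(max(action_values))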
Example #2
def agent_start(state):

    a_globs.cur_state = state

    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        a_globs.cur_action = get_max_action_tabular(a_globs.cur_state)
    else:
        a_globs.cur_action = rand_in_range(a_globs.NUM_ACTIONS)
    return a_globs.cur_action
Example #3
def agent_start(state):

    #Context is a sliding window of the previous n states that gets added to
    #the replay buffer used by the auxiliary tasks
    a_globs.cur_context = []
    a_globs.cur_context_actions = []
    a_globs.cur_state = state

    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        a_globs.cur_action = get_max_action(a_globs.cur_state)
    else:
        a_globs.cur_action = rand_in_range(a_globs.NUM_ACTIONS)
    return a_globs.cur_action
Example #4
def sample_from_buffers(buffer_one, buffer_two=None):
    """
    Pick from one of buffer one and buffer two according to the buffer probability parameter.
    Then samples uniformly at random from the chosen buffer.
    """
    #NOTE: Will have to add a check here to force sampling from the non-empty
    #buffer if I decide not to wait for non-empty buffers of both types in the
    #reward/state tasks
    if (buffer_two is None
            or rand_un() <= a_globs.BUFFER_SAMPLE_BIAS_PROBABILITY):
        cur_observation = buffer_one[rand_in_range(len(buffer_one))]
    else:
        cur_observation = buffer_two[rand_in_range(len(buffer_two))]

    return cur_observation
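
A hedged usage sketch of sample_from_buffers, with plain strings standing in for the buffered observation objects (which in the surrounding code come from update_replay_buffer):

#Hypothetical usage; the strings stand in for real observation objects.
reward_buffer = ['obs_a', 'obs_b']
state_buffer = ['obs_c']
both = sample_from_buffers(reward_buffer, state_buffer)  #biased pick between the two buffers
single = sample_from_buffers(reward_buffer)  #buffer_two is None, so buffer_one is always used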
Example #5
def agent_start(state):
    a_globs.cur_state = state

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - a_globs.cur_epsilon:
        actions = [
            approx_value(a_globs.cur_state, action, a_globs.weights)[0]
            for action in range(a_globs.NUM_ACTIONS)
        ]
        a_globs.cur_action = actions.index(max(actions))
    else:
        a_globs.cur_action = rand_in_range(a_globs.NUM_ACTIONS)

    return a_globs.cur_action
Example #6
def agent_step(reward, state):

    next_state = state

    #Update delta and the eligibility trace
    delta = reward
    _, a_globs.cur_state_feature_indices = approx_value(
        a_globs.cur_state, a_globs.cur_action, a_globs.weights)
    for index in a_globs.cur_state_feature_indices:
        delta = delta - a_globs.weights[0][index]
        a_globs.e_trace[0][index] = 1

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - a_globs.cur_epsilon:
        actions = [
            approx_value(a_globs.cur_state, action, a_globs.weights)[0]
            for action in range(a_globs.NUM_ACTIONS)
        ]
        next_action = actions.index(max(actions))
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    #Update the weights
    _, next_state_feature_indices = approx_value(next_state, next_action,
                                                 a_globs.weights)
    for index in next_state_feature_indices:
        delta = delta + a_globs.GAMMA * a_globs.weights[0][index]
    a_globs.weights += (a_globs.ALPHA /
                        a_globs.NUM_TILINGS) * delta * a_globs.e_trace
    a_globs.e_trace = a_globs.GAMMA * a_globs.TRACE * a_globs.e_trace

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return a_globs.cur_action
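
Example #6 is linear Sarsa(λ) with replacing traces over binary tile-coded features. The following self-contained numpy sketch, with made-up sizes and active-feature indices, walks through the same delta, trace, and weight arithmetic:

import numpy as np

#Toy illustration of the replacing-trace Sarsa(lambda) update used above;
#the constants and active tile indices are made up for the example.
ALPHA, GAMMA, TRACE, NUM_TILINGS = 0.1, 1.0, 0.9, 8
weights = np.zeros((1, 16))
e_trace = np.zeros((1, 16))

reward = -1
cur_indices = [0, 3, 7]   #active tiles for (S, A)
next_indices = [1, 3, 9]  #active tiles for (S', A')

delta = reward
for index in cur_indices:
    delta -= weights[0][index]
    e_trace[0][index] = 1  #replacing trace
for index in next_indices:
    delta += GAMMA * weights[0][index]

weights += (ALPHA / NUM_TILINGS) * delta * e_trace
e_trace = GAMMA * TRACE * e_trace  #decay the trace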
Example #7
def agent_step(reward, state):

    next_state = state

    update_replay_buffer(a_globs.cur_state, a_globs.cur_action, reward,
                         next_state)
    next_state_formatted = format_states([next_state])

    #Choose the next action, epsilon greedy style
    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        #Get the best action over all actions possible in the next state, i.e. max_a Q(s', a)
        q_vals = get_q_vals_aux(next_state, False)
        next_action = np.argmax(q_vals)
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    do_auxiliary_learning(a_globs.cur_state, next_state, reward)

    if RL_num_steps() % a_globs.NUM_STEPS_TO_UPDATE == 0:
        update_target_network()

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return next_action
Example #8
def do_auxiliary_learning(cur_state, next_state, reward):
    "Update the weights for the auxiliary network based on both the current interaction with the environment and sampling from experience replay"

    #Perform direct learning on the current state and auxiliary information
    q_vals = get_q_vals_aux(cur_state, False)
    if next_state:
        #Get the best action over all actions possible in the next state, i.e. max_a Q(s', a)
        q_vals_next = get_q_vals_aux(next_state, True)
        cur_action_target = reward + (a_globs.GAMMA * np.max(q_vals_next))
        q_vals[0][a_globs.cur_action] = cur_action_target

    else:
        q_vals[0][a_globs.cur_action] = reward

    if a_globs.AGENT == a_globs.REWARD:
        #We make the rewards positive since we care only about the binary
        #distinction between zero and non zero rewards and theano binary
        #cross entropy loss requires targets to be 0 or 1
        aux_target = np.array([[reward]])
    elif a_globs.AGENT == a_globs.STATE:
        if next_state:
            aux_target = format_states([next_state])
        else:
            aux_target = np.zeros(shape=(
                1,
                a_globs.FEATURE_VECTOR_SIZE,
            ))
    elif a_globs.AGENT == a_globs.NOISE:
        aux_target = np.array([
            rand_un() for i in range(a_globs.NUM_NOISE_NODES)
        ]).reshape(1, a_globs.NUM_NOISE_NODES)
    elif a_globs.AGENT == a_globs.REDUNDANT:
        nested_target = [q_vals for i in range(a_globs.NUM_REDUNDANT_TASKS)]
        aux_target = np.array([
            item for sublist in nested_target for item in sublist
        ]).reshape(1, a_globs.NUM_ACTIONS * a_globs.NUM_REDUNDANT_TASKS)

    cur_state_formatted = format_states([cur_state])

    #Check and see if the relevant buffer is non-empty
    if buffers_are_ready(a_globs.buffer_container,
                         a_globs.BUFFER_SIZE) and not a_globs.is_trial_episode:

        #Create the target training batch
        batch_inputs = np.empty(shape=(
            a_globs.BATCH_SIZE,
            a_globs.FEATURE_VECTOR_SIZE,
        ))
        batch_targets = np.empty(shape=(a_globs.BATCH_SIZE,
                                        a_globs.NUM_ACTIONS))
        batch_aux_targets = np.empty(shape=(a_globs.BATCH_SIZE,
                                            aux_target.shape[1]))

        #Add the current observation to the mini-batch
        batch_inputs[0] = cur_state_formatted
        batch_targets[0] = q_vals
        batch_aux_targets[0] = aux_target[0]

        #Use the replay buffer to learn from previously visited states
        for i in range(1, a_globs.BATCH_SIZE):
            cur_observation = do_buffer_sampling()

            #NOTE: For now, if N > 1 we only use the most recent state
            #associated with the reward and next state (so setting N > 1
            #changes nothing right now, since we want the same input type as
            #in the regular single-task case)
            most_recent_obs_state = cur_observation.states[-1]
            sampled_state_formatted = format_states([most_recent_obs_state])
            sampled_next_state_formatted = format_states(
                [cur_observation.next_state])

            #Get the best action over all actions possible in the next state, i.e. max_a Q(s', a)
            q_vals = get_q_vals_aux(cur_observation.next_state, True)
            #Use the sampled transition's reward when computing the replay target
            cur_action_target = cur_observation.reward + (
                a_globs.GAMMA * np.max(q_vals))

            #Get the value for the current state of the action which was just
            #taken, i.e. Q(S, A), and set the target for the specific action
            #taken (we need to pass in the whole vector of q_values, since our
            #network takes only the state as input)
            q_vals = get_q_vals_aux(most_recent_obs_state, False)
            q_vals[0][a_globs.cur_action] = cur_action_target

            if a_globs.AGENT == a_globs.REWARD:
                #We make the rewards positive since we care only about the binary
                #distinction between zero and non zero rewards and theano binary
                #cross entropy loss requires targets to be 0 or 1
                aux_target = np.array([[cur_observation.reward]])
            elif a_globs.AGENT == a_globs.STATE:
                aux_target = format_states([cur_observation.next_state])
            elif a_globs.AGENT == a_globs.NOISE:
                aux_target = np.array([
                    rand_un() for _ in range(a_globs.NUM_NOISE_NODES)
                ]).reshape(1, a_globs.NUM_NOISE_NODES)
            elif a_globs.AGENT == a_globs.REDUNDANT:
                nested_target = [
                    q_vals for _ in range(a_globs.NUM_REDUNDANT_TASKS)
                ]
                aux_target = np.array([
                    item for sublist in nested_target for item in sublist
                ]).reshape(1,
                           a_globs.NUM_ACTIONS * a_globs.NUM_REDUNDANT_TASKS)

            batch_inputs[i] = sampled_state_formatted
            batch_targets[i] = q_vals
            batch_aux_targets[i] = aux_target[0]

        #Update the weights using the sampled batch
        a_globs.model.fit(batch_inputs, [batch_targets, batch_aux_targets],
                          batch_size=a_globs.BATCH_SIZE,
                          epochs=1,
                          verbose=0)
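
The observation objects returned by do_buffer_sampling are not defined in these examples. Judging from the fields read above (states, reward, next_state), a minimal hypothetical stand-in could be a namedtuple:

from collections import namedtuple

#Hypothetical stand-in for a replay-buffer entry; only the fields read by
#do_auxiliary_learning (states, reward, next_state) are included.
Observation = namedtuple('Observation', ['states', 'reward', 'next_state'])

example = Observation(states=[[0, 1], [0, 2]], reward=-1, next_state=[1, 2])
most_recent_obs_state = example.states[-1]  #[0, 2], as taken in the batch loop above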
Example #9
def agent_step(reward, state):

    next_state = state
    next_state_formatted = format_states([next_state])
    if not a_globs.is_trial_episode:
        update_replay_buffer(a_globs.cur_state, a_globs.cur_action, reward,
                             next_state)

    #Choose the next action, epsilon greedy style
    if rand_un() < 1 - a_globs.cur_epsilon or a_globs.is_trial_episode:
        #Get the best action over all actions possible in the next state, i.e. max_a Q(s', a)
        q_vals = a_globs.model.predict(next_state_formatted, batch_size=1)
        next_action = np.argmax(q_vals)
    else:
        next_action = rand_in_range(a_globs.NUM_ACTIONS)

    #Get the target value for the update from the target network
    q_vals = a_globs.target_network.predict(next_state_formatted, batch_size=1)
    cur_action_target = reward + a_globs.GAMMA * np.max(q_vals)

    #Get the value for the current state of the action which was just taken,
    #i.e. Q(S, A), and set the target for the specific action taken (we need
    #to pass in the whole vector of q_values, since our network takes only
    #the state as input)
    cur_state_formatted = format_states([a_globs.cur_state])
    q_vals = a_globs.model.predict(cur_state_formatted, batch_size=1)

    q_vals[0][a_globs.cur_action] = cur_action_target

    #Check and see if the relevant buffer is non-empty
    if buffers_are_ready(a_globs.buffer_container,
                         a_globs.BUFFER_SIZE) and not a_globs.is_trial_episode:

        #Create the target training batch
        batch_inputs = np.empty(shape=(
            a_globs.BATCH_SIZE,
            a_globs.FEATURE_VECTOR_SIZE,
        ))
        batch_targets = np.empty(shape=(a_globs.BATCH_SIZE,
                                        a_globs.NUM_ACTIONS))

        #Add the current observation to the mini-batch
        batch_inputs[0] = cur_state_formatted
        batch_targets[0] = q_vals

        #Use the replay buffer to learn from previously visited states
        for i in range(1, a_globs.BATCH_SIZE):
            cur_observation = do_buffer_sampling()

            #NOTE: For now, if N > 1 we only use the most recent state
            #associated with the reward and next state (so setting N > 1
            #changes nothing right now, since we want the same input type as
            #in the regular single-task case)

            most_recent_obs_state = cur_observation.states[-1]
            sampled_state_formatted = format_states([most_recent_obs_state])
            sampled_next_state_formatted = format_states(
                [cur_observation.next_state])

            #Get the best action over all actions possible in the next state, i.e. max_a Q(s', a)
            q_vals = a_globs.target_network.predict(
                sampled_next_state_formatted, batch_size=1)
            #Use the sampled transition's reward when computing the replay target
            cur_action_target = cur_observation.reward + (
                a_globs.GAMMA * np.max(q_vals))

            #Get the q_vals to adjust the learning target for the current action taken
            q_vals = a_globs.model.predict(sampled_state_formatted,
                                           batch_size=1)
            q_vals[0][a_globs.cur_action] = cur_action_target

            batch_inputs[i] = sampled_state_formatted
            batch_targets[i] = q_vals

        #Update the weights using the sampled batch
        if not a_globs.is_trial_episode:
            a_globs.model.fit(batch_inputs,
                              batch_targets,
                              batch_size=a_globs.BATCH_SIZE,
                              epochs=1,
                              verbose=0)

    if (RL_num_steps() % a_globs.NUM_STEPS_TO_UPDATE == 0
            and not a_globs.is_trial_episode):
        update_target_network()

    a_globs.cur_state = next_state
    a_globs.cur_action = next_action
    return next_action
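
update_target_network is referenced in Examples #7 and #9 but not shown. Assuming a_globs.model and a_globs.target_network are Keras models with the same architecture, a minimal sketch is a plain weight copy:

def update_target_network():
    #Hypothetical sketch: copy the online network's weights into the target
    #network, which then stays fixed until the next scheduled update.
    a_globs.target_network.set_weights(a_globs.model.get_weights())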