Example #1
def agent_step(
    reward, this_observation
):  # returns NumPy array, reward: floating point, observation_t: NumPy array
    global local_action, last_observation, this_action, action_value_estimates, action_counts, time_step, C

    # Update the value estimate for the action just taken (constant step-size update)
    cur_action = int(this_action[0])
    action_value_estimates[cur_action] += alpha * (
        reward - action_value_estimates[cur_action])

    # Choose the next action according to the current agent's strategy
    stp1 = this_observation[0]
    action_selection_prob = rand_un()
    if action_selection_prob <= (1 - epsilon):
        atp1 = action_value_estimates.index(max(action_value_estimates))
    elif episode == EPSILON_GREEDY:
        atp1 = randInRange(numActions)
    else:
        exit("BAD EPISODE: NO ACTION SELECTION FOR THE CURRENT AGENT!!!")

    action_counts[atp1] += 1
    time_step += 1

    local_action[0] = atp1
    this_action = local_action
    last_observation = this_observation

    return this_action
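This example reads several module-level names (epsilon, alpha, numActions, EPSILON_GREEDY, rand_un, randInRange, and the action/observation buffers) that are not shown on this page. A minimal sketch of what that context might look like, assuming a NumPy-based setup; the concrete values and helper implementations below are guesses, not part of the original code:

import numpy as np

# Hypothetical module-level context assumed by the excerpt above
EPSILON_GREEDY = 0       # episode flag for the epsilon-greedy agent (assumed value)
OPTIMISTIC_INIT = 1      # episode flag for the optimistic-initial-values agent (assumed value)
numActions = 10          # number of bandit arms (assumed)
epsilon = 0.1            # exploration probability (assumed)
alpha = 0.1              # constant step size for the value update (assumed)
time_step = 1

local_action = np.zeros(1)
this_action = np.zeros(1)
last_observation = np.zeros(1)
action_value_estimates = [0.0 for _ in range(numActions)]
action_counts = [0 for _ in range(numActions)]

def rand_un():
    # assumed behaviour: uniform random number in [0, 1)
    return np.random.uniform(0.0, 1.0)

def randInRange(max_value):
    # assumed behaviour: uniform random integer in [0, max_value)
    return np.random.randint(0, max_value)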
Example #2
def agent_start(this_observation
                ):  # returns NumPy array, this_observation: NumPy array
    global local_action, last_observation, this_action, episode, action_value_estimates, action_counts, epsilon, Q1, time_step

    # Set the parameters based on the current agent
    # We use episodes to distinguish between agent parameter settings
    if episode == OPTIMISTIC_INIT:
        epsilon = 0.0
        Q1 = 5
    elif episode == EPSILON_GREEDY:
        epsilon = 0.1
        Q1 = 0
    else:
        exit("BAD EPISODE: NO STRATEGY FOR THE CURRENT AGENT!!!")

    action_value_estimates = [Q1 for action in range(numActions)]
    action_counts = [0 for action in range(numActions)]

    stp1 = this_observation[
        0]  # how you convert observation to a number, if state is tabular
    atp1 = randInRange(numActions)

    action_counts[atp1] += 1
    local_action[0] = atp1

    last_observation = this_observation  # save observation, might be useful on the next step
    this_action = local_action

    time_step = 1
    return this_action
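The agent functions above and the environment functions below are presumably driven by an experiment harness that is not included in these excerpts. A rough sketch of how such a loop might wire them together (run_episode and num_steps are illustrative names, and the initial-observation handling is an assumption):

import numpy as np

def run_episode(num_steps):
    # Hypothetical driver loop: start the agent, then alternate environment
    # and agent steps, accumulating the rewards returned by env_step.
    observation = np.zeros(1)                     # assumed initial observation
    action = agent_start(observation)
    total_reward = 0.0
    for _ in range(num_steps):
        reward, observation, episode_over = env_step(action)
        total_reward += reward
        if episode_over:
            break
        action = agent_step(reward, observation)
    return total_reward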
Example #3
def agent_start(this_observation): # returns NumPy array, this_observation: NumPy array
    global local_action, last_observation, this_action#, numActions

    stp1 = this_observation[0] # how you convert observation to a number, if state is tabular
    atp1 = randInRange(numActions)
    local_action[0] = atp1

    last_observation = this_observation # save observation, might be useful on the next step
    this_action = local_action

    return this_action
Example #4
def agent_step(reward, this_observation): # returns NumPy array, reward: floating point, observation_t: NumPy array
    global local_action, last_observation, this_action#, numActions

    stp1 = this_observation[0]
    atp1 = randInRange(numActions)

    # might do some learning here

    local_action[0] = atp1
    this_action = local_action
    last_observation = this_observation

    return this_action
Example #5
def env_step(this_action): # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global local_observation, this_reward_observation, arms#, nStatesSimpleEnv
    episode_over = False

    atp1 = this_action[0]  # how to extract the action
    stp1 = randInRange(nStatesSimpleEnv)  # state transitions are uniform random
    the_reward = randn(0.0, 1.0) + arms[int(atp1)]  # unit-variance Gaussian noise added to the mean reward of the chosen arm
    #if rand_un() < 0.05:
    #    episode_over = True # termination is random

    local_observation[0] = stp1
    this_reward_observation = (the_reward, this_reward_observation[1], episode_over)

    return this_reward_observation
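env_step above reads globals (arms, local_observation, this_reward_observation, nStatesSimpleEnv) and a Gaussian helper randn that would normally be set up elsewhere, for example in an env_init. A sketch of plausible definitions, clearly an assumption rather than the original environment code:

import numpy as np

nStatesSimpleEnv = 1          # single-state bandit problem (assumed)

def randn(mu, sigma):
    # assumed behaviour of the Gaussian helper used in env_step
    return np.random.normal(mu, sigma)

def env_init(num_arms=10):
    # Hypothetical initialiser: one mean reward per arm, plus the observation
    # buffers that env_step writes into.
    global arms, local_observation, this_reward_observation
    arms = np.random.normal(0.0, 1.0, num_arms)
    local_observation = np.zeros(1)
    this_reward_observation = (0.0, local_observation, False)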
Example #6
def agent_step(
    reward, this_observation
):  # returns NumPy array, reward: floating point, observation_t: NumPy array
    global local_action, last_observation, this_action, action_value_estimates, action_counts, time_step, C

    # Update the value estimate for the action just taken (constant step-size update)
    cur_action = int(this_action[0])
    action_value_estimates[cur_action] += alpha * (
        reward - action_value_estimates[cur_action])

    # Choose the next action according to the current agent's strategy
    stp1 = this_observation[0]
    action_selection_prob = rand_un()

    if episode == EPSILON_GREEDY:
        if action_selection_prob <= (1 - epsilon):
            atp1 = action_value_estimates.index(max(action_value_estimates))
        else:
            atp1 = randInRange(numActions)
    elif episode == UCB:
        # Compute the upper-confidence-bound score for each action,
        # Q(a) + C * sqrt(ln(t) / (N(a) + 1)), and select the action with the
        # highest score
        action_values_UCB = []
        for i in range(len(action_value_estimates)):
            cur_UCB_action_value = action_value_estimates[i] + (
                C * math.sqrt(math.log(time_step) / (action_counts[i] + 1)))
            action_values_UCB.append(cur_UCB_action_value)
        atp1 = action_values_UCB.index(max(action_values_UCB))
    else:
        exit("BAD EPISODE: NO ACTION SELECTION FOR THE CURRENT AGENT!!!")

    action_counts[atp1] += 1
    time_step += 1

    local_action[0] = atp1
    this_action = local_action
    last_observation = this_observation

    return this_action
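The UCB branch above implements the selection rule Q(a) + C * sqrt(ln(t) / (N(a) + 1)). For reference, the same computation can be written as a small self-contained function (the name ucb_action is illustrative and not part of the excerpt):

import math

def ucb_action(action_value_estimates, action_counts, time_step, C):
    # Score each action by its estimate plus an exploration bonus that shrinks
    # as the action is tried more often; the +1 in the denominator mirrors the
    # excerpt above and avoids division by zero for untried actions.
    scores = [
        q + C * math.sqrt(math.log(time_step) / (n + 1))
        for q, n in zip(action_value_estimates, action_counts)
    ]
    return scores.index(max(scores))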
Example #7
def agent_start(this_observation
                ):  # returns NumPy array, this_observation: NumPy array
    global local_action, last_observation, this_action, episode, action_value_estimates, action_counts, epsilon, Q1, time_step

    action_value_estimates = [Q1 for action in range(numActions)]
    action_counts = [0 for action in range(numActions)]

    stp1 = this_observation[
        0]  # how you convert observation to a number, if state is tabular
    atp1 = randInRange(numActions)

    action_counts[atp1] += 1
    local_action[0] = atp1

    last_observation = this_observation  # save observation, might be useful on the next step
    this_action = local_action

    time_step = 1

    return this_action
Example #8
def env_step(
    this_action
):  # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global local_observation, this_reward_observation  #, nStatesSimpleEnv
    episode_over = False

    # Get a reward from the chosen action's reward distribution
    atp1 = int(this_action[0])  # how to extract the action
    the_reward = randn(bandit_action_values[atp1],
                       1.0)  # reward drawn from a Gaussian with mean q*(a) and unit variance

    stp1 = randInRange(
        nStatesSimpleEnv)  # state transitions are uniform random
    #########

    local_observation[0] = stp1
    this_reward_observation = (the_reward, this_reward_observation[1],
                               episode_over)

    return this_reward_observation
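Since bandit_action_values holds the true mean reward of each arm, a common check after a run (not part of these excerpts) is whether the agent's greedy action matches the arm with the highest true value. A minimal sketch, assuming both lists are available:

import numpy as np

def chose_optimal_action(action_value_estimates, bandit_action_values):
    # Hypothetical evaluation helper: compare the agent's greedy choice with
    # the arm whose true mean reward is largest.
    greedy_action = int(np.argmax(action_value_estimates))
    optimal_action = int(np.argmax(bandit_action_values))
    return greedy_action == optimal_action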