def agent_step(reward, state):
    """
    Arguments: reward: floating point, state: (x, y) pair
    Returns: action: integer
    """
    # select an action, based on Q
    global last_x, last_y, last_action, Model, visited, Q

    x, y = state

    Q[last_x, last_y, last_action] += alpha * (
        reward + gamma * np.max(Q[x, y, :]) - Q[last_x, last_y, last_action])

    Model[last_x, last_y, last_action, 0] = x
    Model[last_x, last_y, last_action, 1] = y
    Model[last_x, last_y, last_action, 2] = reward
    visited.append((last_x, last_y, last_action))

    Dyna_Q()

    #action = np.argmax(Q[x, y, :])
    action = np.random.choice(np.where(Q[x, y, :] == Q[x, y, :].max())[0])

    if rand_un() < epsilon:
        action = rand_in_range(4)

    #print state, action

    last_x = x
    last_y = y
    last_action = action
    return last_action
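The step above calls a Dyna_Q() planning routine that is not shown. A minimal sketch of what that routine could look like, assuming a global planning-step count (planning_steps is an assumed name) and the Q, Model and visited structures used above:

def Dyna_Q():
    # Planning sketch (assumption): replay simulated transitions from the
    # learned Model for previously visited (x, y, action) triples, applying
    # the same Q-learning update as the direct step above.
    global Q, Model, visited
    for _ in range(planning_steps):  # planning_steps: assumed to be set at agent init
        px, py, pa = visited[rand_in_range(len(visited))]
        nx = int(Model[px, py, pa, 0])
        ny = int(Model[px, py, pa, 1])
        r = Model[px, py, pa, 2]
        Q[px, py, pa] += alpha * (r + gamma * np.max(Q[nx, ny, :]) - Q[px, py, pa])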
Example #2
def agent_step(reward, state):
    global actions, old_state, old_action, iht, weights, z

    #tile-coding
    scaled_ns1 = 1. * NUM_TILES * (state[0] - POSITION[0]) / (POSITION[1] -
                                                              POSITION[0])
    scaled_ns2 = 1. * NUM_TILES * (state[1] - VELOCITY[0]) / (VELOCITY[1] -
                                                              VELOCITY[0])

    hash_s = old_state
    hash_ns = np.asarray(tiles(iht, NUM_TILINGS, [scaled_ns1, scaled_ns2]))

    #epsilon-greedy
    rand = rand_un()
    if rand < EPSILON:
        n_action = random.choice(actions)
    else:
        n_action = np.argmax(np.sum(weights[:, hash_ns], axis=1))

    #learning and update traces
    q = np.sum(weights[old_action, hash_s])
    nq = np.sum(weights[n_action, hash_ns])

    z[old_action, hash_s] = 1.

    weights += ALPHA * (reward + GAMMA * nq - q) * z
    z *= GAMMA * LAMBDA

    old_state = hash_ns
    old_action = n_action
    return n_action
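This Sarsa(λ) step depends on Sutton's tile-coding software (tiles3) and on globals initialized elsewhere. A minimal sketch of that initialization, assuming the module is available as tiles3.py and treating the sizes and action set below as placeholders:

import numpy as np
from tiles3 import IHT, tiles  # Sutton's tile-coding software, assumed available as tiles3.py

IHT_SIZE = 4096              # assumed hash-table size
NUM_TILINGS = 8              # assumed
NUM_TILES = 8                # assumed number of tiles per dimension
actions = [0, 1, 2]          # e.g. mountain car: reverse, coast, forward

iht = IHT(IHT_SIZE)
weights = np.zeros((len(actions), IHT_SIZE))  # one weight vector per action
z = np.zeros((len(actions), IHT_SIZE))        # eligibility traces, same shape as weights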
Example #3
def agent_step(reward, this_observation):  # returns NumPy array; reward: floating point, this_observation: NumPy array
    global local_action, last_observation, this_action, action_value_estimates, action_counts, time_step, C

    #Update estimate for current action
    cur_action = int(this_action[0])
    action_value_estimates[cur_action] += alpha * (
        reward - action_value_estimates[cur_action])

    #Choose a new action using the parameter based agents
    stp1 = this_observation[0]
    action_selection_prob = rand_un()
    if action_selection_prob <= (1 - epsilon):
        atp1 = action_value_estimates.index(max(action_value_estimates))
    elif episode == EPSILON_GREEDY:
        atp1 = randInRange(numActions)
    else:
        print("BAD EPISODE: NO ACTION SELECTION FOR THE CURRENT AGENT!!!")
        exit()

    action_counts[atp1] += 1
    time_step += 1

    local_action[0] = atp1
    this_action = local_action
    last_observation = this_observation

    return this_action
Example #4
def agent_step(reward, state):
    global cur_state, cur_action, weights, e_trace

    next_state = state

    #Update the weights
    delta = reward
    cur_state_feature_indices = approx_value(cur_state, cur_action, weights)[1]
    for index in cur_state_feature_indices:
        delta = delta - weights[0][index]
        e_trace[0][index] = 1

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - EPSILON:
        actions = [
            approx_value(cur_state, action, weights)[0]
            for action in range(NUM_ACTIONS)
        ]
        next_action = actions.index(max(actions))
    else:
        next_action = rand_in_range(NUM_ACTIONS)

    next_state_feature_indices = approx_value(next_state, next_action,
                                              weights)[1]
    for index in next_state_feature_indices:
        delta = delta + GAMMA * weights[0][index]
    weights += ALPHA * delta * e_trace
    e_trace = GAMMA * LAMBDA * e_trace

    cur_state = next_state
    cur_action = next_action
    return cur_action
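The snippets that call approx_value(state, action, weights) expect it to return a pair: the linear estimate of the action value and the indices of the active binary features. A sketch of such a helper, assuming a tiles3 tile coder with a global iht; NUM_TILES, NUM_TILINGS, SCALE and iht are illustrative assumptions only:

def approx_value(state, action, weights):
    # Hypothetical helper matching the calls above (assumed scaling and globals).
    scaled = [NUM_TILES * s / SCALE for s in state]      # SCALE: assumed per-dimension range
    indices = tiles(iht, NUM_TILINGS, scaled, [action])  # indices of active features
    value = np.sum(weights[0][indices])                  # linear value = sum of active weights
    return value, indices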
Example #5
def agent_step(reward, state):
    """
    Arguments: reward: floating point, state: (x, y) pair
    Returns: action: integer
    """
    global alpha, gamma, actions_permitted, Q, action_list, last_action, last_state

    # select an action, based on Q
    if rand_un() < epsilon:
        action = action_list[rand_in_range(actions_permitted)]
    else:
        action = action_list[np.argmax(Q[int(state[0])][int(state[1])])]

    Q[last_state[0], last_state[1], find_action(last_action)] += alpha * (
        reward
        + gamma * Q[int(state[0]), int(state[1]), find_action(action)]
        - Q[last_state[0], last_state[1], find_action(last_action)])

    last_action = action
    last_state = state

    return action
Example #6
def agent_step(reward, state):
    global q_values, actions, old_info, reward_buffer, gammas
    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    #epsilon greedy
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    #learning
    reward_buffer.append(reward)

    if len(old_info) >= N:
        old_state = old_info[0][0]
        old_action = old_info[0][1]

        q_values[old_state, old_action] += ALPHA * (
            np.sum(gammas * np.asarray(reward_buffer))
            + GAMMA**N * q_values[hash_state, action]
            - q_values[old_state, old_action])

        old_info.pop(0)
        reward_buffer.pop(0)

    old_info.append((hash_state, action))

    return actions[action]
Example #7
def pick_action(arr, epsilon, num_actions):
    arg_max = np.argmax(arr)
    if rand_un() < epsilon:
        action = rand_in_range(num_actions)
    else:
        action = arg_max
    return action
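An illustrative call for this helper, assuming Q holds a table of action values indexed by (x, y) and that EPSILON and NUM_ACTIONS are defined elsewhere:

q_row = Q[state[0], state[1], :]                   # action values for the current state (assumed layout)
action = pick_action(q_row, EPSILON, NUM_ACTIONS)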
Example #8
def agent_step(reward, state):
    """
    Arguments: reward: floating point, state: (x, y) pair
    Returns: action: floating point
    """
    global Q, actions, last_action, last_y, last_x
    # select an action, based on Q

    # s'x and y
    x = state[0]
    y = state[1]

    if rand_un() < epsilon:
        action_index = rand_in_range(num_action)
    else:
        action_index = np.argmax(Q[x][y])  #find best action

    #update last step's Q
    Q[last_x][last_y][last_action] += alpha_step * (
        reward + Q[x][y][action_index] - Q[last_x][last_y][last_action])

    last_x = x
    last_y = y
    last_action = action_index
    action = actions[action_index]

    return action
Example #9
def agent_step(reward, state):
    """
    Arguments: reward: floating point, state: (x, y) pair
    Returns: action: integer
    """
    # select an action, based on Q
    global Q, last_action, S, S_, model, previous_states

    S_ = state

    if (S[0],S[1]) not in previous_states:
        previous_states[(S[0],S[1])] = set()
    previous_states[(S[0],S[1])].add(last_action)

    Q[S[0]][S[1]][last_action] += alpha * (
        reward + gamma * max(Q[S_[0]][S_[1]]) - Q[S[0]][S[1]][last_action])
    model[S[0]][S[1]][last_action] = (reward, S_[0], S_[1])

    for i in range(n):
        S_planning = random.choice(list(previous_states.keys()))
        A_planning = random.sample(previous_states[S_planning], 1)
        reward_planning, x_planning, y_planning = model[S_planning[0]][S_planning[1]][A_planning[0]]
        Q[S_planning[0]][S_planning[1]][A_planning[0]] += alpha * (
            reward_planning + gamma * max(Q[x_planning][y_planning])
            - Q[S_planning[0]][S_planning[1]][A_planning[0]])

    if rand_un() < epsilon:
        action = rand_in_range(4)
    else:
        action = argmax(Q[S[0]][S[1]])

    S = S_
    last_action = action
    
    return action
Example #10
def agent_start(state):
	global Q,last_action,last_y,last_x,model
	"""
	Hint: Initialize the variables that you want to reset before starting a new episode
	Arguments: state: numpy array
	Returns: action: integer list
	"""
	# pick the first action, don't forget about exploring starts
	x = state[0]
	y = state[1]
	if rand_un() < epsilon:
		action_index = rand_in_range(num_action)
	else:
		action_index = np.argmax(Q[x][y]) #find best action
		if  Q[x][y][action_index] == 0:
			action_index = rand_in_range(4)



	last_action = action_index
	last_x = x
	last_y = y

	action = actions[action_index]
	return action
Example #11
def agent_step(reward, state):
    global q_values, actions, trajectory
    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    #epsilon greedy
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    #print state, action

    #learning
    n = len(trajectory)
    for i, (s, a) in enumerate(trajectory):
        #Wn = math.exp(-0.5*(g - (r+self.values[ns]))**2)
        q_values[s, a] += 1. / N[s, a] * (GAMMA**(n - 1 - i)) * (
            reward + GAMMA * q_values[hash_state, action] - q_values[s, a])

    old_state = hash_state
    old_action = action

    #if (hash_state, action) not in trajectory:
    trajectory.append((hash_state, action))
    N[hash_state, action] += 1
    return actions[action]
Example #12
def env_step(action):
    global current_state

    reward = 0.0
    is_terminal = False

    #go left
    if action == 'left':
        left_stop = max(0, current_state[0] - 100)
        left_range = current_state[0] - left_stop
        left_prob = 1. * (100 - (left_range - 1)) / 100
        rand = rand_un()
        if rand <= left_prob:
            current_state[0] = left_stop
        else:
            current_state[0] = random.choice(
                range(left_stop + 1, current_state[0]))
    #go right
    else:
        right_stop = min(1001, current_state[0] + 100)
        right_range = right_stop - current_state[0]
        right_prob = 1. * (100 - (right_range - 1)) / 100
        rand = rand_un()
        if rand <= right_prob:
            current_state[0] = right_stop

        else:
            current_state[0] = random.choice(
                range(current_state[0] + 1, right_stop))

    if current_state[0] == 0:
        is_terminal = True
        current_state = None
        reward = -1.0

    elif current_state[0] == 1001:
        is_terminal = True
        current_state = None
        reward = 1.0

    result = {
        "reward": reward,
        "state": current_state,
        "isTerminal": is_terminal
    }

    return result
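This env_step implements a 1,000-state random walk (non-terminal states 1-1000, terminals at 0 and 1001, jumps of up to 100 states per action). A companion env_start, sketched under the common assumption that every episode begins in the middle state:

def env_start():
    # Sketch: the 1,000-state random walk conventionally starts in state 500.
    global current_state
    current_state = [500]
    return current_state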
def action_select(s):
    if rand_un() < e:
        #explore
        return rand_in_range(4)
    else:
        #decide action based on policy
        return np.random.choice(
            np.where(Q[s[0]][s[1]] == np.amax(Q[s[0]][s[1]]))[0])
def agent_step(reward, state):
    """
    Arguments: reward: should be a constant -1 until episode termination
    Returns: action as 0, 1, or 2
    """
    global old_state, iht, curr_state, last_action, alpha, w, gamma, num_tilings, tile_width, epsilon, z, lambd

    action = None
    curr_state = state
    err = reward

    # unpack the np arrays
    x, xdot = state
    o_x, o_xdot = old_state

    #calculate indices of affected features of curr_state and old_state
    features_old = tiles(iht, num_tilings,
                         [(o_x / (0.5 + 1.2)) * (1 / tile_width),
                          (o_xdot / (0.07 + 0.07)) * (1 / tile_width)],
                         [last_action])  # x(s) components

    for i in features_old:
        err -= w[i] 
        z[i] = 1.0


    ######## ACTION SELECTION ########
    ExploreExploit = rand_un()
    if ExploreExploit < epsilon:
        action = random.randrange(0,3) # Explore
    else:
        ## GREEDY ACTION ##
        max_q = -float('inf')
        for i in range(0, 3):  # number of possible actions
            Xs = tiles(iht, num_tilings,
                       [(x / (0.5 + 1.2)) * (1 / tile_width),
                        (xdot / (0.07 + 0.07)) * (1 / tile_width)],
                       [i])  # x(s') components
            est_q = 0.0
            for j in Xs:
                est_q += w[j]

            if est_q > max_q:
                max_q = est_q
                action  =  i

    features_curr = tiles(iht, num_tilings,
                          [(x / (0.5 + 1.2)) * (1 / tile_width),
                           (xdot / (0.07 + 0.07)) * (1 / tile_width)],
                          [action])  # x(s') components
    for i in features_curr:
        err += gamma * w[i]

    w += alpha * err * z
    z = z * gamma * lambd


    old_state = state
    last_action = action
    

    return action
def agent_step(reward, state):
    global state_action_values, cur_state, cur_action, cur_epsilon

    next_state = state
    #Choose the next action, epsilon greedy style
    if AGENT == TABULAR:
        if rand_un() < 1 - cur_epsilon:
            #Need to ensure that an action is picked uniformly at random from among those that tie for maximum
            cur_max = state_action_values[state[0]][state[1]][0]
            max_indices = [0]
            for i in range(1, len(state_action_values[state[0]][state[1]])):
                if state_action_values[state[0]][state[1]][i] > cur_max:
                    cur_max = state_action_values[state[0]][state[1]][i]
                    max_indices = [i]
                elif state_action_values[state[0]][state[1]][i] == cur_max:
                    max_indices.append(i)
            next_action = max_indices[rand_in_range(len(max_indices))]
        else:
            next_action = rand_in_range(NUM_ACTIONS)

        #Update the state action values
        next_state_max_action = state_action_values[next_state[0]][next_state[1]].index(
            max(state_action_values[next_state[0]][next_state[1]]))
        state_action_values[cur_state[0]][cur_state[1]][cur_action] += ALPHA * (
            reward + GAMMA * state_action_values[next_state[0]][next_state[1]][next_state_max_action]
            - state_action_values[cur_state[0]][cur_state[1]][cur_action])

    elif AGENT == NEURAL:
        #Choose the next action, epsilon greedy style
        if rand_un() < 1 - cur_epsilon:

            #Get the best action over all actions possible in the next state,
            q_vals = model.predict(encode_1_hot(next_state), batch_size=1)
            q_max = np.max(q_vals)
            next_action = np.argmax(q_vals)
            cur_action_target = reward + GAMMA * q_max

            #Get the value for the current state for which the action was just taken
            cur_state_1_hot = encode_1_hot(cur_state)
            q_vals = model.predict(cur_state_1_hot, batch_size=1)
            q_vals[0][cur_action] = cur_action_target
            model.fit(cur_state_1_hot, q_vals, batch_size=1, epochs=1, verbose=0)
        else:
            next_action = rand_in_range(NUM_ACTIONS)

    cur_state = next_state
    cur_action = next_action
    return next_action
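The NEURAL branch above assumes a Keras model and an encode_1_hot helper defined elsewhere. A minimal sketch of both, with the gridworld dimensions, the network shape, and the standalone keras import path all treated as assumptions:

import numpy as np
from keras.models import Sequential
from keras.layers import Dense

GRID_WIDTH, GRID_HEIGHT = 10, 7   # assumed gridworld dimensions
NUM_ACTIONS = 4                   # assumed

def encode_1_hot(state):
    # One-hot encode an (x, y) grid position as a single input row.
    vec = np.zeros((1, GRID_WIDTH * GRID_HEIGHT))
    vec[0, state[1] * GRID_WIDTH + state[0]] = 1.0
    return vec

# One linear Q-value output per action, trained toward the bootstrapped target above.
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=GRID_WIDTH * GRID_HEIGHT))
model.add(Dense(NUM_ACTIONS, activation='linear'))
model.compile(optimizer='adam', loss='mse')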
Example #16
def agent_start(state):
    global state_action_values, cur_state, cur_action

    #Choose the next action, epsilon greedy style
    cur_state = state
    if rand_un() < 1 - EPSILON:
        cur_action = state_action_values[state[0]][state[1]].index(
            max(state_action_values[state[0]][state[1]]))
    else:
        cur_action = rand_in_range(NUM_ACTIONS)

    return cur_action
Example #17
def pick_action(self):
    if self._last_action is None:
        self.pick_first_action()
    # arg_max = self._find_max(self._Q)
    arg_max = np.argmax(self._Q)
    if rand_un() < self._epsilon:
        action = rand_in_range(self._num_actions)
    else:
        action = arg_max
    self._last_action = action
    # print('action is {}'.format(self._last_action))
    return action
Example #18
def agent_step(reward, state): # returns NumPy array, reward: floating point, this_observation: NumPy array
	global Q,last_action,last_y,last_x,model
	"""
	Arguments: reward: floating point, state: integer
	Returns: action: floating point
	"""
	# select an action, based on Q

	# s'x and y 
	x = state[0] 
	y = state[1]



	Q[last_x][last_y][last_action] += alpha_step * (reward+  gamma*np.max(Q[x][y]) - Q[last_x][last_y][last_action])


	modelKey = (last_x,last_y,last_action)
	model[modelKey] = [reward,x,y]

	i = 0
	while i < n:
		i += 1
		chosen = False
		while not chosen:
			modelX = rand_in_range(9)
			modelY = rand_in_range(6)
			modelA = rand_in_range(4)
			if model[(modelX,modelY,modelA)][0] != -1.0:
				chosen = True

		modelNextY = model[(modelX,modelY,modelA)][2]
		modelNextX = model[(modelX,modelY,modelA)][1]
		modelReward = model[(modelX,modelY,modelA)][0]

		Q[modelX][modelY][modelA] += alpha_step * (modelReward +gamma*np.max(Q[modelNextX][modelNextY]) - Q[modelX][modelY][modelA])


	if rand_un() < epsilon:
		action_index = rand_in_range(num_action)
	else:
		action_index = np.argmax(Q[x][y]) #find best action
		if  Q[x][y][action_index] == 0:
			action_index = rand_in_range(4)


	last_x = x
	last_y = y
	last_action = action_index
	action = actions[action_index]


	return action
def agent_step(reward, state):
    """
    Arguments: reward: floating point, state: integer
    Returns: action: integer
    """
    global pre, policy, ph, e

    # update the counters in ph and the learned policy
    if int(state) > int(pre[0]):
        ph[0] += 1
    ph[1] += 1
    policy_create(pre, reward, state)

    # select an action: with probability 1 - e stake a random amount, otherwise follow the policy
    if rand_un() > e:
        action = 1 + int(rand_un() * min(state, 99 - state + 1))
    else:
        action = int(policy[state])
    pre[0] = state
    pre[1] = action
    return action
def agent_start(state):
    """
    Hint: Initialize the variables that you want to reset before starting a new episode
    Arguments: state: numpy array
    Returns: action: integer
    """
    global pre
    # pick the first action, don't forget about exploring starts
    action = 1 + int(rand_un() * min(state, 99 - state + 1))
    pre[0] = state
    pre[1] = action
    return action
Example #21
def agent_step(reward, state):
    global state_action_values, cur_state, cur_action

    next_state = state
    if AGENT == TABULAR:
        #Choose the next action, epsilon greedy style
        cur_state = state
        if rand_un() < 1 - EPSILON:
            next_action = state_action_values[state[0]][state[1]].index(
                max(state_action_values[state[0]][state[1]]))
        else:
            next_action = rand_in_range(NUM_ACTIONS)

        #Update the state action values
        next_state_max_action = state_action_values[next_state[0]][next_state[1]].index(
            max(state_action_values[next_state[0]][next_state[1]]))
        state_action_values[cur_state[0]][cur_state[1]][cur_action] += ALPHA * (
            reward + GAMMA * state_action_values[next_state[0]][next_state[1]][next_state_max_action]
            - state_action_values[cur_state[0]][cur_state[1]][cur_action])

        #print(state_action_values)
    elif AGENT == NEURAL:
        #Choose the next action, epsilon greedy style
        if rand_un() < 1 - EPSILON:

            #Get the best action over all actions possible in the next state,
            q_vals = model.predict(encode_1_hot(next_state), batch_size=1)
            q_max = np.max(q_vals)
            next_action = np.argmax(q_vals)
            cur_action_target = reward + GAMMA * q_max

            #Get the value for the current state for which the action was just taken
            cur_state_1_hot = encode_1_hot(cur_state)
            q_vals = model.predict(cur_state_1_hot, batch_size=1)
            q_vals[0][cur_action] = cur_action_target
            model.fit(cur_state_1_hot, q_vals, batch_size=1, epochs=1, verbose=0)
        else:
            next_action = rand_in_range(NUM_ACTIONS)

    #print(AGENT)
    #print(next_action)
    cur_state = next_state
    cur_action = next_action
    return next_action
def agent_step(reward, state):
    global old_action, old_state
    if rand_un() < e:
        action = rand_in_range(movement)
    else:
        action = np.argmax(Q[state[0]][state[1]])
    Q[old_state[0]][old_state[1]][old_action] += a * (
        reward + Q[state[0]][state[1]][action] -
        Q[old_state[0]][old_state[1]][old_action])
    old_state[0] = state[0]
    old_state[1] = state[1]
    old_action = action
    return action
Example #23
def agent_start(this_observation): # returns NumPy array, this_observation: NumPy array
    global last_action

    local_action = np.zeros(1)
    if rand_un() < 0:  # exploration probability; 0 means always greedy (you may change it to e.g. 0.1)
        local_action[0] = rand_in_range(num_actions)
    else:
        local_action[0] = findGreedyAction()
    last_action = local_action



    return local_action[0]
Example #24
def choose_action(state):
    global Q
    mx = np.amax(Q[state[0], state[1], :])
    arg = np.argwhere(Q[state[0], state[1], :] == mx)
    inds = np.reshape(arg, np.size(arg))
    greedy_action = np.random.choice(inds)

    toss = rand_un()
    if toss > epsilon:
        action = greedy_action
    else:
        action = rand_in_range(NUMBER_OF_ACTIONS)

    return action
Example #25
def agent_start(state):
    global cur_state, cur_action, weights
    cur_state = state

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - EPSILON:
        actions = [
            approx_value(cur_state, action, weights)[0]
            for action in range(NUM_ACTIONS)
        ]
        cur_action = actions.index(max(actions))
    else:
        cur_action = rand_in_range(NUM_ACTIONS)

    return cur_action
Example #26
def agent_step(reward, state):
    global state_action_values, cur_state, cur_action

    next_state = state
    #Choose the next action, epsilon greedy style
    if rand_un() < 1 - EPSILON:
        next_action = state_action_values[state[0]][state[1]].index(
            max(state_action_values[state[0]][state[1]]))
    else:
        next_action = rand_in_range(NUM_ACTIONS)

    #Update the state action value function
    state_action_values[cur_state[0]][cur_state[1]][cur_action] += ALPHA * (
        reward + state_action_values[next_state[0]][next_state[1]][next_action]
        - state_action_values[cur_state[0]][cur_state[1]][cur_action])

    cur_state = next_state
    cur_action = next_action
    return next_action
Example #27
def agent_start(state):
    global actions, old_state, old_action, q_values

    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x  # there are 70 states in total

    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    old_state = hash_state
    old_action = action

    return actions[action]
Example #28
def agent_start(state):
    """
    Hint: Initialize the variables that you want to reset before starting a new episode
    Arguments: state: numpy array
    Returns: action: integer
    """
    global actions_permitted, Q, action_list, last_action, last_state

    # pick the first action, don't forget about exploring starts
    if rand_un() < epsilon:
        action = action_list[rand_in_range(actions_permitted)]
    else:
        action = action_list[np.argmax(Q[state[0]][state[1]])]
    last_action = action
    last_state = state

    return action
Example #29
def agent_start(state):
    global actions, old_info, reward_buffer

    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    old_info = []
    old_info.append((hash_state, action))
    reward_buffer = []

    return actions[action]
Example #30
def env_step(
    this_action
):  # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global local_observation, this_reward_observation  #, nStatesSimpleEnv
    episode_over = False
    the_reward = randn(0.0, 1.0)  # rewards drawn from (0, 1) Gaussian

    atp1 = this_action[0]  # how to extract the action
    stp1 = randInRange(nStatesSimpleEnv)  # state transitions are uniform random
    if rand_un() < 0.05:
        episode_over = True  # termination is random

    local_observation[0] = stp1
    this_reward_observation = (the_reward, this_reward_observation[1],
                               episode_over)

    return this_reward_observation
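All of these snippets follow the same agent_start / agent_step / env_start / env_step interface. A sketch of how an experiment script typically drives one episode through these functions; the dictionary-style env_step return and the agent_end call are assumptions based on the examples above:

def run_episode(max_steps=10000):
    # Drive one episode through the agent/environment interface sketched above.
    state = env_start()
    action = agent_start(state)
    for step in range(max_steps):
        result = env_step(action)            # {"reward": ..., "state": ..., "isTerminal": ...}
        if result["isTerminal"]:
            agent_end(result["reward"])      # agent_end: assumed to exist in the full agent
            return step + 1
        action = agent_step(result["reward"], result["state"])
    return max_steps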