Example No. 1
def agent_start(state):
    """
    Hint: Initialize the variables that you want to reset before starting a new episode
    Arguments: state: numpy array
    Returns: action: integer
    """
    global Q, last_action, epsilon, last_state

    select_option = np.array([0, 1])
    option = np.random.choice(select_option, p=[epsilon, 1 - epsilon])
    x = state[0]
    y = state[1]

    if option == 0:
        action_num = rand_in_range(4)

    else:
        action_num = np.argmax(Q[y][x])
        if Q[y][x][action_num] == 0:
            action_num = rand_in_range(4)

    last_action[0] = action_num
    action = last_action[0]
    last_state = state

    return action
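
Most of these examples call helper functions such as rand_in_range and rand_un that are defined elsewhere in the surrounding utility code. A minimal sketch of what they are assumed to do here (a uniform integer in [0, n) and a uniform float in [0, 1)), not necessarily the original implementations:

import numpy as np

def rand_in_range(max_value):
    # Assumed behaviour: uniform random integer in [0, max_value).
    return np.random.randint(max_value)

def rand_un():
    # Assumed behaviour: uniform random float in [0, 1).
    return np.random.uniform()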
Example No. 2
def agent_start(state):
	global Q,last_action,last_y,last_x,model
	"""
	Hint: Initialize the variables that you want to reset before starting a new episode
	Arguments: state: numpy array
	Returns: action: integer list
	"""
	# pick the first action, don't forget about exploring starts
	x = state[0]
	y = state[1]
	if rand_un() < epsilon:
		action_index = rand_in_range(num_action)
	else:
		action_index = np.argmax(Q[x][y]) #find best action
		if  Q[x][y][action_index] == 0:
			action_index = rand_in_range(4)



	last_action = action_index
	last_x = x
	last_y = y

	action = actions[action_index]
	return action
Example No. 3
def agent_step(
    reward, this_observation
):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    global last_action
    #the action at this time step
    taken_action = int(this_observation[0])
    # optimistic initial values are in use when op_values is non-zero
    if np.sum(op_values) != 0:
        op_values[taken_action] = op_values[taken_action] + (
            reward - op_values[taken_action]) / 10.0
        # might do some learning here
        last_action[0] = np.argmax(op_values)

        return last_action
    # update the estimate of the action value
    estimate_values[taken_action] = estimate_values[taken_action] + (
        reward - estimate_values[taken_action]) / 10.0
    # might do some learning here
    current_op_action = np.argmax(estimate_values)
    epsilon = rand_in_range(10)
    if epsilon == 0:
        last_action[0] = rand_in_range(num_actions)
    else:
        last_action[0] = current_op_action

    return last_action
Example No. 4
def agent_start(this_observation
                ):  # returns NumPy array, this_observation: NumPy array
    global last_action
    last_action[0] = rand_in_range(num_actions)

    local_action = np.zeros(1, dtype='int32')
    local_action[0] = rand_in_range(num_actions)

    return local_action
Example No. 5
def agent_step(
    reward, state
):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """

    global Q, last_action, epsilon, alpha, last_state, gamma, pre_obs_state, pre_obs_action

    select_option = np.array([0, 1])
    option = np.random.choice(select_option, p=[epsilon, 1 - epsilon])
    x = state[0]
    y = state[1]

    if option == 0:
        action = rand_in_range(4)  # change this to 9 to rand in 9 actions

    else:
        action = np.argmax(Q[y][x])
        if Q[y][x][action] == 0:
            action = rand_in_range(4)

    state_key = tuple(state)
    if state_key not in pre_obs_state:
        pre_obs_state.append(state_key)
        pre_obs_action[state_key] = []
    if action not in pre_obs_action[state_key]:
        pre_obs_action[state_key].append(action)

    Q[last_state[1]][last_state[0]][last_action] += alpha * (
        reward + gamma * np.max(Q[y][x]) -
        Q[last_state[1]][last_state[0]][last_action])

    model[last_state[1]][last_state[0]][last_action] = [x, y, reward]

    for i in range(n):
        rand_index = rand_in_range(len(pre_obs_state))
        S_x = pre_obs_state[rand_index][0]
        S_y = pre_obs_state[rand_index][1]

        index = rand_in_range(len(pre_obs_action[(S_x, S_y)]))
        rand_action = pre_obs_action[(S_x, S_y)][index]

        next_state = [
            model[S_y][S_x][rand_action][0], model[S_y][S_x][rand_action][1]
        ]
        Rwd = model[S_y][S_x][rand_action][2]

        Q[S_y][S_x][rand_action] += alpha * (
            Rwd + gamma * np.max(Q[next_state[1]][next_state[0]]) -
            Q[S_y][S_x][rand_action])

    last_action = action
    last_state = state

    return action
Example No. 6
def sample_from_buffers(buffer_one, buffer_two=None):
    """
    Sample a transition uniformly at random from one of buffer_one and buffer_two.
    Which buffer is sampled depends on the current time step, chosen so as to
    sample equally from both buffers throughout an episode.
    """
    if RL_num_steps() % 2 == 0 or buffer_two is None:
        cur_transition = buffer_one[rand_in_range(len(buffer_one))]
    else:
        cur_transition = buffer_two[rand_in_range(len(buffer_two))]
    return cur_transition
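
A short usage sketch, assuming each buffer is a list of (state, action, reward, next_state) tuples and that RL_num_steps() reports the current step count within the episode:

# Hypothetical replay buffers of (state, action, reward, next_state) transitions.
buffer_one = [((0, 0), 1, 0.0, (0, 1)), ((0, 1), 2, 1.0, (1, 1))]
buffer_two = [((3, 2), 0, -1.0, (3, 1))]

state, action, reward, next_state = sample_from_buffers(buffer_one, buffer_two)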
Example No. 7
def agent_start(this_observation
                ):  # returns NumPy array, this_observation: NumPy array
    global last_action

    last_action[0] = rand_in_range(num_actions)

    local_action = np.zeros(1)
    local_action[0] = rand_in_range(num_actions)

    # return local_action[0]
    return agent.pick_action()
Example No. 8
def agent_step(reward, state): # returns NumPy array, reward: floating point, this_observation: NumPy array
	global Q,last_action,last_y,last_x,model
	"""
	Arguments: reward: floating point, state: numpy array
	Returns: action: integer list
	"""
	# select an action, based on Q

	# S' coordinates: x and y
	x = state[0] 
	y = state[1]



	Q[last_x][last_y][last_action] += alpha_step * (reward+  gamma*np.max(Q[x][y]) - Q[last_x][last_y][last_action])


	modelKey = (last_x,last_y,last_action)
	model[modelKey] = [reward,x,y]

	i = 0
	while i < n:
		i += 1
		chosen = False
		while not chosen:
			modelX = rand_in_range(9)
			modelY = rand_in_range(6)
			modelA = rand_in_range(4)
			if model[(modelX,modelY,modelA)][0] != -1.0:
				chosen = True

		modelNextY = model[(modelX,modelY,modelA)][2]
		modelNextX = model[(modelX,modelY,modelA)][1]
		modelReward = model[(modelX,modelY,modelA)][0]

		Q[modelX][modelY][modelA] += alpha_step * (modelReward +gamma*np.max(Q[modelNextX][modelNextY]) - Q[modelX][modelY][modelA])


	if rand_un() < epsilon:
		action_index = rand_in_range(num_action)
	else:
		action_index = np.argmax(Q[x][y]) #find best action
		if  Q[x][y][action_index] == 0:
			action_index = rand_in_range(4)


	last_x = x
	last_y = y
	last_action = action_index
	action = actions[action_index]


	return action
Example No. 9
def epsilon_greedy(state1, Q1):
    global action
    # rand_in_range(10) > 0 means the agent goes greedy 90% of the time
    if rand_in_range(10) > 0:
        #find the action with the largest estimated Q value
        action_index = choose_random_largest(Q1[state1[0]][state1[1]])
    else:
        # 10% of the time the agent chooses an action at random

        action_index = rand_in_range(len(action))

    return action_index
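
This example relies on a choose_random_largest helper that is not shown; presumably it returns the index of a maximal Q value with ties broken at random. A minimal sketch using the same np.where pattern that Examples No. 11 and No. 25 use:

import numpy as np

def choose_random_largest(q_values):
    # Index of a maximal entry, with ties broken uniformly at random.
    q_values = np.asarray(q_values)
    best_indices = np.where(q_values == q_values.max())[0]
    return np.random.choice(best_indices)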
Example No. 10
def agent_step(
    reward, state
):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """

    global Q, last_action, epsilon, alpha, last_state, gamma, n

    select_option = np.array([0, 1])
    option = np.random.choice(select_option, p=[epsilon, 1 - epsilon])
    current_x = state[0]
    current_y = state[1]

    if option == 0:
        action_num = rand_in_range(4)  # change this to 9 to rand in 9 actions

    else:
        action_num = np.argmax(Q[current_y][current_x])
        if Q[current_y][current_x][action_num] == 0:
            action_num = rand_in_range(4)

    Q[last_state[1]][last_state[0]][last_action[0]] += alpha * (
        reward + gamma * np.max(Q[current_y][current_x]) -
        Q[last_state[1]][last_state[0]][last_action[0]])

    model[(last_state[1],
           last_state[0])][last_action[0]] = [current_x, current_y, reward]

    for i in range(n):
        exist = False
        while not exist:
            model_x = rand_in_range(9)
            model_y = rand_in_range(6)
            model_action = rand_in_range(4)
            if model[(model_y, model_x)][model_action][2] != -1:
                exist = True

        S_x = model[(model_y, model_x)][model_action][0]
        S_y = model[(model_y, model_x)][model_action][1]
        Rwd = model[(model_y, model_x)][model_action][2]

        Q[model_y][model_x][model_action] += alpha * (
            Rwd + gamma * np.max(Q[S_y][S_x]) -
            Q[model_y][model_x][model_action])

    last_action[0] = action_num
    last_state = state
    action = last_action[0]

    return action
Example No. 11
def agent_step(
    reward, state
):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    # select an action, based on Q
    global last_x, last_y, last_action, Model, visited, Q

    x, y = state

    Q[last_x, last_y,
      last_action] += alpha * (reward + gamma * np.max(Q[x, y, :]) -
                               Q[last_x, last_y, last_action])

    Model[last_x, last_y, last_action, 0] = x
    Model[last_x, last_y, last_action, 1] = y
    Model[last_x, last_y, last_action, 2] = reward
    visited.append((last_x, last_y, last_action))

    Dyna_Q()

    #action = np.argmax(Q[x, y, :])
    action = np.random.choice(np.where(Q[x, y, :] == Q[x, y, :].max())[0])

    if rand_un() < epsilon:
        action = rand_in_range(4)

    #print state, action

    last_x = x
    last_y = y
    last_action = action
    return last_action
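
The Dyna_Q() planning routine called above is not shown. A minimal sketch of such a planning step, reusing this example's globals (Q, Model, visited) and assuming a planning-step count n and the same alpha and gamma used for the direct update:

def Dyna_Q():
    # Replay n previously visited (x, y, action) triples from the learned model.
    for _ in range(n):
        sx, sy, a = visited[rand_in_range(len(visited))]
        next_x = int(Model[sx, sy, a, 0])
        next_y = int(Model[sx, sy, a, 1])
        r = Model[sx, sy, a, 2]
        Q[sx, sy, a] += alpha * (r + gamma * np.max(Q[next_x, next_y, :]) -
                                 Q[sx, sy, a])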
Example No. 12
def agent_step(reward, state):
    global old_action, old_state, Q, a
    # choose an action epsilon-greedily
    action = action_select(state)
    old0, old1 = old_state[0], old_state[1]
    gamma_Q = g * np.amax(Q[state[0]][state[1]])
    #updating Q
    Q[old0][old1][old_action] += a * (reward + gamma_Q -
                                      Q[old0][old1][old_action])
    #updating the model
    model[old0][old1][old_action] = np.array([state[0], state[1], int(reward)])
    if n != 0:
        for i in range(n):
            rs = visited[rand_in_range(len(visited))]
            xyr = model[rs[0][0]][rs[0][1]][rs[1]]
            Q[rs[0][0]][rs[0][1]][rs[1]] += a * (xyr[2] + (
                g * np.amax(Q[xyr[0]][xyr[1]])) - Q[rs[0][0]][rs[0][1]][rs[1]])
    #updating old action and state
    old_action = action
    old_state[0] = state[0]
    old_state[1] = state[1]
    #updating visited if not already visited
    visit = [[state[0], state[1]], action]
    if visit not in visited:
        visited.append(visit)
    # update old action and state
    return action
Example No. 13
def agent_step(
    reward, state
):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    global Q, last_action, last_state, total_actions
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    # select an action, based on Q

    select_option = np.array([0, 1])
    option = np.random.choice(select_option, p=[epsilon, 1 - epsilon])

    current_x = state[0]
    current_y = state[1]
    last_x = last_state[0]
    last_y = last_state[1]
    if option == 0:
        action = rand_in_range(total_actions)
    else:
        action = np.argmax(Q[state[0]][state[1]])

    Q[last_x][last_y][last_action] += alpha_step * (
        reward + Q[current_x][current_y][action] -
        Q[last_x][last_y][last_action])

    last_action = action
    last_x = current_x
    last_y = current_y
    last_state = [last_x, last_y]
    return action
Example No. 14
def agent_step(reward, state): # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    # select an action, based on Q
    global Q, last_action, S, S_, model, previous_states

    S_ = state

    if (S[0],S[1]) not in previous_states:
        previous_states[(S[0],S[1])] = set()
    previous_states[(S[0],S[1])].add(last_action)

    Q[S[0]][S[1]][last_action] += alpha * (reward + gamma * max(Q[S_[0]][S_[1]]) - Q[S[0]][S[1]][last_action])
    model[S[0]][S[1]][last_action] = (reward, S_[0], S_[1])

    for i in range(n):
        S_planning = random.choice(list(previous_states.keys()))
        A_planning = random.sample(previous_states[S_planning], 1)
        reward_planning, x_planning, y_planning = model[S_planning[0]][S_planning[1]][A_planning[0]]
        Q[S_planning[0]][S_planning[1]][A_planning[0]] += alpha * (reward_planning + gamma * max(Q[x_planning][y_planning]) -  Q[S_planning[0]][S_planning[1]][A_planning[0]])

    if rand_un() < epsilon:
        action = rand_in_range(4)
    else:
        action = argmax(Q[S[0]][S[1]])

    S = S_
    last_action = action
    
    return action
Example No. 15
def pick_action(arr, epsilon, num_actions):
    arg_max = np.argmax(arr)
    if rand_un() < epsilon:
        action = rand_in_range(num_actions)
    else:
        action = arg_max
    return action
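
A brief usage sketch of pick_action, assuming a NumPy row of four action values and the rand_un/rand_in_range helpers sketched earlier:

import numpy as np

q_row = np.array([0.0, 0.5, 0.2, 0.1])  # hypothetical action values for one state
action = pick_action(q_row, epsilon=0.1, num_actions=4)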
Example No. 16
def agent_step(reward, state):
    global cur_state, cur_action, weights, e_trace

    next_state = state

    #Update the weights
    delta = reward
    cur_state_feature_indices = approx_value(cur_state, cur_action, weights)[1]
    for index in cur_state_feature_indices:
        delta = delta - weights[0][index]
        e_trace[0][index] = 1

    #Choose the next action, epsilon-greedy style
    if rand_un() < 1 - EPSILON:
        actions = [
            approx_value(cur_state, action, weights)[0]
            for action in range(NUM_ACTIONS)
        ]
        next_action = actions.index(max(actions))
    else:
        next_action = rand_in_range(NUM_ACTIONS)

    next_state_feature_indices = approx_value(next_state, next_action,
                                              weights)[1]
    for index in next_state_feature_indices:
        delta = delta + GAMMA * weights[0][index]
    weights += ALPHA * delta * e_trace
    e_trace = GAMMA * LAMBDA * e_trace

    cur_state = next_state
    cur_action = next_action
    return cur_action
Example No. 17
def agent_step(
    reward, state
):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    # select an action, based on Q
    global w
    global z
    global last_state
    global last_action

    error = reward
    for t in my_tiles(last_state, last_action):
        error -= w[t]
        z[t] = 1

    prob = np.random.rand()
    if prob < epsilon:
        action = rand_in_range(3)
    else:
        action = max_q_hat(state)

    for t in my_tiles(state, action):
        error += discount * w[t]
    for i in range(memorySize):
        w[i] += alpha * error * z[i]
        z[i] = z[i] * discount * lamb
    last_state = state
    last_action = action
    return action
Example No. 18
def agent_end(reward):
    """
    Arguments: reward: floating point
    Returns: Nothing
    """

    global Q, returns, path
    # do learning and update pi

    for stop in path:
        # print(stop)
        if stop in returns:
            returns[stop].append(reward)
        else:
            returns[stop] = [reward]

    for key in returns:

        Q[key[0]][key[1]] = (sum(returns[(key[0], key[1])]) /
                             len(returns[(key[0], key[1])]))

    # print()
    for state in range(1, 100):
        if np.argmax(Q[state]) == 0:
            pi[state] = rand_in_range(min(state, 100 - state)) + 1
        else:
            pi[state] = np.argmax(Q[state])
    return
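
agent_end above leans on several globals that are not shown. A hedged sketch of how they might be initialized for the 100-state gambler's problem this code appears to target (the shapes and the 50-bet limit are assumptions, not taken from the source):

import numpy as np

Q = np.zeros((101, 51))        # Q[state][bet], bets 0..50 (index 0 means "no estimate yet")
pi = np.zeros(101, dtype=int)  # greedy bet per state
returns = {}                   # (state, bet) -> list of observed returns
path = []                      # (state, bet) pairs visited during the episode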
Example No. 19
def agent_step(reward, state):
    global q_values, actions, old_info, reward_buffer, gammas
    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    #epsilon greedy
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    #learning
    reward_buffer.append(reward)

    if len(old_info) >= N:
        old_state = old_info[0][0]
        old_action = old_info[0][1]

        q_values[old_state, old_action] += ALPHA * (np.sum(gammas * np.asarray(reward_buffer))+\
            (GAMMA**(N)) * q_values[hash_state, action] - q_values[old_state, old_action])

        old_info.pop(0)
        reward_buffer.pop(0)

    old_info.append((hash_state, action))

    return actions[action]
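
This n-step agent reads several module-level constants that are not shown. A sketch of plausible values (the numbers are assumptions); the key point is that gammas must hold the discounts applied to the N buffered rewards:

import numpy as np

N = 4          # assumed n-step horizon
GAMMA = 0.95   # assumed discount factor
ALPHA = 0.1    # assumed step size
EPSILON = 0.1  # assumed exploration rate
gammas = np.array([GAMMA ** i for i in range(N)])  # discounts for the reward buffer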
Example No. 20
def agent_step(
    reward, state
):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    global Q, actions, last_action, last_y, last_x
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer list
    """
    # select an action, based on Q

    # S' coordinates: x and y
    x = state[0]
    y = state[1]

    if rand_un() < epsilon:
        action_index = rand_in_range(num_action)
    else:
        action_index = np.argmax(Q[x][y])  #find best action

    #update last step's Q
    Q[last_x][last_y][last_action] += alpha_step * (
        reward + Q[x][y][action_index] - Q[last_x][last_y][last_action])

    last_x = x
    last_y = y
    last_action = action_index
    action = actions[action_index]

    return action
Example No. 21
def agent_step(
    reward, state
):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global alpha, gamma, actions_permitted, Q, action_list, last_action, last_state

    # select an action, based on Q
    if rand_un() < epsilon:
        action = action_list[rand_in_range(actions_permitted)]
    else:
        action = action_list[np.argmax(Q[int(state[0])][int(state[1])])]

    Q[last_state[0], last_state[1],
      find_action(last_action)] += alpha * (
          reward + gamma *
          Q[int(state[0]), int(state[1]),
            find_action(action)] - Q[last_state[0], last_state[1],
                                     find_action(last_action)])

    last_action = action
    last_state = state

    return action
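
Example No. 21 uses a find_action helper that is not shown; it apparently maps an action back to its index in action_list so that Q can be indexed. A minimal sketch under that assumption:

def find_action(action):
    # Index of the given action in the global action_list; assumes the stored
    # actions are plain values that compare equal with ==.
    return action_list.index(action)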
Example No. 22
def agent_step(reward, state):
    global q_values, actions, trajectory
    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    #epsilon greedy
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    #print state, action

    #learning
    n = len(trajectory)
    for i, (s, a) in enumerate(trajectory):
        #Wn = math.exp(-0.5*(g - (r+self.values[ns]))**2)
        q_values[s, a] += 1. / N[s, a] * (GAMMA**(n - 1 - i)) * (
            reward + GAMMA * q_values[hash_state, action] - q_values[s, a])

    old_state = hash_state
    old_action = action

    #if (hash_state, action) not in trajectory:
    trajectory.append((hash_state, action))
    N[hash_state, action] += 1
    return actions[action]
Example No. 23
def agent_step(reward, state):
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global Q, last_action, last_state

    # choose A' from S' using policy derived from Q
    # 0 represent exploration, 1 represent exploitation
    choice = np.array([0, 1])
    result = np.random.choice(choice, p=[epsilon, 1 - epsilon])

    if result == 0:
        # exploration
        action = rand_in_range(num_actions)

    else:
        # exploitation
        action = np.argmax(Q[state[0], state[1], :])

    Q[last_state[0], last_state[1], last_action] = Q[last_state[0], last_state[1], last_action] + \
      alpha*(reward + Q[state[0], state[1], action] -  Q[last_state[0], last_state[1], last_action])

    last_state = state
    last_action = action

    return action
Example No. 24
def env_start():
    """ returns numpy array """
    global current_state

    state = rand_in_range(
        num_total_states) + 1  # This is required for exploring starts
    current_state = np.asarray([state])
    return current_state
Example No. 25
def action_select(s):
    if rand_un() < e:
        #explore
        return rand_in_range(4)
    else:
        #decide action based on policy
        return np.random.choice(
            np.where(Q[s[0]][s[1]] == np.amax(Q[s[0]][s[1]]))[0])
Example No. 26
def agent_step(reward, state):
    global state_action_values, cur_state, cur_action, cur_epsilon

    next_state = state
    #Choose the next action, epsilon greedy style
    if AGENT == TABULAR:
        if rand_un() < 1 - cur_epsilon:
            #Need to ensure that an action is picked uniformly at random from among those that tie for maximum
            cur_max = state_action_values[state[0]][state[1]][0]
            max_indices = [0]
            for i in range(1, len(state_action_values[state[0]][state[1]])):
                if state_action_values[state[0]][state[1]][i] > cur_max:
                    cur_max = state_action_values[state[0]][state[1]][i]
                    max_indices = [i]
                elif state_action_values[state[0]][state[1]][i] == cur_max:
                    max_indices.append(i)
            next_action = max_indices[rand_in_range(len(max_indices))]
        else:
            next_action = rand_in_range(NUM_ACTIONS)

        #Update the state action values
        next_state_max_action = state_action_values[next_state[0]][next_state[1]].index(
            max(state_action_values[next_state[0]][next_state[1]]))
        state_action_values[cur_state[0]][cur_state[1]][cur_action] += ALPHA * (
            reward + GAMMA *
            state_action_values[next_state[0]][next_state[1]][next_state_max_action] -
            state_action_values[cur_state[0]][cur_state[1]][cur_action])

    elif AGENT == NEURAL:
        #Choose the next action, epsilon greedy style
        if rand_un() < 1 - cur_epsilon:

            #Get the best action over all actions possible in the next state,
            q_vals = model.predict(encode_1_hot(next_state), batch_size=1)
            q_max = np.max(q_vals)
            next_action = np.argmax(q_vals)
            cur_action_target = reward + GAMMA * q_max

            #Get the value for the current state for which the action was just taken
            cur_state_1_hot = encode_1_hot(cur_state)
            q_vals = model.predict(cur_state_1_hot, batch_size=1)
            q_vals[0][cur_action] = cur_action_target
            model.fit(cur_state_1_hot, q_vals, batch_size=1, epochs=1, verbose=0)
        else:
            next_action = rand_in_range(NUM_ACTIONS)

    cur_state = next_state
    cur_action = next_action
    return next_action
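
The NEURAL branch above assumes an encode_1_hot(state) helper that turns an (x, y) grid position into a one-hot input row for the Keras model. A minimal sketch, using a 9x6 grid as an assumed size (matching the Dyna examples):

import numpy as np

GRID_WIDTH, GRID_HEIGHT = 9, 6  # assumed grid dimensions

def encode_1_hot(state):
    # One-hot row vector over all grid cells, shaped (1, cells) for batch_size=1.
    one_hot = np.zeros((1, GRID_WIDTH * GRID_HEIGHT))
    one_hot[0][state[1] * GRID_WIDTH + state[0]] = 1
    return one_hot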
Example No. 27
def agent_start(state):
    global state_action_values, cur_state, cur_action

    if AGENT == TABULAR:
        #All value functions are initialized to zero, so we can just select randomly for the first action, since they all tie
        cur_action = rand_in_range(NUM_ACTIONS)
    elif AGENT == NEURAL:
        cur_action = get_max_action(state)
    return cur_action
Example No. 28
def agent_start(state):
    # pick the first action, don't forget about exploring starts
    global action_hist

    #choose a random action
    action = rand_in_range(min(state[0], 100 - state[0])) + 1
    action_hist[state[0] - 1][action - 1] += 1

    return action
Example No. 29
def agent_start(this_observation
                ):  # returns NumPy array, this_observation: NumPy array
    global last_action, action_times, estimate_values, op_values  #,op_init
    #op_init=0
    action_times = np.zeros(10)
    estimate_values = np.zeros(10)
    op_values = np.zeros(10)
    if this_observation[0] != 0:
        for i in range(num_actions):
            op_values[i] = this_observation[0]
        local_action = np.zeros(1)
        local_action[0] = rand_in_range(num_actions)
        return local_action
        #last_action[0] = rand_in_range(num_actions)
    local_action = np.zeros(1)
    local_action[0] = rand_in_range(num_actions)
    last_action[0] = local_action[0]
    return local_action
Example No. 30
def agent_step(
    reward, state
):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    global Q, last_action, last_y, last_x, model, theta, PQueue, model2
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer list
    """
    # select an action, based on Q

    # S' coordinates: x and y
    x = state[0]
    y = state[1]

    modelKey = (last_x, last_y, last_action)
    model[modelKey] = (reward, x, y)

    model2Key = (x, y)
    if (reward, last_x, last_y, last_action) not in model2[model2Key]:
        model2[model2Key].append((reward, last_x, last_y, last_action))

    p = reward + gamma * np.max(Q[x][y]) - Q[last_x][last_y][last_action]

    if p > theta:
        PQueue.put((p, last_x, last_y, last_action))

    i = 0
    while i < 5 and not PQueue.empty():
        i += 1
        firstTuple = PQueue.get()
        key = firstTuple[1:4]
        (modelX, modelY, modelA) = key
        (modelReward, modelNextX, modelNextY) = model[key]

        Q[modelX][modelY][modelA] += alpha_step * (
            modelReward + gamma * np.max(Q[modelNextX][modelNextY]) -
            Q[modelX][modelY][modelA])

        key2 = (modelX, modelY)
        for item in model2[key2]:
            (previousReward, previousX, previousY, previousA) = item
            p = previousReward + gamma * np.max(
                Q[modelX][modelY]) - Q[previousX][previousY][previousA]
            if p > theta:
                PQueue.put((p, previousX, previousY, previousA))

    action_index = np.argmax(Q[x][y])  #find best action
    if Q[x][y][action_index] == 0:
        action_index = rand_in_range(4)

    last_x = x
    last_y = y
    last_action = action_index
    action = actions[action_index]

    return action
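
Example No. 30 treats PQueue as a priority queue that hands back the largest TD error first. If PQueue were Python's standard queue.PriorityQueue, which always pops the smallest entry, the usual workaround (shown here as an assumption, not something taken from this agent) is to insert negated priorities:

import queue

# Minimal sketch: a "largest first" queue built on queue.PriorityQueue by
# negating the priority on insert.
PQueue = queue.PriorityQueue()
PQueue.put((-2.5, 3, 1, 0))   # priority 2.5 for state (3, 1), action 0
PQueue.put((-0.7, 5, 2, 1))   # priority 0.7

neg_p, x, y, a = PQueue.get()  # pops the entry with the largest original priority
priority = -neg_p              # 2.5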