def agent_step(reward, state):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: NumPy array
    Returns: action: integer
    """
    global last_x, last_y, last_action, Model, visited, Q
    x, y = state

    # Q-learning update for the previous state-action pair
    Q[last_x, last_y, last_action] += alpha * (reward + gamma * np.max(Q[x, y, :]) - Q[last_x, last_y, last_action])

    # record the observed transition in the model, then run Dyna-Q planning
    Model[last_x, last_y, last_action, 0] = x
    Model[last_x, last_y, last_action, 1] = y
    Model[last_x, last_y, last_action, 2] = reward
    visited.append((last_x, last_y, last_action))
    Dyna_Q()

    # select an action, based on Q: greedy with random tie-breaking, epsilon-random otherwise
    #action = np.argmax(Q[x, y, :])
    action = np.random.choice(np.where(Q[x, y, :] == Q[x, y, :].max())[0])
    if rand_un() < epsilon:
        action = rand_in_range(4)
    #print state, action

    last_x = x
    last_y = y
    last_action = action
    return last_action

def agent_step(reward, state):
    global actions, old_state, old_action, iht, weights, z

    #tile-coding
    scaled_ns1 = 1. * NUM_TILES * (state[0] - POSITION[0]) / (POSITION[1] - POSITION[0])
    scaled_ns2 = 1. * NUM_TILES * (state[1] - VELOCITY[0]) / (VELOCITY[1] - VELOCITY[0])
    hash_s = old_state
    hash_ns = np.asarray(tiles(iht, NUM_TILINGS, [scaled_ns1, scaled_ns2]))

    #epsilon-greedy
    rand = rand_un()
    if rand < EPSILON:
        n_action = random.choice(actions)
    else:
        n_action = np.argmax(np.sum(weights[:, hash_ns], axis=1))

    #learning and update traces
    q = np.sum(weights[old_action, hash_s])
    nq = np.sum(weights[n_action, hash_ns])
    z[old_action, hash_s] = 1.
    weights += ALPHA * (reward + GAMMA * nq - q) * z
    z *= GAMMA * LAMBDA

    old_state = hash_ns
    old_action = n_action
    return n_action

def agent_step(reward, this_observation):  # returns NumPy array, reward: floating point, observation_t: NumPy array
    global local_action, last_observation, this_action, action_value_estimates, action_counts, time_step, C

    #Update estimate for current action
    cur_action = int(this_action[0])
    action_value_estimates[cur_action] += alpha * (reward - action_value_estimates[cur_action])

    #Choose a new action using the parameter based agents
    stp1 = this_observation[0]
    action_selection_prob = rand_un()
    if action_selection_prob <= (1 - epsilon):
        atp1 = action_value_estimates.index(max(action_value_estimates))
    elif episode == EPSILON_GREEDY:
        atp1 = randInRange(numActions)
    else:
        print("BAD EPISODE: NO ACTION SELECTION FOR THE CURRENT AGENT!!!")
        exit()

    action_counts[atp1] += 1
    time_step += 1
    local_action[0] = atp1
    this_action = local_action
    last_observation = this_observation
    return this_action

def agent_step(reward, state):
    global cur_state, cur_action, weights, e_trace
    next_state = state

    #Update the weights
    delta = reward
    cur_state_feature_indices = approx_value(cur_state, cur_action, weights)[1]
    for index in cur_state_feature_indices:
        delta = delta - weights[0][index]
        e_trace[0][index] = 1

    #Choose the next action, epsilon-greedy style (action values are evaluated at the new state)
    if rand_un() < 1 - EPSILON:
        actions = [approx_value(next_state, action, weights)[0] for action in range(NUM_ACTIONS)]
        next_action = actions.index(max(actions))
    else:
        next_action = rand_in_range(NUM_ACTIONS)

    next_state_feature_indices = approx_value(next_state, next_action, weights)[1]
    for index in next_state_feature_indices:
        delta = delta + GAMMA * weights[0][index]

    weights += ALPHA * delta * e_trace
    e_trace = GAMMA * LAMBDA * e_trace

    cur_state = next_state
    cur_action = next_action
    return cur_action

def agent_step(reward, state):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: NumPy array
    Returns: action: integer
    """
    global alpha, gamma, actions_permitted, Q, action_list, last_action, last_state

    # select an action, based on Q (epsilon-greedy)
    if rand_un() < epsilon:
        action = action_list[rand_in_range(actions_permitted)]
    else:
        action = action_list[np.argmax(Q[int(state[0])][int(state[1])])]

    # Sarsa update for the previous state-action pair
    Q[last_state[0], last_state[1], find_action(last_action)] += alpha * (
        reward + gamma * Q[int(state[0]), int(state[1]), find_action(action)]
        - Q[last_state[0], last_state[1], find_action(last_action)])

    last_action = action
    last_state = state
    return action

def agent_step(reward, state):
    global q_values, actions, old_info, reward_buffer, gammas

    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    #epsilon greedy
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    #learning: n-step update for the state-action pair visited N steps ago
    reward_buffer.append(reward)
    if len(old_info) >= N:
        old_state = old_info[0][0]
        old_action = old_info[0][1]
        q_values[old_state, old_action] += ALPHA * (np.sum(gammas * np.asarray(reward_buffer)) +
            (GAMMA ** N) * q_values[hash_state, action] - q_values[old_state, old_action])
        old_info.pop(0)
        reward_buffer.pop(0)

    old_info.append((hash_state, action))
    return actions[action]

def pick_action(arr, epsilon, num_actions):
    arg_max = np.argmax(arr)
    if rand_un() < epsilon:
        action = rand_in_range(num_actions)
    else:
        action = arg_max
    return action

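# A minimal usage sketch for pick_action above; the values are illustrative, not from
# the original code, and it assumes rand_un() returns a uniform float in [0, 1) and
# rand_in_range(n) a uniform integer in [0, n), as in the surrounding agents:
#   q_row = np.array([0.1, 0.5, 0.2, 0.4])
#   a = pick_action(q_row, 0.1, len(q_row))  # index 1 with probability 0.9 + 0.1/4
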
def agent_step(reward, state):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    global Q, actions, last_action, last_y, last_x
    """
    Arguments: reward: floating point, state: NumPy array
    Returns: action
    """
    # select an action, based on Q
    # s' x and y
    x = state[0]
    y = state[1]
    if rand_un() < epsilon:
        action_index = rand_in_range(num_action)
    else:
        action_index = np.argmax(Q[x][y])  #find best action

    #update last step's Q
    Q[last_x][last_y][last_action] += alpha_step * (
        reward + Q[x][y][action_index] - Q[last_x][last_y][last_action])

    last_x = x
    last_y = y
    last_action = action_index
    action = actions[action_index]
    return action

def agent_step(reward, state):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: NumPy array
    Returns: action: integer
    """
    global Q, last_action, S, S_, model, previous_states

    S_ = state
    if (S[0], S[1]) not in previous_states:
        previous_states[(S[0], S[1])] = set()
    previous_states[(S[0], S[1])].add(last_action)

    # Q-learning update for the observed transition, then record it in the model
    Q[S[0]][S[1]][last_action] += alpha * (reward + gamma * max(Q[S_[0]][S_[1]]) - Q[S[0]][S[1]][last_action])
    model[S[0]][S[1]][last_action] = (reward, S_[0], S_[1])

    # Dyna-Q planning: n simulated updates from previously visited state-action pairs
    for i in range(n):
        S_planning = random.choice(list(previous_states.keys()))
        A_planning = random.sample(previous_states[S_planning], 1)
        reward_planning, x_planning, y_planning = model[S_planning[0]][S_planning[1]][A_planning[0]]
        Q[S_planning[0]][S_planning[1]][A_planning[0]] += alpha * (
            reward_planning + gamma * max(Q[x_planning][y_planning])
            - Q[S_planning[0]][S_planning[1]][A_planning[0]])

    # select an action, based on Q
    if rand_un() < epsilon:
        action = rand_in_range(4)
    else:
        action = argmax(Q[S[0]][S[1]])

    S = S_
    last_action = action
    return action

def agent_start(state):
    global Q, last_action, last_y, last_x, model
    """
    Hint: Initialize the variables that you want to reset before starting a new episode
    Arguments: state: numpy array
    Returns: action: integer list
    """
    # pick the first action, don't forget about exploring starts
    x = state[0]
    y = state[1]
    if rand_un() < epsilon:
        action_index = rand_in_range(num_action)
    else:
        action_index = np.argmax(Q[x][y])  #find best action
    if Q[x][y][action_index] == 0:
        action_index = rand_in_range(4)

    last_action = action_index
    last_x = x
    last_y = y
    action = actions[action_index]
    return action

def agent_step(reward, state):
    global q_values, actions, trajectory

    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    #epsilon greedy
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])
    #print state, action

    #learning: update every pair on the trajectory, discounted by how long ago it was visited
    n = len(trajectory)
    for i, (s, a) in enumerate(trajectory):
        #Wn = math.exp(-0.5*(g - (r+self.values[ns]))**2)
        q_values[s, a] += 1. / N[s, a] * (GAMMA ** (n - 1 - i)) * (
            reward + GAMMA * q_values[hash_state, action] - q_values[s, a])

    old_state = hash_state
    old_action = action
    #if (hash_state, action) not in trajectory:
    trajectory.append((hash_state, action))
    N[hash_state, action] += 1
    return actions[action]

def env_step(action):
    global current_state

    reward = 0.0
    is_terminal = False

    #go left
    if action == 'left':
        left_stop = max(0, current_state[0] - 100)
        left_range = current_state[0] - left_stop
        left_prob = 1. * (100 - (left_range - 1)) / 100
        rand = rand_un()
        if rand <= left_prob:
            current_state[0] = left_stop
        else:
            current_state[0] = random.choice(range(left_stop + 1, current_state[0]))
    #go right
    else:
        right_stop = min(1001, current_state[0] + 100)
        right_range = right_stop - current_state[0]
        right_prob = 1. * (100 - (right_range - 1)) / 100
        rand = rand_un()
        if rand <= right_prob:
            current_state[0] = right_stop
        else:
            current_state[0] = random.choice(range(current_state[0] + 1, right_stop))

    if current_state[0] == 0:
        is_terminal = True
        current_state = None
        reward = -1.0
    elif current_state[0] == 1001:
        is_terminal = True
        current_state = None
        reward = 1.0

    result = {"reward": reward, "state": current_state, "isTerminal": is_terminal}
    return result

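# Hypothetical caller-side sketch for the random-walk env_step above; it assumes some
# env_start() elsewhere has initialized the global current_state (e.g. to [500]):
#   outcome = env_step('left')
#   outcome["reward"], outcome["state"], outcome["isTerminal"]  # keys returned above
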
def action_select(s):
    if rand_un() < e:
        #explore
        return rand_in_range(4)
    else:
        #decide action based on policy
        return np.random.choice(np.where(Q[s[0]][s[1]] == np.amax(Q[s[0]][s[1]]))[0])

def agent_step(reward, state):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: should be a constant -1 until episode termination
    Returns action as 0, 1, or 2
    """
    global old_state, iht, curr_state, last_action, alpha, w, gamma, num_tilings, tile_width, epsilon, z, lambd

    action = None
    curr_state = state
    err = reward

    #unpack the np arrays
    x, xdot = state
    o_x, o_xdot = old_state

    #calculate indices of affected features of curr_state and old_state
    features_old = tiles(iht, num_tilings,
                         [((o_x / (0.5 + 1.2)) * (1 / tile_width)),
                          ((o_xdot / (0.07 + 0.07)) * (1 / tile_width))],
                         [last_action])  #x(s) components.
    for i in features_old:
        err -= w[i]
        z[i] = 1.0

    ######## ACTION SELECTION ########
    ExploreExploit = rand_un()
    if ExploreExploit < epsilon:
        action = random.randrange(0, 3)  # Explore
    else:
        ## GREEDY ACTION ##
        max_q = -999999999.9  #approx -inf
        for i in range(0, 3):  #number of possible actions
            Xs = tiles(iht, num_tilings,
                       [((x / (0.5 + 1.2)) * (1 / tile_width)),
                        ((xdot / (0.07 + 0.07)) * (1 / tile_width))],
                       [i])  #x(s') components.
            est_q = 0.0
            for j in Xs:
                est_q += w[j]
            if est_q > max_q:
                max_q = est_q
                action = i

    features_curr = tiles(iht, num_tilings,
                          [((x / (0.5 + 1.2)) * (1 / tile_width)),
                           ((xdot / (0.07 + 0.07)) * (1 / tile_width))],
                          [action])  #x(s') components.
    for i in features_curr:
        err += gamma * w[i]

    w += alpha * err * z
    z = z * gamma * lambd

    old_state = state
    last_action = action
    return action

def agent_step(reward, state):
    global state_action_values, cur_state, cur_action, cur_epsilon
    next_state = state

    #Choose the next action, epsilon greedy style
    if AGENT == TABULAR:
        if rand_un() < 1 - cur_epsilon:
            #Need to ensure that an action is picked uniformly at random from among those that tie for maximum
            cur_max = state_action_values[state[0]][state[1]][0]
            max_indices = [0]
            for i in range(1, len(state_action_values[state[0]][state[1]])):
                if state_action_values[state[0]][state[1]][i] > cur_max:
                    cur_max = state_action_values[state[0]][state[1]][i]
                    max_indices = [i]
                elif state_action_values[state[0]][state[1]][i] == cur_max:
                    max_indices.append(i)
            next_action = max_indices[rand_in_range(len(max_indices))]
        else:
            next_action = rand_in_range(NUM_ACTIONS)

        #Update the state action values
        next_state_max_action = state_action_values[next_state[0]][next_state[1]].index(max(state_action_values[next_state[0]][next_state[1]]))
        state_action_values[cur_state[0]][cur_state[1]][cur_action] += ALPHA * (
            reward + GAMMA * state_action_values[next_state[0]][next_state[1]][next_state_max_action]
            - state_action_values[cur_state[0]][cur_state[1]][cur_action])

    elif AGENT == NEURAL:
        #Choose the next action, epsilon greedy style
        if rand_un() < 1 - cur_epsilon:
            #Get the best action over all actions possible in the next state
            q_vals = model.predict(encode_1_hot(next_state), batch_size=1)
            q_max = np.max(q_vals)
            next_action = np.argmax(q_vals)
            cur_action_target = reward + GAMMA * q_max

            #Get the value for the current state for which the action was just taken
            cur_state_1_hot = encode_1_hot(cur_state)
            q_vals = model.predict(cur_state_1_hot, batch_size=1)
            q_vals[0][cur_action] = cur_action_target
            model.fit(cur_state_1_hot, q_vals, batch_size=1, epochs=1, verbose=0)
        else:
            next_action = rand_in_range(NUM_ACTIONS)

    cur_state = next_state
    cur_action = next_action
    return next_action

def agent_start(state):
    global state_action_values, cur_state, cur_action

    #Choose the first action, epsilon greedy style
    cur_state = state
    if rand_un() < 1 - EPSILON:
        cur_action = state_action_values[state[0]][state[1]].index(max(state_action_values[state[0]][state[1]]))
    else:
        cur_action = rand_in_range(NUM_ACTIONS)
    return cur_action

def pick_action(self):
    if self._last_action is None:
        self.pick_first_action()
    # arg_max = self._find_max(self._Q)
    arg_max = np.argmax(self._Q)
    if rand_un() < self._epsilon:
        action = rand_in_range(self._num_actions)
    else:
        action = arg_max
    self._last_action = action
    # print('action is {}'.format(self._last_action))
    return action

def agent_step(reward, state):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    global Q, last_action, last_y, last_x, model
    """
    Arguments: reward: floating point, state: NumPy array
    Returns: action
    """
    # select an action, based on Q
    # s' x and y
    x = state[0]
    y = state[1]

    # Q-learning update for the last real transition
    Q[last_x][last_y][last_action] += alpha_step * (reward + gamma * np.max(Q[x][y]) - Q[last_x][last_y][last_action])

    # record the observed transition in the model
    modelKey = (last_x, last_y, last_action)
    model[modelKey] = [reward, x, y]

    # Dyna-Q planning: n simulated updates from previously observed transitions
    i = 0
    while i < n:
        i += 1
        chosen = False
        while not chosen:
            modelX = rand_in_range(9)
            modelY = rand_in_range(6)
            modelA = rand_in_range(4)
            if model[(modelX, modelY, modelA)][0] != -1.0:
                chosen = True
        modelNextY = model[(modelX, modelY, modelA)][2]
        modelNextX = model[(modelX, modelY, modelA)][1]
        modelReward = model[(modelX, modelY, modelA)][0]
        Q[modelX][modelY][modelA] += alpha_step * (modelReward + gamma * np.max(Q[modelNextX][modelNextY]) - Q[modelX][modelY][modelA])

    if rand_un() < epsilon:
        action_index = rand_in_range(num_action)
    else:
        action_index = np.argmax(Q[x][y])  #find best action
    if Q[x][y][action_index] == 0:
        action_index = rand_in_range(4)

    last_x = x
    last_y = y
    last_action = action_index
    action = actions[action_index]
    return action

def agent_step(reward, state):  # returns NumPy array, reward: floating point, this_observation: NumPy array
    """
    Arguments: reward: floating point, state: integer
    Returns: action: integer
    """
    global pre, policy, ph, e

    # update the empirical heads count and rebuild the policy
    if int(state) > int(pre[0]):
        ph[0] += 1
    ph[1] += 1
    policy_create(pre, reward, state)

    # select an action: random stake with probability 1 - e, otherwise follow the policy
    if rand_un() > e:
        action = 1 + int(rand_un() * min(state, 99 - state + 1))
    else:
        action = int(policy[state])

    pre[0] = state
    pre[1] = action
    return action

def agent_start(state):
    global pre
    """
    Hint: Initialize the variables that you want to reset before starting a new episode
    Arguments: state: numpy array
    Returns: action: integer
    """
    # pick the first action, don't forget about exploring starts
    action = 1 + int(rand_un() * min(state, 99 - state + 1))
    pre[0] = state
    pre[1] = action
    return action

def agent_step(reward, state):
    global state_action_values, cur_state, cur_action
    next_state = state

    if AGENT == TABULAR:
        #Choose the next action, epsilon greedy style
        if rand_un() < 1 - EPSILON:
            next_action = state_action_values[state[0]][state[1]].index(max(state_action_values[state[0]][state[1]]))
        else:
            next_action = rand_in_range(NUM_ACTIONS)

        #Update the state action values (Q-learning target uses the greedy action in the next state)
        next_state_max_action = state_action_values[next_state[0]][next_state[1]].index(max(state_action_values[next_state[0]][next_state[1]]))
        state_action_values[cur_state[0]][cur_state[1]][cur_action] += ALPHA * (
            reward + GAMMA * state_action_values[next_state[0]][next_state[1]][next_state_max_action]
            - state_action_values[cur_state[0]][cur_state[1]][cur_action])
        #print(state_action_values)

    elif AGENT == NEURAL:
        #Choose the next action, epsilon greedy style
        if rand_un() < 1 - EPSILON:
            #Get the best action over all actions possible in the next state
            q_vals = model.predict(encode_1_hot(next_state), batch_size=1)
            q_max = np.max(q_vals)
            next_action = np.argmax(q_vals)
            cur_action_target = reward + GAMMA * q_max

            #Get the value for the current state for which the action was just taken
            cur_state_1_hot = encode_1_hot(cur_state)
            q_vals = model.predict(cur_state_1_hot, batch_size=1)
            q_vals[0][cur_action] = cur_action_target
            model.fit(cur_state_1_hot, q_vals, batch_size=1, epochs=1, verbose=0)
        else:
            next_action = rand_in_range(NUM_ACTIONS)

    #print(AGENT)
    #print(next_action)
    cur_state = next_state
    cur_action = next_action
    return next_action

def agent_step(reward, state):
    global old_action, old_state

    # epsilon-greedy action selection
    if rand_un() < e:
        action = rand_in_range(movement)
    else:
        action = np.argmax(Q[state[0]][state[1]])

    # Sarsa-style update for the previous state-action pair (discount factor of 1)
    Q[old_state[0]][old_state[1]][old_action] += a * (
        reward + Q[state[0]][state[1]][action] - Q[old_state[0]][old_state[1]][old_action])

    old_state[0] = state[0]
    old_state[1] = state[1]
    old_action = action
    return action

def agent_start(this_observation):  # returns NumPy array, this_observation: NumPy array
    global last_action

    local_action = np.zeros(1)
    if rand_un() < 0:  # you may change it to 0 or 0.1
        local_action[0] = rand_in_range(num_actions)
    else:
        local_action[0] = findGreedyAction()

    last_action = local_action
    return local_action[0]

def choose_action(state):
    global Q

    # greedy action with ties broken uniformly at random
    mx = np.amax(Q[state[0], state[1], :])
    arg = np.argwhere(Q[state[0], state[1], :] == mx)
    inds = np.reshape(arg, np.size(arg))
    greedy_action = np.random.choice(inds)

    # epsilon-greedy: explore with probability epsilon
    toss = rand_un()
    if toss > epsilon:
        action = greedy_action
    else:
        action = rand_in_range(NUMBER_OF_ACTIONS)
    return action

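# Hypothetical usage of choose_action, assuming the module-level globals it reads
# (Q, epsilon, NUMBER_OF_ACTIONS) have been initialized elsewhere; shapes are illustrative:
#   Q = np.zeros((rows, cols, NUMBER_OF_ACTIONS)); epsilon = 0.1
#   a = choose_action((r, c))  # greedy with random tie-breaking, else uniform random
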
def agent_start(state):
    global cur_state, cur_action, weights
    cur_state = state

    #Choose the first action, epsilon-greedy style
    if rand_un() < 1 - EPSILON:
        actions = [approx_value(cur_state, action, weights)[0] for action in range(NUM_ACTIONS)]
        cur_action = actions.index(max(actions))
    else:
        cur_action = rand_in_range(NUM_ACTIONS)
    return cur_action

def agent_step(reward, state):
    global state_action_values, cur_state, cur_action
    next_state = state

    #Choose the next action, epsilon greedy style
    if rand_un() < 1 - EPSILON:
        next_action = state_action_values[state[0]][state[1]].index(max(state_action_values[state[0]][state[1]]))
    else:
        next_action = rand_in_range(NUM_ACTIONS)

    #Update the state action value function
    state_action_values[cur_state[0]][cur_state[1]][cur_action] += ALPHA * (
        reward + state_action_values[next_state[0]][next_state[1]][next_action]
        - state_action_values[cur_state[0]][cur_state[1]][cur_action])

    cur_state = next_state
    cur_action = next_action
    return next_action

def agent_start(state):
    global actions, old_state, old_action, q_values

    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x  # there are 70 states in total

    # epsilon-greedy choice of the first action
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    old_state = hash_state
    old_action = action
    return actions[action]

def agent_start(state):
    """
    Hint: Initialize the variables that you want to reset before starting a new episode
    Arguments: state: numpy array
    Returns: action: integer
    """
    global actions_permitted, Q, action_list, last_action, last_state

    # pick the first action, don't forget about exploring starts
    if rand_un() < epsilon:
        action = action_list[rand_in_range(actions_permitted)]
    else:
        action = action_list[np.argmax(Q[state[0]][state[1]])]

    last_action = action
    last_state = state
    return action

def agent_start(state):
    global actions, old_info, reward_buffer

    x = state[0][0]
    y = state[0][1]
    hash_state = y * 10 + x

    # epsilon-greedy choice of the first action
    rand = rand_un()
    if rand <= EPSILON:
        action = rand_in_range(len(actions))
    else:
        action = np.argmax(q_values[hash_state, :])

    old_info = []
    old_info.append((hash_state, action))
    reward_buffer = []
    return actions[action]

def env_step(this_action):  # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global local_observation, this_reward_observation  # , nStatesSimpleEnv

    episode_over = False
    the_reward = randn(0.0, 1.0)  # rewards drawn from (0, 1) Gaussian
    atp1 = this_action[0]  # how to extract the action
    stp1 = randInRange(nStatesSimpleEnv)  # state transitions are uniform random

    if rand_un() < 0.05:
        episode_over = True  # termination is random

    local_observation[0] = stp1
    this_reward_observation = (the_reward, this_reward_observation[1], episode_over)

    return this_reward_observation
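
# Sketch of how the tuple returned above might be consumed by an experiment loop;
# the names are illustrative, only the (reward, observation, terminal-flag) ordering
# comes from the code:
#   the_reward, the_observation, episode_over = env_step(np.asarray([0]))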