def agent_start(self, state):
    """
    Arguments: state - numpy array
    Returns: action - integer
    """
    # Reset the eligibility trace at the start of every episode.
    self.Z = np.zeros(self.memorySize)
    # Active tiles for each action: position is scaled by 8 / (0.5 + 1.2)
    # and velocity by 8 / (0.07 + 0.07) so one tiling spans the state range.
    F0 = tile.tiles(self.iht, 8, [8 * state[0] / (0.5 + 1.2), 8 * state[1] / (0.07 + 0.07)], [0])
    F1 = tile.tiles(self.iht, 8, [8 * state[0] / (0.5 + 1.2), 8 * state[1] / (0.07 + 0.07)], [1])
    F2 = tile.tiles(self.iht, 8, [8 * state[0] / (0.5 + 1.2), 8 * state[1] / (0.07 + 0.07)], [2])
    # Each action value is the sum of the weights of its active tiles.
    Q0 = self.weight[F0].sum()
    Q1 = self.weight[F1].sum()
    Q2 = self.weight[F2].sum()
    self.state = state
    A = np.argmax([Q0, Q1, Q2])
    self.F = [F0, F1, F2][A]
    return A
def question_3():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 1000000

    for _ in range(num_runs):
        rlglue.rl_init()
        for i in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            print(i)

    fout = open('value', 'w')
    steps = 50
    w, iht = rlglue.rl_agent_message("ValueFunction")
    Q = np.zeros([steps, steps])
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(3):
                # q(s, a) is the sum of weights over the active tiles;
                # the state is scaled the same way as during learning.
                value = 0
                for index in tiles(iht, 8,
                                   [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                                    8 * (-0.07 + (j * 0.14 / steps)) / 0.14],
                                   [a]):
                    value += w[index]
                values.append(value)
            # Cost-to-go is the negated greedy value, -max_a q(s, a).
            height = -max(values)
            fout.write(repr(height) + ' ')
            Q[j][i] = height
        fout.write('\n')
    fout.close()
    np.save("value", Q)
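For later inspection, the saved grid can be reloaded directly. Note that np.save appends a .npy suffix, so the array lands in value.npy. A minimal sketch, assuming only the file written above:

import numpy as np

Q = np.load('value.npy')  # 50x50 grid of cost-to-go values
print(Q.shape, Q.min(), Q.max())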
def agent_message(self, in_message):
    """
    Arguments: in_message - string
    Saves the value function to 'values.npy'.

    This function is complete. You do not need to add code here.
    """
    if in_message == 'Q3':
        steps = 50
        numActions = 3
        values = np.zeros((steps, steps))
        for i in range(steps):
            for j in range(steps):
                Q = np.zeros(numActions)
                for a in range(numActions):
                    # Note: the state passed to tiles here must be scaled
                    # the same way the agent scaled it during learning.
                    inds = tile.tiles(self.iht, 8,
                                      [(-1.2 + (i * 1.7 / steps)),
                                       (-0.07 + (j * 0.14 / steps))],
                                      [a])
                    Q[a] = np.sum(self.weight[inds])
                # Store the cost-to-go, -max_a q(s, a).
                values[i][j] = -np.max(Q)
        np.save('values', values)
    else:
        return "I don't know how to respond to this message!!"
def mytiles(self, x, x_dot, action):
    # Scale each dimension so one tiling spans the full state range:
    # position in [-1.2, 0.5] (width 1.7), velocity in [-0.07, 0.07] (width 0.14).
    scaling_factor_x = 8 / 1.7
    scaling_factor_xdot = 8 / 0.14
    x = (x + 1.2) * scaling_factor_x
    x_dot = (x_dot + 0.07) * scaling_factor_xdot
    return tiles(self.iht, self.offset_t, [x, x_dot], [action])
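Wrappers like mytiles are typically paired with a weight vector to form the action-value estimate. A minimal standalone sketch, assuming Sutton and Barto's tiles3.py is importable and a hash-table size of 4096 (both assumptions, not taken from the snippet above):

import numpy as np
from tiles3 import IHT, tiles  # tiles3.py from Sutton & Barto's site

iht = IHT(4096)      # assumed hash-table size
w = np.zeros(4096)   # one weight per tile index
num_tilings = 8      # assumed number of tilings

def q_hat(x, x_dot, action):
    # q(s, a) is the sum of the weights of the active tiles.
    active = tiles(iht, num_tilings,
                   [(x + 1.2) * num_tilings / 1.7,
                    (x_dot + 0.07) * num_tilings / 0.14],
                   [action])
    return w[active].sum()

print(q_hat(-0.5, 0.0, 1))  # 0.0 before any learning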
def getActiveTiles(self, state):
    # Collect the active tiles for all three actions at the given state.
    tileVectors = []
    scaled_state = self.getTileScale(state)
    for a in range(3):
        selectedTiles = tiles(self.iht, 8, scaled_state, [a])
        tileVectors.append(selectedTiles)
    return tileVectors
def agent_step(self, reward, state):
    """
    Arguments: reward - floating point, state - numpy array
    Returns: action - integer
    """
    # TD error starts from the reward minus the previous state-action value;
    # set replacing traces on the previously active tiles.
    delta = reward - self.weight[self.F].sum()
    self.Z[self.F] = 1
    # Active tiles and values for each action in the new state.
    F0 = tile.tiles(self.iht, 8, [8 * state[0] / (0.5 + 1.2), 8 * state[1] / (0.07 + 0.07)], [0])
    F1 = tile.tiles(self.iht, 8, [8 * state[0] / (0.5 + 1.2), 8 * state[1] / (0.07 + 0.07)], [1])
    F2 = tile.tiles(self.iht, 8, [8 * state[0] / (0.5 + 1.2), 8 * state[1] / (0.07 + 0.07)], [2])
    Q0 = self.weight[F0].sum()
    Q1 = self.weight[F1].sum()
    Q2 = self.weight[F2].sum()
    A = np.argmax([Q0, Q1, Q2])
    F_next = [F0, F1, F2][A]
    # Complete the Sarsa(lambda) TD error with the bootstrap term,
    # then update the weights and decay the traces.
    delta += self.gamma * self.weight[F_next].sum()
    self.weight += self.alpha * delta * self.Z
    self.Z *= self.gamma * self.lam
    self.F = F_next
    return A
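The core of the step above is the Sarsa(lambda) update with replacing traces. A compact standalone sketch of just that update (the function and argument names here are illustrative assumptions):

import numpy as np

def sarsa_lambda_update(w, z, reward, active_prev, active_next,
                        alpha, gamma, lam):
    # delta = R + gamma * q(S', A') - q(S, A), where each q is the sum
    # of the weights over that state-action pair's active tiles.
    z[active_prev] = 1.0  # replacing traces
    delta = reward + gamma * w[active_next].sum() - w[active_prev].sum()
    w += alpha * delta * z
    z *= gamma * lam      # decay every trace after the update
    return w, z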
def get_action(self, state):
    # Epsilon-greedy: explore with probability eps, otherwise act greedily,
    # breaking ties among maximal actions uniformly at random.
    if np.random.random() < self.eps:
        return np.random.choice(self.actions)
    value = np.asarray([
        self.w[tile3.tiles(self.iht, self.num_tiling,
                           state * self.reshape, [a])].sum()
        for a in self.actions
    ], dtype=float)
    return self.actions[np.random.choice(np.flatnonzero(value == value.max()))]
def agent_end(self, reward):
    """
    Run when the agent terminates.

    Args:
        reward (float): the reward the agent received for entering the
            terminal state.
    """
    features = tile3.tiles(self.iht, self.num_tiling,
                           self.s_prev * self.reshape, [self.a_prev])
    # Terminal update: no bootstrap term, so delta = R - q(S, A).
    delta = reward - self.w[features].sum()
    self.replacing_trace[features] = 1
    self.w += self.alpha * delta * self.replacing_trace
def generateFeatures(self, observation, action):
    # Cache the active tiles for each (position, velocity, action) triple
    # so repeated queries avoid rehashing; callers read self.features.
    if (observation[0], observation[1], action) not in self.features:
        positionScale = 8 / (0.5 + 1.2)
        velocityScale = 8 / (0.07 + 0.07)
        self.features[observation[0], observation[1], action] = tiles(
            self.iht, NUM_TILINGS,
            [observation[0] * positionScale, observation[1] * velocityScale],
            [action])
def agent_step(self, reward, state):
    """
    A step taken by the agent.

    Args:
        reward (float): the reward received for taking the last action
        state (state observation): the agent's current state

    Returns:
        The action the agent is taking.
    """
    features = tile3.tiles(self.iht, self.num_tiling,
                           self.s_prev * self.reshape, [self.a_prev])
    self.replacing_trace[features] = 1
    action = self.get_action(state)
    new_features = tile3.tiles(self.iht, self.num_tiling,
                               state * self.reshape, [action])
    # Sarsa(lambda): delta = R + gamma * q(S', A') - q(S, A).
    delta = (reward - self.w[features].sum()
             + self.gamma * self.w[new_features].sum())
    self.w += self.alpha * delta * self.replacing_trace
    self.replacing_trace *= self.gamma * self.lam
    self.s_prev = state.copy()
    self.a_prev = action
    return action
def compute_for_3d_plot(self):
    # Evaluate the cost-to-go, -max_a q(s, a), on a 50x50 grid over the
    # position and velocity ranges of Mountain Car.
    steps = 50
    values = np.zeros((steps, steps))
    i_values = np.linspace(-1.2, 0.5, steps)
    j_values = np.linspace(-0.07, 0.07, steps)
    for i in range(steps):
        for j in range(steps):
            values[i, j] = -max([
                self.w[tile3.tiles(
                    self.iht, self.num_tiling,
                    np.array([i_values[i], j_values[j]]) * self.reshape,
                    [a])].sum()
                for a in self.actions
            ])
    return [i_values, j_values, values]
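The triple returned above maps directly onto a matplotlib surface plot. A minimal sketch of that consumer (the function name and axis labels are assumptions):

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (enables 3d projection)

def plot_cost_to_go(i_values, j_values, values):
    # Rows of `values` index position, columns index velocity.
    X, Y = np.meshgrid(i_values, j_values, indexing='ij')
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(X, Y, values)
    ax.set_xlabel('position')
    ax.set_ylabel('velocity')
    ax.set_zlabel('cost to go')
    plt.show()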
def plot3DGraph(self):
    step_size = 50
    f = open('plotValues.txt', 'w')
    for i in range(step_size):
        pos = -1.2 + (i * 1.7 / step_size)
        for j in range(step_size):
            vel = -0.07 + (j * 0.14 / step_size)
            scaled_state = self.getTileScale([pos, vel])
            values = []
            for a in range(3):
                inds = tiles(self.iht, 8, scaled_state, [a])
                # Summing the active weights is equivalent to the dot
                # product of a one-hot feature vector with self.weights.
                values.append(self.weights[inds].sum())
            height = max(values)
            # Write the cost-to-go, -max_a q(s, a).
            f.write(repr(-height) + " ")
        f.write("\n")
    f.close()
    plotGraph3D()
def mytiles(position, velocity, action=[]):
    # Scale so one tiling spans the state ranges (widths 1.7 and 0.14);
    # this variant stretches each dimension across 5 units rather than
    # the number of tilings.
    scale_P = 5 / 1.7
    scale_V = 5 / 0.14
    return tiles(iht, numTilings, [position * scale_P, velocity * scale_V],
                 action)
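This mytiles relies on module-level iht and numTilings. A minimal sketch of those definitions and a call (the hash-table size and sample state are assumptions):

from tiles3 import IHT, tiles  # tiles3.py from Sutton & Barto's site

numTilings = 8
iht = IHT(4096)  # assumed hash-table size

active = mytiles(-0.5, 0.0, [1])
print(active)    # numTilings tile indices for this (state, action)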
def choose(self, state):
    # Find the active tiles and the value of each of the three actions.
    all_tiles = [tiles(self.iht, 8,
                       [state[0] * self.pos_scale, state[1] * self.vel_scale],
                       [a]) for a in range(3)]
    q = np.array([self.w[t].sum() for t in all_tiles])
    # Break ties among equally valued greedy actions uniformly at random;
    # return the chosen action and its active tiles.
    idx = int(np.random.choice(np.flatnonzero(q == q.max())))
    return idx, np.array(all_tiles[idx])
def mytiles(x, y):
    # Scale so the 2*pi input range maps onto 20 units.
    scaleFactor = 20 / (2 * np.pi)
    return tiles(iht, numTilings, [x * scaleFactor, y * scaleFactor])
def mytiles(self, position, action):
    # Scale position (width 1.7) and velocity (width 0.14) so each tiling
    # spans numTilings units.
    scaled = [self.numTilings * position[0] / 1.7,
              self.numTilings * position[1] / 0.14]
    return tiles(self.iht, self.numTilings, scaled, [action])
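As a quick sanity check on any of these wrappers, nearby states should share most of their active tiles while distant states share few. A standalone sketch using tiles3 directly (the sizes and sample states are illustrative assumptions):

from tiles3 import IHT, tiles

iht = IHT(2048)
num_tilings = 8

def active(x, x_dot, a):
    return set(tiles(iht, num_tilings,
                     [num_tilings * (x + 1.2) / 1.7,
                      num_tilings * (x_dot + 0.07) / 0.14], [a]))

near = active(-0.50, 0.0, 1) & active(-0.49, 0.0, 1)
far = active(-0.50, 0.0, 1) & active(0.40, 0.05, 1)
print(len(near), len(far))  # many shared tiles vs. few or none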