import numpy as np

# GridWorld, BOARD_ROWS and BOARD_COLS are assumed to be provided by the accompanying
# grid-world environment implementation.


class Agent:
    def __init__(self):
        self.states = []  # record position and action taken at the position
        self.actions = ["up", "down", "left", "right"]
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0  # Q value is a dict of dict

    def chooseAction(self):
        # choose the action with the highest expected value (epsilon-greedy)
        mx_nxt_reward = -np.inf  # start below any attainable Q value so negative values can still be chosen
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            current_position = self.grid_world.state
            for a in self.actions:
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        # print("current pos: {}, greedy action: {}".format(self.grid_world.state, action))
        return action

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld
        return GridWorld(state=position)

    def reset(self):
        self.states = []
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # at the end of a game, back-propagate the reward through the visited trace
            if self.grid_world.isEnd:
                # back propagate
                reward = self.grid_world.giveReward()
                for a in self.actions:
                    self.Q_values[self.grid_world.state][a] = reward
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    current_q_value = self.Q_values[s[0]][s[1]]
                    reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    self.Q_values[s[0]][s[1]] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append (state, action) to the trace
                self.states.append([self.grid_world.state, action])
                print("current position {} action {}".format(self.grid_world.state, action))
                # by taking the action, the agent reaches the next state
                self.grid_world = self.takeAction(action)
                # mark whether the new state is terminal
                self.grid_world.isEndFunc()
                print("nxt state", self.grid_world.state)
                print("---------------------")
                self.isEnd = self.grid_world.isEnd
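# The backward update in play() can be seen in isolation on a hand-made episode trace.
# This is a minimal sketch with hypothetical states and a made-up terminal reward of 1.0;
# it does not depend on GridWorld and is not part of the original code. Each visited
# (state, action) pair is pulled toward decay_gamma times the value propagated from the
# step after it, scaled by the learning rate.
def _backpropagate_demo():
    lr, decay_gamma = 0.2, 0.9
    trace = [((0, 0), "right"), ((0, 1), "right"), ((0, 2), "down")]  # hypothetical episode
    Q = {s: {a: 0.0 for a in ["up", "down", "left", "right"]} for s, _ in trace}

    reward = 1.0  # assumed terminal reward
    for s, a in reversed(trace):
        current_q_value = Q[s][a]
        reward = current_q_value + lr * (decay_gamma * reward - current_q_value)
        Q[s][a] = round(reward, 3)
    # the taken actions end up at 0.18, 0.032 and 0.006 respectively:
    # the pair closest to the terminal state learns fastest
    return Q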
# Modified version of the agent above: the backward pass bootstraps from the successor
# pair's Q value instead of the propagated reward, and the agent records convergence
# diagnostics and the greedy policy after every episode.
class Agent:
    def __init__(self):
        self.states = []  # record position and action taken at the position
        self.actions = ["up", "down", "left", "right"]
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0  # Q value is a dict of dict

        # max Q value per state, kept to measure how much the value estimates change per episode
        self.state_values_vec = np.zeros(len(self.Q_values))
        # greedy policy (argmax action index per state) recorded after each episode
        self.policy_list = list()

    def chooseAction(self):
        # choose the action with the highest expected value (epsilon-greedy)
        mx_nxt_reward = -np.inf
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            current_position = self.grid_world.state
            for a in self.actions:
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
        # print("current pos: {}, greedy action: {}".format(self.grid_world.state, action))
        return action

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld
        return GridWorld(state=position)

    def reset(self):
        self.states = []
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd

    def play(self, rounds=10):
        i = 0
        histories = list()
        data_sample = 0  # number of steps taken in the current episode
        while i < rounds:
            # at the end of a game, back-propagate the reward through the visited trace
            if self.grid_world.isEnd:
                # back propagate
                reward = self.grid_world.giveReward()
                for a in self.actions:
                    self.Q_values[self.grid_world.state][a] = reward
                # print("Game End Reward", reward)

                # walk the trace backwards, updating each pair towards its successor's Q value;
                # at the terminal state every action holds the same reward, so 'right' is an
                # arbitrary placeholder for the first successor pair
                next_state = (self.grid_world.state, 'right')
                for s in reversed(self.states):
                    next_q_value = self.Q_values[next_state[0]][next_state[1]]
                    current_q_value = self.Q_values[s[0]][s[1]]
                    # first version above: decay_gamma * reward instead of decay_gamma * next_q_value
                    reward = current_q_value + self.lr * (self.decay_gamma * next_q_value - current_q_value)
                    self.Q_values[s[0]][s[1]] = round(reward, 3)
                    next_state = s

                self.policy_list.append(self.make_policy_table_from_q())
                self.reset()
                i += 1

                # convergence bookkeeping: the state value is the max Q value over actions
                q_table = np.array(
                    [list(action_values.values()) for action_values in self.Q_values.values()],
                    dtype=float,
                )
                q_value_numpy = np.max(q_table, axis=1)
                diff = np.linalg.norm(self.state_values_vec - q_value_numpy)
                self.state_values_vec = q_value_numpy
                # print("iter {0}".format(i))
                # print("diff {0}".format(diff))
                # print("qvalue {0}".format(np.linalg.norm(q_value_numpy)))
                print("{0}".format(data_sample))  # steps taken in the finished episode
                value_scalar = np.linalg.norm(q_value_numpy)
                histories.append((value_scalar, diff, int(data_sample)))
                data_sample = 0
            else:
                action = self.chooseAction()
                # append (state, action) to the trace
                self.states.append([self.grid_world.state, action])
                # print("current position {} action {}".format(self.grid_world.state, action))
                # by taking the action, the agent reaches the next state
                self.grid_world = self.takeAction(action)
                # mark whether the new state is terminal
                self.grid_world.isEndFunc()
                # print("nxt state", self.grid_world.state)
                # print("---------------------{0}".format(i))
                self.isEnd = self.grid_world.isEnd
                data_sample += 1
        return histories, self.policy_list

    def make_policy_table_from_q(self):
        # greedy action index per state, in the order ["up", "down", "left", "right"]
        convert_list = list()
        for state, action_values in self.Q_values.items():
            convert_list.append(np.argmax(np.array(list(action_values.values()), dtype=float)))
        return convert_list
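# Minimal driver for the modified agent above. This is a sketch, not part of the
# original code: it assumes GridWorld, BOARD_ROWS and BOARD_COLS are available from
# the grid-world implementation, and the round count and prints are illustrative.
if __name__ == "__main__":
    agent = Agent()
    histories, policies = agent.play(rounds=100)

    # histories holds one (||V||, ||V - V_prev||, episode length) tuple per episode,
    # which can be inspected or plotted to check convergence.
    for value_norm, diff, steps in histories[-5:]:
        print("|V| = {:.3f}  delta = {:.3f}  steps = {}".format(value_norm, diff, steps))

    # the last recorded policy: one greedy action index per board cell, ordered row by
    # row, indexing into ["up", "down", "left", "right"]
    print(policies[-1])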