import numpy as np

# Requires the GridWorld environment and the BOARD_ROWS / BOARD_COLS constants defined elsewhere.


class Agent:
    def __init__(self):
        self.actions = ["up", "down", "left", "right"]
        self.num_actions = len(self.actions)
        self.grid_world = GridWorld()

        # initial state values
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # set initial value to 0

        # map each (row, col) state to a flat index
        self.state_indices = {}
        k = 0
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_indices[(i, j)] = k
                k += 1

        self.num_states = len(self.state_values)
        self.state_values_vec = np.zeros(self.num_states)
        self.rewards = np.zeros(self.num_states)
        self.state_transition_prob = np.zeros(
            (self.num_states, self.num_actions, self.num_states))
        # self.get_observation_by_random(20000)
        self.discount = 0.99

    def count_base_reward(self, state):
        """Estimate the reward of a state by counting its occurrences in the observation sequence."""
        count_s = [exp for exp in self.observation if exp[0] == state]
        count_s_r = [exp[2] for exp in self.observation if exp[0] == state]
        try:
            r = sum(count_s_r) / len(count_s)
        except ZeroDivisionError:
            r = 0
        return r

    def count_base_prob(self, state, action, nxtState):
        """Estimate p(nxtState | state, action) by counting transitions in the observation sequence."""
        # iterate over the first m - 1 experiences so that index i + 1 is always valid
        count_s_a = [exp for exp in self.observation[:-1]
                     if exp[0] == state and exp[1] == action]
        count_s_a_nxt = [(i, exp, self.observation[i + 1][0])
                         for i, exp in enumerate(self.observation[:-1])
                         if exp[0] == state and exp[1] == action
                         and self.observation[i + 1][0] == nxtState]
        try:
            p = len(count_s_a_nxt) / len(count_s_a)
        except ZeroDivisionError:
            p = 0
        return p

    def get_observation_by_random(self, num_of_samples):
        """Collect an observation sequence with a randomly moving agent."""
        observation = []
        # observe experiences (s1, a1, r1), (s2, a2, r2), ..., (sm, am, rm)
        for _ in range(num_of_samples):
            s_a_r = list()
            r_action = self.choose_random_action()
            s_a_r.append(self.grid_world.state)
            s_a_r.append(r_action)
            reward = self.grid_world.giveReward()
            s_a_r.append(reward)
            print("current position {} action {}".format(self.grid_world.state, r_action))
            # by taking the action, the agent reaches the next state
            self.grid_world = self.takeAction(r_action)
            observation.append(s_a_r)
        self.observation = observation
        self.make_transition_matrix()

    def make_transition_matrix(self):
        """Estimate the transition matrix p(s'|s,a) and the reward vector from the observations."""
        for state in self.state_values.keys():
            self.rewards[self.state_indices[state]] = self.count_base_reward(state)
            for action in self.actions:
                for nxtState in self.state_values.keys():
                    p = self.count_base_prob(state, action, nxtState)
                    self.state_transition_prob[
                        self.state_indices[state],
                        self.actions.index(action),
                        self.state_indices[nxtState]] += p

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld with the new position
        return GridWorld(state=position)

    def choose_random_action(self):
        return np.random.choice(self.actions)

    def update(self):
        # Compute the action values $Q(s,a) = r(s) + \gamma \sum_{s'} p(s'|s,a) V(s')$
        _action_values = np.repeat(self.rewards, self.num_actions).reshape(
            (self.num_states, self.num_actions)) + self.discount * np.sum(
            self.state_transition_prob * self.state_values_vec, axis=2)
        # Evaluate the deterministic policy $\pi(s) = \arg\max_a Q(s,a)$
        self.policy = np.argmax(_action_values, axis=1)
        # Compute the values $V(s) = \max_a Q(s,a)$
        values = np.max(_action_values, axis=1)
        # Compute the value difference $\|V_{k} - V_{k+1}\|$ to check convergence
        diff = np.linalg.norm(self.state_values_vec - values)
        # Update the current value estimate
        self.state_values_vec = values
        return diff, values

    def nxtPosition(self, state, action):
        if action == "up":
            nxtState = (state[0] - 1, state[1])
        elif action == "down":
            nxtState = (state[0] + 1, state[1])
        elif action == "left":
            nxtState = (state[0], state[1] - 1)
        else:
            nxtState = (state[0], state[1] + 1)
        # stay on the board and off the blocked cell (1, 1)
        if (nxtState[0] >= 0) and (nxtState[0] <= 2):
            if (nxtState[1] >= 0) and (nxtState[1] <= 3):
                if nxtState != (1, 1):
                    return nxtState
        return state

    # def giveReward(self, state):
    #     if state == WIN_STATE:
    #         return 1
    #     elif state == LOSE_STATE:
    #         return -1
    #     else:
    #         return 0

    def fit(self, max_iteration=1e3, tolerance=1e-3, verbose=False, logging=False):
        if logging:
            history = []
        # Value iteration loop
        for _iter in range(1, int(max_iteration) + 1):
            # Update the value estimate
            diff, values = self.update()
            if logging:
                history.append(diff)
            if verbose:
                print('Iteration: {0}\tValue difference: {1}'.format(_iter, diff))
            # Check the convergence
            if diff < tolerance:
                if verbose:
                    print('Converged at iteration {0}.'.format(_iter))
                break
        if logging:
            return diff, history, values, self.policy
        else:
            return diff
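To see how the pieces fit together, here is a minimal usage sketch (not part of the original code); it assumes the GridWorld environment and the BOARD_ROWS / BOARD_COLS constants defined earlier, and the number of random samples is just an illustrative choice.

agent = Agent()
agent.get_observation_by_random(5000)   # estimate p(s'|s,a) and r(s) from random moves
diff, history, values, policy = agent.fit(
    max_iteration=1000, tolerance=1e-3, verbose=True, logging=True)

# Translate the greedy policy indices back into action names per state
for state, idx in agent.state_indices.items():
    print(state, agent.actions[policy[idx]])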
class Agent:
    def __init__(self):
        self.states = []  # record the positions visited and the action taken at each
        self.actions = ["up", "down", "left", "right"]
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values: a dict of dicts, Q_values[state][action]
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0

    def chooseAction(self):
        # epsilon-greedy: explore with probability exp_rate, otherwise pick the
        # action with the highest Q value in the current state
        mx_nxt_reward = 0
        action = ""
        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                current_position = self.grid_world.state
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
            # print("current pos: {}, greedy action: {}".format(self.grid_world.state, action))
        return action

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld with the new position
        return GridWorld(state=position)

    def reset(self):
        self.states = []
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # at the end of a game, back-propagate the reward along the trace
            if self.grid_world.isEnd:
                reward = self.grid_world.giveReward()
                for a in self.actions:
                    self.Q_values[self.grid_world.state][a] = reward
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    current_q_value = self.Q_values[s[0]][s[1]]
                    reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    self.Q_values[s[0]][s[1]] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append (state, action) to the trace
                self.states.append([self.grid_world.state, action])
                print("current position {} action {}".format(self.grid_world.state, action))
                # by taking the action, the agent reaches the next state
                self.grid_world = self.takeAction(action)
                # mark whether the new state is terminal
                self.grid_world.isEndFunc()
                print("nxt state", self.grid_world.state)
                print("---------------------")
                self.isEnd = self.grid_world.isEnd
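A short, hypothetical training run for this agent could look like the following sketch; the number of rounds is arbitrary and GridWorld is again assumed from the environment code.

agent = Agent()
agent.play(rounds=50)   # each round plays one episode and back-propagates the end reward

# Inspect the learned Q values state by state
for state, action_values in agent.Q_values.items():
    best_action = max(action_values, key=action_values.get)
    print(state, action_values, "-> greedy:", best_action)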
class Agent:
    def __init__(self):
        self.states = []  # record the positions visited and the action taken at each
        self.actions = ["up", "down", "left", "right"]
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values: a dict of dicts, Q_values[state][action]
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0

        self.state_values_vec = np.zeros(len(self.Q_values))
        self.policy_list = list()

    def chooseAction(self):
        # epsilon-greedy: explore with probability exp_rate, otherwise pick the
        # action with the highest Q value in the current state
        mx_nxt_reward = -np.inf
        action = ""
        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                current_position = self.grid_world.state
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
            # print("current pos: {}, greedy action: {}".format(self.grid_world.state, action))
        return action

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld with the new position
        return GridWorld(state=position)

    def reset(self):
        self.states = []
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd

    def play(self, rounds=10):
        i = 0
        histories = list()
        data_sample = 0
        while i < rounds:
            # at the end of a game, back-propagate the Q values along the trace
            if self.grid_world.isEnd:
                reward = self.grid_world.giveReward()
                for a in self.actions:
                    self.Q_values[self.grid_world.state][a] = reward
                # print("Game End Reward", reward)
                # at the terminal state all actions were just set to the same reward,
                # so any action works as the starting "next" state-action pair
                next_state = (self.grid_world.state, 'right')
                for s in reversed(self.states):
                    next_q_value = self.Q_values[next_state[0]][next_state[1]]
                    current_q_value = self.Q_values[s[0]][s[1]]
                    # reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    # TD-style update: move Q(s, a) towards gamma * Q(s', a')
                    reward = current_q_value + self.lr * (self.decay_gamma * next_q_value - current_q_value)
                    self.Q_values[s[0]][s[1]] = round(reward, 3)
                    next_state = s
                self.policy_list.append(self.make_policy_table_from_q())
                self.reset()
                i += 1

                # track the convergence of the state values max_a Q(s, a)
                q_table = np.array(
                    [[q for q in action_values.values()] for action_values in self.Q_values.values()],
                    dtype=float)
                q_value_numpy = np.max(q_table, axis=1)
                diff = np.linalg.norm(self.state_values_vec - q_value_numpy)
                self.state_values_vec = q_value_numpy
                print("{0}".format(data_sample))
                value_scalar = np.linalg.norm(q_value_numpy)
                histories.append((value_scalar, diff, int(data_sample)))
                data_sample = 0
            else:
                action = self.chooseAction()
                # append (state, action) to the trace
                self.states.append([self.grid_world.state, action])
                # print("current position {} action {}".format(self.grid_world.state, action))
                # by taking the action, the agent reaches the next state
                self.grid_world = self.takeAction(action)
                # mark whether the new state is terminal
                self.grid_world.isEndFunc()
                self.isEnd = self.grid_world.isEnd
                data_sample += 1
        return histories, self.policy_list

    def make_policy_table_from_q(self):
        # greedy policy: index of the best action in each state, in Q_values order
        convert_list = list()
        for k, v in self.Q_values.items():
            convert_list.append(np.argmax(np.array([val for val in v.values()], dtype=float)))
        return convert_list
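As a rough illustration (not part of the original code), the histories returned by play can be used to watch the value estimates settle, and the last entry of policy_list gives the final greedy policy as action indices; the number of rounds is again arbitrary.

agent = Agent()
histories, policy_list = agent.play(rounds=100)

# Each history entry is (||max_a Q(s,a)||, change since the previous episode, steps in the episode)
for episode, (value_norm, diff, steps) in enumerate(histories, start=1):
    print("episode {:3d}  value norm {:.3f}  diff {:.3f}  steps {}".format(
        episode, value_norm, diff, steps))

# Final greedy policy, mapped back to action names in the state order of Q_values
final_policy = [agent.actions[a] for a in policy_list[-1]]
print(final_policy)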