def learn(self, grid, next_grid, act):
    """
    Performs the learning procedure. It is called after act() and
    sense() so you have access to the latest tuple (s, s', a, r).
    """
    print("learning")
    state = self.get_surrround_agent(grid)
    key_q_s_a = state + "_" + Action.toString(act)
    q_s_a = self.state_dict[key_q_s_a]

    # Reward shaping: penalise next states with a car in the cells ahead.
    self.constant = 0
    next_state = self.get_surrround_agent(next_grid)
    if next_state[3] == '1' or next_state[2] == '1' or next_state[1] == '1':
        self.constant = -1

    # Greedy action max_a Q(s', a) for the next state.
    max_act = self.get_max_action(next_state, self.state_dict)
    key_qnext_s_a = next_state + '_' + Action.toString(max_act)
    qnext_s_a = self.state_dict[key_qnext_s_a]

    # if max_act == Action.BREAK:
    #     self.current_reward -= 5
    if max_act == Action.ACCELERATE:  # and self.current_reward > 0:
        self.constant = 0.5

    # Q-learning update with the shaping constant added to the TD error.
    q_s_a = q_s_a + self.alpha * (self.current_reward +
                                  (self.gamma * qnext_s_a) - q_s_a +
                                  self.constant)
    self.state_dict[key_q_s_a] = q_s_a

    # Map the greedy action through the policy to get the action to execute.
    next_max_act = self.get_act_from_policy(max_act)
    print("Action - " + Action.toString(next_max_act) +
          " reward = " + str(self.current_reward))
    return next_max_act
def callback(self, learn, episode, iteration):
    """
    Called at the end of each timestep for reporting/debugging purposes.
    """
    self.episode = episode
    if not self.choose_random:
        print("{0}/{1}: {2} Action: {3} (Exploiting!)".format(
            episode, iteration, self.total_reward,
            Action.toString(self.action)))
    else:
        print("{0}/{1}: {2} Action: {3} (Exploring!)".format(
            episode, iteration, self.total_reward,
            Action.toString(self.action)))

    if iteration >= 6500:
        self.ep_rewards[episode - 1] = self.total_reward
        # Snapshot the weights so the per-episode convergence rate can be
        # computed against the previous episode's snapshot.
        self.last_thetas = self.current_thetas + 0.
        self.current_thetas = self.thetas + 0.
        print('theta sums: ' + str(np.sum(self.current_thetas)) + ' ' +
              str(np.sum(self.last_thetas)))
        if episode > 1:
            self.theta_conv[episode - 2] = self.converge_rate(
                self.current_thetas, self.last_thetas)
            print(self.theta_conv[episode - 2])
        print(self.thetas)

        if episode == self.episodes:
            print("Rewards Mean: {0:.2f}, Reward Variance: {1:.2f}".format(
                np.mean(self.ep_rewards), np.var(self.ep_rewards)))
            if plotting:
                fig1 = plt.figure()
                plt.plot(np.arange(self.episodes) + 1, self.ep_rewards)
                plt.xlabel('episode')
                plt.ylabel('reward')
                plt.show()
                fig1.savefig('rewards.pdf', format='pdf')

                fig2 = plt.figure()
                plt.bar(np.arange(len(self.thetas)), self.thetas, width=1)
                plt.xlabel('index')
                plt.xticks(np.arange(len(self.thetas)))
                plt.ylabel('weight')
                plt.show()
                fig2.savefig('thetas.pdf', format='pdf')

                fig3 = plt.figure()
                plt.plot(np.arange(4, len(self.theta_conv) + 4),
                         self.theta_conv)
                plt.xlabel('episode')
                plt.ylabel('rate')
                plt.show()
                fig3.savefig('theta_conv.pdf', format='pdf')

            np.save('rewards', self.ep_rewards)
            np.save('thetas', self.thetas)
            np.save('theta_conv', self.theta_conv)
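# The callbacks above and below call a converge_rate(current, last) helper
# that is not shown in this section. The standalone sketch below is only an
# illustration of one plausible definition (mean absolute change between two
# consecutive parameter snapshots); the name converge_rate_sketch and the
# exact measure are assumptions, not the agent's actual implementation.
import numpy as np

def converge_rate_sketch(current, last):
    """Mean absolute difference between two equal-length parameter vectors."""
    current = np.asarray(current, dtype=float)
    last = np.asarray(last, dtype=float)
    return float(np.mean(np.abs(current - last)))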
def callback(self, learn, episode, iteration):
    """
    Called at the end of each timestep for reporting/debugging purposes.
    """
    print("{0}/{1}: {2}".format(episode, iteration, self.total_reward))
    if not self.choose_random:
        print("{0}/{1}: {2} Action: {3} (Exploiting!)".format(
            episode, iteration, self.total_reward,
            Action.toString(self.action)))
    else:
        print("{0}/{1}: {2} Action: {3} (Exploring!)".format(
            episode, iteration, self.total_reward,
            Action.toString(self.action)))

    if iteration >= 6500:
        self.ep_rewards[episode - 1] = self.total_reward
        # Snapshot the flattened Q table so the per-episode convergence rate
        # can be computed against the previous episode's snapshot.
        self.last_Q = self.current_Q + 0.
        self.current_Q = self.Q.flatten()
        if episode > 1:
            self.Q_conv[episode - 2] = self.converge_rate(
                self.current_Q, self.last_Q)
            print(self.Q_conv[episode - 2])

        if episode == self.episodes:
            print("Rewards Mean: {0:.2f}, Reward Variance: {1:.2f}".format(
                np.mean(self.ep_rewards), np.var(self.ep_rewards)))
            if plotting:
                fig1 = plt.figure()
                plt.plot(np.arange(self.episodes) + 1, self.ep_rewards)
                plt.xlabel('episode')
                plt.ylabel('reward')
                plt.show()
                fig1.savefig('rewards.pdf', format='pdf')

                fig2 = plt.figure()
                plt.bar(np.arange(len(self.Q)), self.Q, width=1)
                plt.xlabel('index')
                plt.xticks(np.arange(len(self.Q)))
                plt.ylabel('Q value')
                plt.show()
                fig2.savefig('qs.pdf', format='pdf')

                fig3 = plt.figure()
                plt.plot(np.arange(2, len(self.Q_conv) + 2), self.Q_conv)
                plt.xlabel('episode')
                plt.ylabel('mean rate')
                plt.show()
                fig3.savefig('Q_conv.pdf', format='pdf')

            np.save('rewards', self.ep_rewards)
            np.save('Qs', self.current_Q)
            np.save('Q_conv', self.Q_conv)
def get_max_action(self, state, val_dict):
    """Return the greedy action arg max_a Q(state, a) from val_dict."""
    max_pol_s_a = -1000
    max_a = None
    for act in self.getActionsSet():
        key_s_a = state + '_' + Action.toString(act)
        pol_s_a = val_dict[key_s_a]
        if pol_s_a > max_pol_s_a:
            max_pol_s_a = pol_s_a
            max_a = act
    return max_a
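# get_max_action() only returns the greedy action. The "Exploring!" /
# "Exploiting!" messages in the callbacks suggest the agent mixes this greedy
# choice with random exploration elsewhere (e.g. in get_act_from_policy). The
# standalone helper below is a generic epsilon-greedy sketch of that pattern;
# the name epsilon_greedy_sketch, the epsilon parameter and the signature are
# assumptions, not the agent's actual implementation.
import random

def epsilon_greedy_sketch(greedy_action, action_set, epsilon=0.1):
    """With probability epsilon pick a random action, otherwise the greedy one."""
    if random.random() < epsilon:
        return random.choice(list(action_set))
    return greedy_action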
def init_Q(self):
    """Initialise the Q table and the policy uniformly."""
    _l = [0 for i in range(14)]
    q_grid = []
    self.state_Q(q_grid, _l, 0)  # enumerate the surrounding-cell states into q_grid
    for grid in q_grid:
        for act in self.getActionsSet():
            key = ''.join(grid) + '_' + Action.toString(act)
            # Uniform initial Q value across states and a uniform initial
            # policy over the action set.
            self.state_dict[key] = 1.0 / len(q_grid)
            self.policy_s_a[key] = 1.0 / len(self.getActionsSet())
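# init_Q() depends on state_Q(), which is not shown in this section. From its
# use (a 14-element list, an output list and a start index, with the results
# joined into key strings), it appears to enumerate every binary occupancy
# pattern of the 14 surrounding cells. The recursive standalone sketch below
# shows one way such an enumeration could look; it is an assumption, not the
# agent's actual helper.
def state_Q_sketch(q_grid, cells, index):
    """Recursively append every 0/1 assignment of `cells` to q_grid as string lists."""
    if index == len(cells):
        q_grid.append([str(c) for c in cells])
        return
    for value in (0, 1):
        cells[index] = value
        state_Q_sketch(q_grid, cells, index + 1)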
def learn(self, grid, next_grid, act):
    """
    Performs the learning procedure. It is called after act() and
    sense() so you have access to the latest tuple (s, s', a, r).
    """
    print("learning")
    state = self.get_surrround_agent(grid)
    key_q_s_a = state + "_" + Action.toString(act)
    q_s_a = self.state_dict[key_q_s_a]

    next_state = self.get_surrround_agent(next_grid)
    # Greedy action max_a Q(s', a) for the next state.
    max_act = self.get_max_action(next_state, self.state_dict)
    key_qnext_s_a = next_state + '_' + Action.toString(max_act)
    qnext_s_a = self.state_dict[key_qnext_s_a]

    # Standard Q-learning update.
    q_s_a = q_s_a + self.alpha * (self.current_reward +
                                  (self.gamma * qnext_s_a) - q_s_a)
    self.state_dict[key_q_s_a] = q_s_a

    # Map the greedy action through the policy to get the action to execute.
    next_max_act = self.get_act_from_policy(max_act)
    return next_max_act
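# Both learn() methods above implement the tabular Q-learning update
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)),
# with the first variant also adding a shaping constant inside the TD error.
# The standalone function below is a minimal restatement of that update on a
# plain dict keyed by "state_action" strings; the names q_table,
# key_s_a/key_next_s_a and the shaping argument are illustrative, not part of
# the agent.
def q_learning_update_sketch(q_table, key_s_a, key_next_s_a, reward,
                             alpha, gamma, shaping=0.0):
    """Apply one Q-learning step to q_table[key_s_a] and return the new value."""
    td_error = reward + gamma * q_table[key_next_s_a] - q_table[key_s_a] + shaping
    q_table[key_s_a] += alpha * td_error
    return q_table[key_s_a]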
def get_max_action(self, state, val_dict):
    """Greedy action selection with an extra bonus for the encouraged action."""
    max_pol_s_a = -1000
    max_a = None
    for act in self.getActionsSet():
        key_s_a = state + '_' + Action.toString(act)
        encourage_val = 0
        if self.encourage == act:
            # Add twice the action's current value as a bonus when it is the
            # encouraged action.
            encourage_val = val_dict[key_s_a] * 2
        pol_s_a = val_dict[key_s_a] + encourage_val
        if pol_s_a > max_pol_s_a:
            max_pol_s_a = pol_s_a
            max_a = act
    return max_a
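# Note: this variant differs from the earlier get_max_action() only in the
# self.encourage bias. When an action matches self.encourage, three times its
# value (value + 2 * value) enters the greedy comparison, which favours it
# only while its value is positive. How self.encourage is set is not shown in
# this section.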