Example #1
    def learn(self, grid, next_grid, act):
        """ Performs the learning procedure. It is called after act() and
        sense() so you have access to the latest tuple (s, s', a, r).
        """
        print("learning")
        state = self.get_surrround_agent(grid)
        key_q_s_a = state + "_" + Action.toString(act)
        q_s_a = self.state_dict[key_q_s_a]

        # Reward shaping: penalise next states where any of the three cells
        # around the agent is occupied ('1').
        self.constant = 0
        next_state = self.get_surrround_agent(next_grid)
        if next_state[3] == '1' or next_state[2] == '1' or next_state[1] == '1':
            self.constant = -1
        # Find the greedy action a' = argmax_a Q(s', a) for the next state.
        max_act = self.get_max_action(next_state, self.state_dict)
        key_qnext_s_a = next_state + '_' + Action.toString(max_act)
        qnext_s_a = self.state_dict[key_qnext_s_a]
        # if max_act == Action.BREAK:
        #     self.current_reward -= 5
        if max_act == Action.ACCELERATE:  # and self.current_reward > 0:
            self.constant = 0.5

        # Tabular Q-learning update with the shaping term folded into the TD error.
        q_s_a = q_s_a + self.alpha * (self.current_reward +
                                      (self.gamma * qnext_s_a) - q_s_a +
                                      self.constant)
        self.state_dict[key_q_s_a] = q_s_a

        # The returned action comes from the policy, not necessarily the greedy one.
        next_max_act = self.get_act_from_policy(max_act)
        print("Action - " + Action.toString(next_max_act) + " reward = " +
              str(self.current_reward))
        return next_max_act

    def callback(self, learn, episode, iteration):
        """ Called at the end of each timestep for reporting/debugging purposes.
        """
        self.episode = episode
        if not self.choose_random:
            print("{0}/{1}: {2} Action: {3} (Exploiting!)".format(
                episode, iteration, self.total_reward,
                Action.toString(self.action)))
        else:
            print("{0}/{1}: {2} Action: {3} (Exploring!)".format(
                episode, iteration, self.total_reward,
                Action.toString(self.action)))
        if iteration >= 6500:
            self.ep_rewards[episode - 1] = self.total_reward
            # Snapshot the previous and current weight vectors
            # (adding 0. forces a copy of the numpy array).
            self.last_thetas = self.current_thetas + 0.
            self.current_thetas = self.thetas + 0.
            print('Here' + str(np.sum(self.current_thetas)) + '  ' +
                  str(np.sum(self.last_thetas)))
            if episode > 1:
                print('Here')
                self.theta_conv[episode - 2] = self.converge_rate(
                    self.current_thetas, self.last_thetas)
                print(self.theta_conv[episode - 2])
            print(self.thetas)
            if episode == self.episodes:
                print("Rewards Mean: {0:.2f}, Reward Variance: {1:.2f}".format(
                    np.mean(self.ep_rewards), np.var(self.ep_rewards)))
                if plotting:
                    fig1 = plt.figure()
                    plt.plot(np.arange(self.episodes) + 1, self.ep_rewards)
                    plt.xlabel('episode')
                    plt.ylabel('reward')
                    plt.show()
                    fig1.savefig('rewards.pdf', format='pdf')

                    fig2 = plt.figure()
                    plt.bar(np.arange(len(self.thetas)), self.thetas, width=1)
                    plt.xlabel('index')
                    plt.xticks(np.arange(len(self.thetas)))
                    plt.ylabel('weight')
                    plt.show()
                    fig2.savefig('thetas.pdf', format='pdf')

                    fig3 = plt.figure()
                    plt.plot(np.arange(4,
                                       len(self.theta_conv) + 4),
                             self.theta_conv)
                    plt.xlabel('episode')
                    plt.ylabel('rate')
                    plt.show()
                    fig3.savefig('theta_conv.pdf', format='pdf')

                np.save('rewards', self.ep_rewards)
                np.save('thetas', self.thetas)
                np.save('theta_conv', self.theta_conv)
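
The learn() method in Example #1 is a tabular Q-learning update with a hand-tuned shaping term folded into the TD error. A minimal standalone sketch of that update, assuming a plain dict keyed by "state_ACTION" strings (the function name and keys below are illustrative, not taken from the original class):

    def q_update(q_table, key_sa, key_next_sa, reward, alpha, gamma, shaping=0.0):
        # One tabular Q-learning step:
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a*) - Q(s,a) + shaping)
        td_error = reward + gamma * q_table[key_next_sa] - q_table[key_sa] + shaping
        q_table[key_sa] += alpha * td_error
        return q_table[key_sa]

    # Usage with made-up keys and parameters:
    q = {'0101_ACCELERATE': 0.2, '0011_ACCELERATE': 0.5}
    q_update(q, '0101_ACCELERATE', '0011_ACCELERATE', reward=1.0, alpha=0.1, gamma=0.9)
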
Example #3
    def callback(self, learn, episode, iteration):
        print("{0}/{1}: {2}".format(episode, iteration, self.total_reward))
        if not self.choose_random:
            print("{0}/{1}: {2} Action: {3} (Exploiting!)".format(
                episode, iteration, self.total_reward,
                Action.toString(self.action)))
        else:
            print("{0}/{1}: {2} Action: {3} (Exploring!)".format(
                episode, iteration, self.total_reward,
                Action.toString(self.action)))
        if iteration >= 6500:
            self.ep_rewards[episode - 1] = self.total_reward
            # Snapshot the previous and current Q tables (adding 0. forces a copy).
            self.last_Q = self.current_Q + 0.
            self.current_Q = self.Q.flatten()
            if episode > 1:
                self.Q_conv[episode - 2] = self.converge_rate(
                    self.current_Q, self.last_Q)
                print(self.Q_conv[episode - 2])
            if episode == self.episodes:
                print("Rewards Mean: {0:.2f}, Reward Variance: {1:.2f}".format(
                    np.mean(self.ep_rewards), np.var(self.ep_rewards)))
                if plotting:
                    fig1 = plt.figure()
                    plt.plot(np.arange(self.episodes) + 1, self.ep_rewards)
                    plt.xlabel('episode')
                    plt.ylabel('reward')
                    plt.show()
                    fig1.savefig('rewards.pdf', format='pdf')

                    fig2 = plt.figure()
                    plt.bar(np.arange(len(self.Q)), self.Q, width=1)
                    plt.xlabel('index')
                    plt.xticks(np.arange(len(self.Q)))
                    plt.ylabel('weight')
                    plt.show()
                    fig2.savefig('qs.pdf', format='pdf')

                    fig3 = plt.figure()
                    plt.plot(np.arange(2, len(self.Q_conv) + 2), self.Q_conv)
                    plt.xlabel('episode')
                    plt.ylabel('mean rate')
                    plt.show()
                    fig3.savefig('Q_conv.pdf', format='pdf')

                np.save('rewards', self.ep_rewards)
                np.save('Qs', self.current_Q)
                np.save('Q_conv', self.Q_conv)
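
Examples #1 and #3 both call a converge_rate helper that is not included in the excerpts. A plausible minimal version, shown only as an assumption about its intent (mean absolute change between two successive flattened snapshots):

    import numpy as np

    def converge_rate(current, last):
        # Hypothetical helper: the real converge_rate used above is not shown.
        # Measures the mean absolute change between successive parameter snapshots.
        current = np.asarray(current, dtype=float)
        last = np.asarray(last, dtype=float)
        return float(np.mean(np.abs(current - last)))
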
Example #4
    def get_max_action(self, state, val_dict):
        """Return the greedy action, i.e. the argmax over val_dict[state + '_' + action]."""
        max_pol_s_a = float('-inf')
        max_a = None
        for act in self.getActionsSet():
            key_s_a = state + '_' + Action.toString(act)
            pol_s_a = val_dict[key_s_a]
            if pol_s_a > max_pol_s_a:
                max_pol_s_a = pol_s_a
                max_a = act
        return max_a

    def init_Q(self):
        """Initialise the Q table: every (state, action) key gets the same starting value."""
        _l = [0 for i in range(14)]
        q_grid = []
        self.state_Q(q_grid, _l, 0)
        for grid in q_grid:
            for act in self.getActionsSet():
                key = ''.join(grid) + '_' + Action.toString(act)
                self.state_dict[key] = 1.0 / len(q_grid)
                # Uniform initial policy over the action set.
                self.policy_s_a[key] = 1.0 / len(self.getActionsSet())

    def learn(self, grid, next_grid, act):
        """ Performs the learning procedure. It is called after act() and
        sense() so you have access to the latest tuple (s, s', a, r).
        """
        print("learning")
        state = self.get_surrround_agent(grid)
        key_q_s_a = state + "_" + Action.toString(act)
        q_s_a = self.state_dict[key_q_s_a]

        next_state = self.get_surrround_agent(next_grid)
        # Find the greedy action a' = argmax_a Q(s', a) for the next state.
        max_act = self.get_max_action(next_state, self.state_dict)
        key_qnext_s_a = next_state + '_' + Action.toString(max_act)
        qnext_s_a = self.state_dict[key_qnext_s_a]
        # Standard tabular Q-learning update.
        q_s_a = q_s_a + self.alpha * (self.current_reward +
                                      (self.gamma * qnext_s_a) - q_s_a)
        self.state_dict[key_q_s_a] = q_s_a
        # The returned action comes from the policy, not necessarily the greedy one.
        next_max_act = self.get_act_from_policy(max_act)
        return next_max_act

    def get_max_action(self, state, val_dict):
        """Return the greedy action, weighting the currently encouraged action more heavily."""
        max_pol_s_a = float('-inf')
        max_a = None
        for act in self.getActionsSet():
            key_s_a = state + '_' + Action.toString(act)
            # The encouraged action gets a bonus of twice its current value,
            # so it is effectively weighted three times as heavily.
            encourage_val = 0
            if self.encourage == act:
                encourage_val = val_dict[key_s_a] * 2
            pol_s_a = val_dict[key_s_a] + encourage_val

            if pol_s_a > max_pol_s_a:
                max_pol_s_a = pol_s_a
                max_a = act
        return max_a
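
get_max_action only returns the greedy action; the action actually executed comes from get_act_from_policy, which is not shown in the excerpt. A common choice for that step is epsilon-greedy exploration, sketched below purely as an assumption (the epsilon_greedy name and signature are illustrative, not part of the original code):

    import random

    def epsilon_greedy(actions, greedy_action, epsilon=0.1):
        # Illustrative sketch only: with probability epsilon pick a random
        # action from the action set, otherwise keep the greedy one.
        if random.random() < epsilon:
            return random.choice(list(actions))
        return greedy_action
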