def get_reward(self, reward):
        """
        When a reward is received, update self.q_matrix
        and then check for convergence. If training has not
        converged, run another iteration.
        """

        # The actual Q Learning algorithm
        self.q_matrix[self.current_state][self.current_action] = self.q_matrix[
            self.current_state][self.current_action] + self.alpha * (
                reward.reward +
                self.gamma * max(self.q_matrix[self.next_state]) -
                self.q_matrix[self.current_state][self.current_action])
        self.previous_matrices.append(copy.deepcopy(self.q_matrix))
        self.current_state = self.next_state

        # Publish the matrix for grading
        matrix_msg = QMatrix()
        matrix_msg.q_matrix = [
            QMatrixRow([int(y) for y in x]) for x in self.q_matrix.tolist()
        ]
        self.matrix_publisher.publish(matrix_msg)

        #check if q-matrix has converged

        #if not, train another iteration
        if not self.is_converged(reward.iteration_num):
            self.train_one_iteration()
        #if so, stop training, save q matrix to file, and exit
        else:
            rospy.logfatal("converged")
            rospy.loginfo(self.q_matrix)
            self.save_q_matrix()
            sys.exit(0)
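
The assignment above is the standard Q-learning update, Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max over a' of Q(s',a') - Q(s,a)). A minimal standalone sketch of the same rule, assuming the q matrix is a NumPy array (the alpha and gamma defaults here are placeholders):

import numpy as np

def q_update(q_matrix, state, action, next_state, reward, alpha=1.0, gamma=0.8):
    # One Q-learning step: move Q(s, a) toward reward + gamma * max_a' Q(s', a').
    old = q_matrix[state][action]
    q_matrix[state][action] = old + alpha * (
        reward + gamma * np.max(q_matrix[next_state]) - old)
    return q_matrix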
Example #2
    def init_q_matrix(self):
        """ Initializes the q matrix with all zeros. """
        m = QMatrix()
        m.q_matrix = []
        for _ in range(len(self.states)):
            row = QMatrixRow()
            row.q_matrix_row = [0] * len(self.actions)
            m.q_matrix.append(row)
        return m
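
Several of the other examples on this page keep the table as a plain NumPy array instead of a QMatrix message and only convert it when publishing. A minimal sketch of that alternative, with placeholder state/action counts standing in for len(self.states) and len(self.actions):

import numpy as np

num_states, num_actions = 64, 9  # placeholder sizes
q_matrix = np.zeros((num_states, num_actions), dtype=np.int16)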
Example #3
    def publish_qmatrix(self):
        matrix = QMatrix()
        matrix.q_matrix = []
        for x in range(0, 64):
            row = self.q[x]
            matrixrow = QMatrixRow()
            matrixrow.q_matrix_row = row
            matrix.q_matrix.append(matrixrow)
        matrix.header = Header(stamp=rospy.Time.now())
        self.matrix_pub.publish(matrix)
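
On the receiving side, a matrix published this way can be turned back into a 2-D array by unpacking each QMatrixRow. A minimal subscriber sketch, assuming the message package is importable as q_learning.msg and using the /q_learning/q_matrix topic named in a later example on this page:

import rospy
import numpy as np
from q_learning.msg import QMatrix  # package name is an assumption

def q_matrix_callback(msg):
    # Rebuild a 2-D array from the list of QMatrixRow messages.
    q = np.array([row.q_matrix_row for row in msg.q_matrix])
    rospy.loginfo("received q matrix with shape %s", str(q.shape))

rospy.init_node("q_matrix_listener")
rospy.Subscriber("/q_learning/q_matrix", QMatrix, q_matrix_callback)
rospy.spin()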
    def publish_q_matrix(self):
        """ Converts self.q_matrix (a numpy array)
            into the form of a QMatrix object
        """
        matrix = QMatrix()
        matrix.q_matrix = []
        for q_row in self.q_matrix:
            row = QMatrixRow()
            row.q_matrix_row = q_row.astype(np.int16).tolist()
            matrix.q_matrix.append(row)
        self.q_pub.publish(matrix)

        filename = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
        np.savetxt(filename, self.q_matrix, delimiter=" ")
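
Because the matrix is written out with np.savetxt above, it can be restored later with np.loadtxt and the same delimiter. A minimal sketch:

import os
import numpy as np

# Mirrors the save path used in the example above.
filename = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
q_matrix = np.loadtxt(filename, delimiter=" ")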
Example #5
    def publish_qmat(self):
        """
        Converts self.qmat into a QMatrix() object and 
        publishes it to /q_learning/q_matrix
        """
        qmat_to_pub = QMatrix()
        qmat_to_pub.header = Header(stamp=rospy.Time.now())
        rows = []
        for row in self.qmat:
            qmat_row = QMatrixRow()
            qmat_row.q_matrix_row = list(row)
            rows.append(qmat_row)

        qmat_to_pub.q_matrix = rows
        self.qmat_pub.publish(qmat_to_pub)
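
The row-by-row conversion above can also be written as a single list comprehension. A small sketch, assuming an integer-valued NumPy array and that QMatrix/QMatrixRow are importable as below (the package name is an assumption):

import numpy as np
from q_learning.msg import QMatrix, QMatrixRow  # package name is an assumption

q = np.zeros((64, 9), dtype=np.int16)  # placeholder q matrix
msg = QMatrix()
msg.q_matrix = [QMatrixRow(q_matrix_row=row.tolist()) for row in q]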
    def recieved_reward(self, data):
        
        # update the q matrix entry for the current state/action pair
        self.qmatrix[self.curr_state][self.curr_action] = self.qmatrix[
            self.curr_state][self.curr_action] + self.alpha * (
                data.reward +
                self.gamma * max(self.qmatrix[self.next_state]) -
                self.qmatrix[self.curr_state][self.curr_action])
        
        # create a new qmatrix ms type and translate our qmatrix into it
        new_qm = QMatrix()
        temp_qm = []
        for row in self.qmatrix:
            temp_array = []
            for val in row:
                temp_array.append(int(val))
            temp_qm.append(QMatrixRow(temp_array))
        new_qm.q_matrix = temp_qm
        # publish qmatrix
        self.qmatrix_pub.publish(new_qm)

        # calculate the difference between the current and previous qmatrix
        # by summing the entries of each and comparing the totals
        percent_delta = 0
        prev = 0
        curr = 0
        # iterate over every (state, action) entry of both qmatrices
        for (i, j), val in np.ndenumerate(self.qmatrix):
            prev += int(self.previous_matrix[i][j])
            curr += int(val)
        # if there is a meaningfull diff, modify our prev array
        if curr != 0:
            percent_delta = curr - prev
            if(data.reward > 0):
                self.prevs[self.index % 25] = percent_delta
                self.index += 1
                 
                # store this qmatrix in our previous_matrix variable
                self.previous_matrix = copy.deepcopy(self.qmatrix)

        # update our current state
        self.curr_state = self.next_state

        # calculate avg delta of the last 25 meaningful changes to decide whether matrix has converged or not
        avg_delta = np.mean(self.prevs)
        # if matrix hasn't converged, compute another iteration
        if (data.iteration_num < 300 or avg_delta > 1): 
            self.run()
        # save matrix once it has converged
        else:
            self.save_q_matrix()        
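
The stopping rule in the callback above (keep training while iteration_num < 300 or the average delta over the rolling window is above 1) can be restated as a small helper. A sketch, with the thresholds taken from the example:

import numpy as np

def has_converged(recent_deltas, iteration_num, min_iterations=300, delta_threshold=1.0):
    # Converged once enough iterations have run and the recent changes are small.
    return iteration_num >= min_iterations and np.mean(recent_deltas) <= delta_threshold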
    def get_reward(self, data):
        """
        Calculate reward according to q learning algorithm. Check if q matrix has converged,
        and handle accordingly if so.
        """
        # Calculate the max reward for next state
        next_reward = max(self.q_matrix[self.next_state])

        # Calculate reward currently written in q matrix for our current state/action pair.
        current_reward = self.q_matrix[self.curr_state][self.curr_action]

        # Calculate a new reward using the q learning algorithm
        new_reward = current_reward + self.alpha * (
            data.reward + self.gamma * next_reward - current_reward)

        # If the newly calculated reward is the same as what's already written in the q matrix,
        # we count one instance of convergence. Once the convergence count reaches
        # self.convergence_max instances, we consider the q matrix converged and save it to CSV.
        if current_reward == new_reward:
            self.convergence_count += 1
            if self.convergence_count == self.convergence_max:
                self.save_q_matrix()
                return
        # If not converged, write the new reward in the q matrix and publish the updated q matrix.
        else:
            self.convergence_count = 0
            self.q_matrix[self.curr_state][self.curr_action] = new_reward

            qm = QMatrix()
            temp = []
            # note: each row could potentially be appended more simply, e.g. temp.append(QMatrixRow(row))
            for row in self.q_matrix:
                r = QMatrixRow()
                r.q_matrix_row = row
                temp.append(r)

            qm.header = Header(stamp=rospy.Time.now())
            qm.q_matrix = temp
            self.matrix_pub.publish(qm)

        # Move to next state, and call another action to be taken.
        self.curr_state = self.next_state
        self.do_action()
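
save_q_matrix itself is not shown in this example. A minimal sketch of what a CSV save might look like, assuming a NumPy q matrix (the helper name and filename here are hypothetical):

import numpy as np

def save_q_matrix_to_csv(q_matrix, path="q_matrix.csv"):
    # Hypothetical helper; the example's actual save_q_matrix body is not shown.
    np.savetxt(path, q_matrix, delimiter=",")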
    def publish_qmatrix(self):
        matrix = QMatrix()
        matrix.header = Header(stamp=rospy.Time.now())
        matrix.q_matrix = self.q
        self.matrix_pub.publish(matrix)