def get_reward(self, reward):
    """ When a reward is received, update self.q_matrix, and then check for
    convergence. If training has not converged, run another iteration. """
    # The actual Q-learning update
    self.q_matrix[self.current_state][self.current_action] = self.q_matrix[
        self.current_state][self.current_action] + self.alpha * (
            reward.reward + self.gamma * max(self.q_matrix[self.next_state]) -
            self.q_matrix[self.current_state][self.current_action])
    self.previous_matrices.append(copy.deepcopy(self.q_matrix))
    self.current_state = self.next_state

    # Publish the matrix for grading
    matrix_msg = QMatrix()
    matrix_msg.q_matrix = [
        QMatrixRow([int(y) for y in x]) for x in self.q_matrix.tolist()
    ]
    self.matrix_publisher.publish(matrix_msg)

    # Check whether the Q-matrix has converged; if not, train another iteration
    if not self.is_converged(reward.iteration_num):
        self.train_one_iteration()
    # If it has, stop training, save the Q-matrix to file, and exit
    else:
        rospy.logfatal("converged")
        rospy.loginfo(self.q_matrix)
        self.save_q_matrix()
        sys.exit(0)
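# A minimal sketch (not from the source) of the is_converged() helper called
# above. It assumes self.previous_matrices holds the deep copies appended in
# get_reward(), that numpy is imported as np at module level (as in the other
# snippets), and that the window size and minimum iteration count are purely
# illustrative values.
def is_converged(self, iteration_num, window=100, min_iterations=1000):
    """ Treat training as converged once the Q-matrix has stopped changing
    over the last `window` iterations. """
    if iteration_num < min_iterations or len(self.previous_matrices) < window:
        return False
    recent = self.previous_matrices[-window:]
    return all(np.array_equal(recent[0], m) for m in recent[1:])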
def init_q_matrix(self):
    ''' Initializes the Q-matrix with all zeros. '''
    m = QMatrix()
    m.q_matrix = []
    for _ in range(len(self.states)):
        row = QMatrixRow()
        row.q_matrix_row = [0] * len(self.actions)
        m.q_matrix.append(row)
    return m
def publish_qmatrix(self):
    matrix = QMatrix()
    matrix.q_matrix = []
    for x in range(0, 64):
        row = self.q[x]
        matrixrow = QMatrixRow()
        matrixrow.q_matrix_row = row
        matrix.q_matrix.append(matrixrow)
    matrix.header = Header(stamp=rospy.Time.now())
    self.matrix_pub.publish(matrix)
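# Not part of the original snippets: a compact sketch of the array-to-message
# conversion that the publishers here repeat. The message package name in the
# import is an assumption; adjust it to your project. Message fields can be
# set through keyword arguments of the generated constructors.
import rospy
from std_msgs.msg import Header
from q_learning_project.msg import QMatrix, QMatrixRow  # assumed package name

def q_matrix_to_msg(q):
    """ Convert a 2-D array/list of Q-values into a QMatrix message. """
    msg = QMatrix(header=Header(stamp=rospy.Time.now()))
    msg.q_matrix = [QMatrixRow(q_matrix_row=[int(v) for v in row]) for row in q]
    return msg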
def publish_q_matrix(self):
    """ Converts self.q_matrix (a numpy array) into the form of a QMatrix
    object, publishes it, and saves the matrix to a text file. """
    matrix = QMatrix()
    matrix.q_matrix = []
    for q_row in self.q_matrix:
        row = QMatrixRow()
        row.q_matrix_row = q_row.astype(np.int16).tolist()
        matrix.q_matrix.append(row)
    self.q_pub.publish(matrix)
    filename = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
    np.savetxt(filename, self.q_matrix, delimiter=" ")
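# Illustrative counterpart (not in the source) for reading back the matrix
# written by publish_q_matrix() above; it assumes the same file location and
# the whitespace-delimited layout produced by np.savetxt.
def load_q_matrix(self):
    """ Load the saved Q-matrix back into a numpy array. """
    filename = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
    self.q_matrix = np.loadtxt(filename)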
def publish_qmat(self):
    """ Converts self.qmat into a QMatrix() object and publishes it to
    /q_learning/q_matrix """
    qmat_to_pub = QMatrix()
    qmat_to_pub.header = Header(stamp=rospy.Time.now())
    rows = []
    for row in self.qmat:
        qmat_row = QMatrixRow()
        qmat_row.q_matrix_row = list(row)
        rows.append(qmat_row)
    qmat_to_pub.q_matrix = rows
    self.qmat_pub.publish(qmat_to_pub)
def recieved_reward(self, data):
    # Update the Q-matrix with the Q-learning update rule
    self.qmatrix[self.curr_state][self.curr_action] = self.qmatrix[
        self.curr_state][self.curr_action] + self.alpha * (
            data.reward + self.gamma * max(self.qmatrix[self.next_state]) -
            self.qmatrix[self.curr_state][self.curr_action])

    # Create a new QMatrix msg and translate our Q-matrix into it
    new_qm = QMatrix()
    temp_qm = []
    for row in self.qmatrix:
        temp_array = []
        for val in row:
            temp_array.append(int(val))
        temp_qm.append(QMatrixRow(temp_array))
    new_qm.q_matrix = temp_qm

    # Publish the Q-matrix
    self.qmatrix_pub.publish(new_qm)

    # Calculate the difference between the current and previous Q-matrices.
    # Variables to store the totals of each matrix and their difference:
    percent_delta = 0
    prev = 0
    curr = 0
    # Iterate through the previous and current Q-matrices and sum their entries
    for i, row in enumerate(self.qmatrix):
        for j, val in enumerate(row):
            prev += int(self.previous_matrix[i][j])
            curr += int(val)

    # If there is a meaningful difference, record it in our history of deltas
    if curr != 0:
        percent_delta = curr - prev
        if data.reward > 0:
            self.prevs[self.index % 25] = percent_delta
            self.index += 1

    # Store this Q-matrix in our previous_matrix variable
    self.previous_matrix = copy.deepcopy(self.qmatrix)

    # Update our current state
    self.curr_state = self.next_state

    # The average delta of the last 25 meaningful changes decides whether the
    # matrix has converged
    avg_delta = np.mean(self.prevs)

    # If the matrix hasn't converged, compute another iteration
    if data.iteration_num < 300 or avg_delta > 1:
        self.run()
    # Save the matrix once it has converged
    else:
        self.save_q_matrix()
def get_reward(self, data):
    """ Calculate a reward according to the Q-learning algorithm. Check if the
    Q-matrix has converged, and handle accordingly if so. """
    # Calculate the max reward for the next state
    next_reward = max(self.q_matrix[self.next_state])
    # Reward currently written in the Q-matrix for our current state/action pair
    current_reward = self.q_matrix[self.curr_state][self.curr_action]
    # Calculate a new reward using the Q-learning algorithm
    new_reward = current_reward + self.alpha * (
        data.reward + self.gamma * next_reward - current_reward)

    # If the newly calculated reward is the same as what's already written in
    # the Q-matrix, we count one instance of convergence. If the convergence
    # count reaches 1000 instances, we consider the Q-matrix converged and
    # save it to CSV.
    if current_reward == new_reward:
        self.convergence_count += 1
        if self.convergence_count == self.convergence_max:
            self.save_q_matrix()
            return
    # If not converged, write the new reward in the Q-matrix and publish the
    # updated Q-matrix.
    else:
        self.convergence_count = 0
        self.q_matrix[self.curr_state][self.curr_action] = new_reward

        qm = QMatrix()
        temp = []
        # potentially can cast rows as QMatrixRow more simply: temp.append(row)
        for row in self.q_matrix:
            r = QMatrixRow()
            r.q_matrix_row = row
            temp.append(r)
        qm.header = Header(stamp=rospy.Time.now())
        qm.q_matrix = temp
        self.matrix_pub.publish(qm)

    # Move to the next state, and call another action to be taken.
    self.curr_state = self.next_state
    self.do_action()
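# The snippet above ends by calling self.save_q_matrix(), which is not shown.
# A minimal sketch, assuming the converged matrix should simply be dumped to a
# CSV file next to the script; the filename and location are illustrative, not
# from the source, and os/numpy are assumed to be imported at module level.
def save_q_matrix(self):
    """ Write the converged Q-matrix to a CSV file. """
    path = os.path.join(os.path.dirname(__file__), "q_matrix.csv")
    np.savetxt(path, np.array(self.q_matrix), delimiter=",")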
def publish_qmatrix(self):
    matrix = QMatrix()
    matrix.header = Header(stamp=rospy.Time.now())
    matrix.q_matrix = self.q
    self.matrix_pub.publish(matrix)