    def get_reward(self, reward):
        """
        When a reward is received, update self.q_matrix,
        and then check for convergence. If training has not
        converged, run another iteration.
        """

        # The actual Q Learning algorithm
        self.q_matrix[self.current_state][self.current_action] = self.q_matrix[
            self.current_state][self.current_action] + self.alpha * (
                reward.reward +
                self.gamma * max(self.q_matrix[self.next_state]) -
                self.q_matrix[self.current_state][self.current_action])
        self.previous_matrices.append(copy.deepcopy(self.q_matrix))
        self.current_state = self.next_state

        # Publish the matrix for grading
        matrix_msg = QMatrix()
        matrix_msg.q_matrix = [
            QMatrixRow([int(y) for y in x]) for x in self.q_matrix.tolist()
        ]
        self.matrix_publisher.publish(matrix_msg)

        # Check if the Q-matrix has converged; if not, train another iteration
        if not self.is_converged(reward.iteration_num):
            self.train_one_iteration()
        # If converged, stop training, save the Q-matrix to a file, and exit
        else:
            rospy.logfatal("converged")
            rospy.loginfo(self.q_matrix)
            self.save_q_matrix()
            sys.exit(0)
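For reference, the assignment above is the standard Q-learning update rule. Below is a minimal, standalone sketch of that rule; the matrix shape and the alpha/gamma values are illustrative assumptions, not taken from the example above.

# A minimal, standalone sketch of the Q-learning update used above.
# The 64x9 shape and the alpha/gamma values are assumed for illustration.
import numpy as np

q_matrix = np.zeros((64, 9))   # 64 states x 9 actions
alpha, gamma = 1.0, 0.8        # assumed learning rate and discount factor

def q_update(state, action, reward, next_state):
    """Apply one Q-learning update to q_matrix[state][action]."""
    old_value = q_matrix[state][action]
    best_next = max(q_matrix[next_state])
    q_matrix[state][action] = old_value + alpha * (reward + gamma * best_next - old_value)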
Example #2
    def init_q_matrix(self):
        """ Initializes the Q-matrix with all zeros. """
        m = QMatrix()
        m.q_matrix = []
        for _ in range(len(self.states)):
            row = QMatrixRow()
            row.q_matrix_row = [0] * len(self.actions)
            m.q_matrix.append(row)
        return m
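Several of the later examples instead keep the working copy of the Q-matrix as a numpy array and convert it to a QMatrix message only when publishing. A minimal sketch of that zero initialization, assuming the same 64 x 9 shape:

import numpy as np

num_states, num_actions = 64, 9  # assumed dimensions, matching the other examples
q_matrix_arr = np.zeros((num_states, num_actions))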
Example #3
    def publish_qmatrix(self):
        matrix = QMatrix()
        matrix.q_matrix = []
        for x in range(0, 64):
            row = self.q[x]
            matrixrow = QMatrixRow()
            matrixrow.q_matrix_row = row
            matrix.q_matrix.append(matrixrow)
        matrix.header = Header(stamp=rospy.Time.now())
        self.matrix_pub.publish(matrix)
Example #4
    def publish_q_matrix(self):
        """ Converts self.q_matrix (a numpy array)
            into the form of a QMatrix object
        """
        matrix = QMatrix()
        matrix.q_matrix = []
        for q_row in self.q_matrix:
            row = QMatrixRow()
            row.q_matrix_row = q_row.astype(np.int16).tolist()
            matrix.q_matrix.append(row)
        self.q_pub.publish(matrix)

        filename = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
        np.savetxt(filename, self.q_matrix, delimiter=" ")
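The matrix saved above with np.savetxt can be read back with np.loadtxt; a minimal sketch, assuming the same relative path and delimiter:

import os
import numpy as np

# Reload the Q-matrix saved by publish_q_matrix above
filename = os.path.dirname(__file__) + "/saved_matrix/q_matrix.txt"
q_matrix = np.loadtxt(filename, delimiter=" ")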
Example #5
    def publish_qmat(self):
        """
        Converts self.qmat into a QMatrix() object and 
        publishes it to /q_learning/q_matrix
        """
        qmat_to_pub = QMatrix()
        qmat_to_pub.header = Header(stamp=rospy.Time.now())
        rows = []
        for row in self.qmat:
            qmat_row = QMatrixRow()
            qmat_row.q_matrix_row = list(row)
            rows.append(qmat_row)

        qmat_to_pub.q_matrix = rows
        self.qmat_pub.publish(qmat_to_pub)
    def recieved_reward(self, data):
        
        # update new qmatrix
        self.qmatrix[self.curr_state][self.curr_action] = (
            self.qmatrix[self.curr_state][self.curr_action] +
            self.alpha * (data.reward +
                          self.gamma * max(self.qmatrix[self.next_state]) -
                          self.qmatrix[self.curr_state][self.curr_action]))
        
        # create a new qmatrix msg type and translate our qmatrix into it
        new_qm = QMatrix()
        temp_qm = []
        for row in self.qmatrix:
            temp_array = []
            for val in row:
                temp_array.append(int(val))
            temp_qm.append(QMatrixRow(temp_array))
        new_qm.q_matrix = temp_qm
        # publish qmatrix
        self.qmatrix_pub.publish(new_qm)

        # calculate difference between current and previous qmatrix
        # set variables to store total diff between matrices
        percent_delta = 0
        prev = 0
        curr = 0
        # iterate through prev and current qmatrices and calculate difference
        for i, row in enumerate(self.qmatrix):
            for j, val in enumerate(row):
                prev += int(self.previous_matrix[i][j])
                curr += int(val)
        # if there is a meaningful diff, modify our prev array
        if curr != 0:
            percent_delta = curr - prev
            if(data.reward > 0):
                self.prevs[self.index % 25] = percent_delta
                self.index += 1
                 
                # store this qmatrix in our previous_matrix variable
                self.previous_matrix = copy.deepcopy(self.qmatrix)

        # update our current state
        self.curr_state = self.next_state

        # calculate avg delta of the last 25 meaningful changes to decide whether matrix has converged or not
        avg_delta = np.mean(self.prevs)
        # if matrix hasn't converged, compute another iteration
        if (data.iteration_num < 300 or avg_delta > 1): 
            self.run()
        # save matrix once it has converged
        else:
            self.save_q_matrix()        
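The convergence test in the callback above keeps the last 25 "meaningful" deltas in a fixed-size buffer and stops once their average drops to 1 or below after a minimum number of iterations. A standalone sketch of that check; the buffer's initial fill is an assumption, since its initialization is not shown above.

import numpy as np

WINDOW = 25           # number of recent deltas to average, as above
MIN_ITERATIONS = 300  # minimum iterations before convergence is considered

prevs = np.full(WINDOW, np.inf)  # assumed initial fill so early averages stay large
index = 0

def record_delta(delta):
    """Store one meaningful Q-matrix delta in the rolling buffer."""
    global index
    prevs[index % WINDOW] = delta
    index += 1

def has_converged(iteration_num):
    """Converged once past the minimum iterations and the average delta is small."""
    return iteration_num >= MIN_ITERATIONS and np.mean(prevs) <= 1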
    def convert_send_qmatrix_msg(self):  # Converts numpy array to QMatrix msg
        self.q_matrix_msg = QMatrix()
        for i in range(len(self.q_matrix_arr)):
            row = QMatrixRow()
            row.q_matrix_row = self.q_matrix_arr[i].astype(int).tolist()
            self.q_matrix_msg.q_matrix.append(row)

        # Publish Q-matrix message to Q-matrix topic
        self.q_matrix_pub.publish(self.q_matrix_msg)
    def get_reward(self, data):
        """
        Calculate reward according to q learning algorithm. Check if q matrix has converged,
        and handle accordingly if so.
        """
        # Calculate the max reward for next state
        next_reward = max(self.q_matrix[self.next_state])

        # Calculate reward currently written in q matrix for our current state/action pair.
        current_reward = self.q_matrix[self.curr_state][self.curr_action]

        # Calculate a new reward using the q learning algorithm
        new_reward = current_reward + self.alpha * (
            data.reward + self.gamma * next_reward - current_reward)

        # If newly calculated reward is the same as what's already written in the q matrix,
        # we count one instance of convergence. If the convergence count reaches 1000 instances,
        # we consider that our q matrix has converged and we save it to CSV.
        if current_reward == new_reward:
            self.convergence_count += 1
            if self.convergence_count == self.convergence_max:
                self.save_q_matrix()
                return
        # If not converged, write the new reward in the q matrix and publish the updated q matrix.
        else:
            self.convergence_count = 0
            self.q_matrix[self.curr_state][self.curr_action] = new_reward

            qm = QMatrix()
            temp = []
            # Cast each row to plain ints so it fits the int16[] field of QMatrixRow
            for row in self.q_matrix:
                r = QMatrixRow()
                r.q_matrix_row = [int(v) for v in row]
                temp.append(r)

            qm.header = Header(stamp=rospy.Time.now())
            qm.q_matrix = temp
            self.matrix_pub.publish(qm)

        # Move to next state, and call another action to be taken.
        self.curr_state = self.next_state
        self.do_action()
    def __init__(self):
        self.initialized = False
        # initialize this node
        rospy.init_node('q_learning', disable_signals=True)

        # ROS publish to robot action so that phantom movement can occur
        self.robot_action_pub = rospy.Publisher("/q_learning/robot_action",
                                                RobotMoveDBToBlock,
                                                queue_size=10)

        # ROS publish to the Gazebo topic to set the locations of the models
        self.model_states_pub = rospy.Publisher("/gazebo/set_model_state",
                                                ModelState,
                                                queue_size=10)

        # ROS publish to Q Matrix to update Q-Learning Matrix
        self.q_matrix_pub = rospy.Publisher("/q_learning/q_matrix",
                                            QMatrix,
                                            queue_size=10)

        # ROS subscribe to rewards to receive the reward from the environment after each action taken
        rospy.Subscriber("/q_learning/reward", QLearningReward,
                         self.processReward)

        # initialize Q matrix, reward, minimum number of iterations to do before
        # checking convergence, current iteration, and convergence
        self.q_matrix = QMatrix()
        self.initialize_q_matrix()
        self.reward = None
        self.min_iterations = 150
        self.num_iterations_eps = 0
        self.current_iteration = 1
        self.num_rewards = 0
        self.num_actions_pub = 0
        self.converged = False
        self.past_iteration = -1
        self.num_iterations = 0

        self.final_actions = []

        # initialize current state, next state, and action taken
        self.current_state = 0
        self.next_state = -1
        self.action = -1

        # Create action matrix
        self.action_matrix = [[] for x in range(64)]
        for i in range(64):
            self.action_matrix[i] = [0 for x in range(64)]

        # Initialize action matrix
        self.initialize_action_matrix()

        self.initialized = True
    def __init__(self):
        # initialize node
        rospy.init_node('q_learning')

        # publish to /q_learning/q_matrix with message type q_learning_QMatrix each time Q matrix is updated
        self.q_matrix_pub = rospy.Publisher("/q_learning/q_matrix",
                                            QMatrix,
                                            queue_size=10)

        # publish to robot_action
        self.robot_action_pub = rospy.Publisher("/q_learning/robot_action",
                                                RobotMoveDBToBlock,
                                                queue_size=10)

        # subscribe to /q_learning_reward
        rospy.Subscriber("/q_learning/reward", QLearningReward,
                         self.get_reward)

        self.action = RobotMoveDBToBlock()

        # get all possible actions and states
        self.all_actions = all_actions()  # a list
        self.all_states = all_states()  # a list

        # create action matrix and condense
        self.action_matrix = create_action_matrix(self.all_states,
                                                  self.all_actions)
        self.possible_actions = get_possible_actions(
            self.action_matrix)  # array of numpy arrays

        self.reward = 0
        self.initialized = False

        #initialize q_matrix
        self.q_matrix = QMatrix()
        self.initialize_q_matrix()

        self.current_q_matrix = None
        self.converged = False

        self.initialized = True

        self.converge_q_matrix()
        self.converged = True

        self.action_seq = []
        self.get_action_sequence()
    def initialize_q_matrix(self):
        h = Header()
        h.stamp = rospy.Time.now()
        h.frame_id = "q_matrix"
        self.actual_q_matrix = QMatrix()
        self.actual_q_matrix.header = h
        # 64 different states = 64 rows, with 9 possible actions each (columns).
        # Build a separate QMatrixRow per state so the rows do not all alias
        # the same object (as [temp] * 64 would).
        self.actual_q_matrix.q_matrix = []
        for _ in range(64):
            row = QMatrixRow()
            row.q_matrix_row = [0] * 9
            self.actual_q_matrix.q_matrix.append(row)

        # This variable is used to determine whether the matrix has converged
        self.prev_q_matrix = copy.deepcopy(self.actual_q_matrix)

        # Publish the initial, empty q_matrix to the topic
        self.q_matrix_pub.publish(self.actual_q_matrix)
    def __init__(self):
        # Publish Q-matrix updates to Q-matrix topic
        self.q_matrix_pub = rospy.Publisher("/q_learning/QMatrix",
                                            QMatrix,
                                            queue_size=10)

        # Publish to /robot_action for phantom robot movement
        self.q_learning = QLearning()
        self.action_pub = rospy.Publisher("/q_learning/robot_action",
                                          RobotMoveDBToBlock,
                                          queue_size=10)

        # Subscribe to environment to receive reward updates
        self.reward = rospy.Subscriber("/q_learning/reward", QLearningReward,
                                       self.update_q_matrix)
        rospy.sleep(2)

        # Initialize Q-matrix
        self.q_matrix_msg = QMatrix()

        # For ease of use in training, we'll store the Q-matrix in a numpy array
        self.q_matrix_arr = np.zeros((64, 9))

        # Keep track of the 5 most recently updated Q-values
        self.q_history = []

        # Keeps track of current (starting) state, initialized to 0th state
        self.current_state = 0

        # Get new state and action from starting state
        new_state, action = self.get_random_action(self.current_state)

        # Current action is action from above (action from current state to new state)
        self.current_action = action

        # Keep track of new state
        self.new_state = new_state

        # Move the robot according to the current action
        # update_q_matrix() will be called as the subscriber callback
        self.move_DB(self.current_action)
    def __init__(self):

        # Once everything is set up this will be set to true
        self.initialized = False

        # Initialize this node
        rospy.init_node("q_learning")

        # Set up publishers
        self.q_matrix_pub = rospy.Publisher("/q_learning/q_matrix",
                                            QMatrix,
                                            queue_size=10)
        self.robot_action_pub = rospy.Publisher("/q_learning/robot_action",
                                                RobotMoveDBToBlock,
                                                queue_size=10)

        self.cnt = 0

        # Set up subscriber
        rospy.Subscriber("/q_learning/reward", QLearningReward,
                         self.reward_received)

        # Fetch pre-built action matrix. This is a 2d numpy array where row indexes
        # correspond to the starting state and column indexes are the next states.
        #
        # A value of -1 indicates that it is not possible to get to the next state
        # from the starting state. Values 0-8 correspond to what action is needed
        # to go to the next state.
        #
        # e.g. self.action_matrix[0][12] = 5
        self.action_matrix = np.loadtxt(path_prefix + "action_matrix.txt")

        # Fetch actions. These are the only 9 possible actions the system can take.
        # self.actions is an array of dictionaries where the row index corresponds
        # to the action number, and the value has the following form:
        # { dumbbell: "red", block: 1}
        colors = ["red", "green", "blue"]
        self.actions = np.loadtxt(path_prefix + "actions.txt")
        self.actions = list(
            map(lambda x: {
                "dumbbell": colors[int(x[0])],
                "block": int(x[1])
            }, self.actions))

        # Fetch states. There are 64 states. Each row index corresponds to the
        # state number, and the value is a list of 3 items indicating the positions
        # of the red, green, blue dumbbells respectively.
        # e.g. [[0, 0, 0], [1, 0 , 0], [2, 0, 0], ..., [3, 3, 3]]
        # e.g. [0, 1, 2] indicates that the red dumbbell is at the origin,
        # the green dumbbell is at block 1, and the blue dumbbell at block 2.
        # A value of 0 corresponds to the origin. 1/2/3 corresponds to the block number.
        # Note that not all states are possible to get to.
        self.states = np.loadtxt(path_prefix + "states.txt")
        self.states = list(
            map(lambda x: list(map(lambda y: int(y), x)), self.states))

        # Initialize current state and keep track of the next state
        self.curr_state = 0
        self.next_state = 0

        # Initialize current action index
        self.curr_action = 0

        # Initialize variables to define static status and keep track of how many
        #   iterations have the Q-matrix remained static
        self.epsilon = 1
        self.static_iter_threshold = 500
        self.static_tracker = 0

        # Initialize and publish Q-matrix
        self.q_matrix = QMatrix()
        self.initialize_q_matrix()
        self.q_matrix_pub.publish(self.q_matrix)

        # Now everything is initialized; sleep for 1 second to make sure setup is complete before acting
        self.initialized = True
        rospy.sleep(1)

        # Start with a random action
        self.select_random_action()
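select_random_action itself is not shown above, but the action-matrix layout described in the comments of this example is enough to sketch how a valid random action might be chosen. The function name below is hypothetical.

import numpy as np

def pick_random_action(action_matrix, curr_state):
    """Pick a uniformly random valid transition out of curr_state.
    Entries of -1 mark impossible transitions; other entries are action numbers."""
    valid_next_states = np.where(action_matrix[curr_state] != -1)[0]
    next_state = int(np.random.choice(valid_next_states))
    action = int(action_matrix[curr_state][next_state])
    return action, next_state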
Example #14
    def publish_qmatrix(self):
        matrix = QMatrix()
        matrix.header = Header(stamp=rospy.Time.now())
        matrix.q_matrix = self.q
        self.matrix_pub.publish(matrix)
Example #15
    def train_q_matrix(self):
        # initialize q_matrix
        self.q_matrix = [[0 for i in range(len(self.actions))]
                         for j in range(len(self.states))]
        self.last_checked_q_matrix = [[0 for i in range(len(self.actions))]
                                      for j in range(len(self.states))]

        current_state_num = 0
        self.reward_update = 0
        self.state_update = 0
        q_matrix_difference = 100
        iteration = 0
        while q_matrix_difference > 10:
            # training process
            # generate a random action

            allowed_actions = []
            for state in range(len(self.states)):
                action_num = self.action_matrix[current_state_num][state]
                if action_num >= 0:
                    allowed_actions.append((action_num, state))
            if len(allowed_actions) > 0:
                action_state_pair = allowed_actions[int(
                    np.random.randint(0, len(allowed_actions)))]
                chosen_action_num = int(action_state_pair[0])
                action_dict = self.actions[chosen_action_num]
                action_msg = RobotMoveDBToBlock()
                action_msg.robot_db = action_dict['dumbbell']
                action_msg.block_id = action_dict['block']
                self.action_pub.publish(action_msg)
                # wait until we get reward and update accordingly
                while self.reward_update == 0:
                    time.sleep(1)
                # reset check for reward callback
                self.reward_update = 0

                next_state_num = action_state_pair[1]
                best_q_value = 0
                for action_q_value in self.q_matrix[next_state_num]:
                    if action_q_value >= best_q_value:
                        best_q_value = action_q_value
                q_matrix_adjustment = self.reward + (
                    self.discount_factor * best_q_value
                ) - self.q_matrix[current_state_num][chosen_action_num]
                self.q_matrix[current_state_num][chosen_action_num] += int(
                    self.learning_rate * q_matrix_adjustment)

                # publish new q_matrix
                self.q_matrix_msg = QMatrix()
                for state_row in self.q_matrix:
                    q_row = QMatrixRow()
                    for q_value in state_row:
                        q_row.q_matrix_row.append(q_value)
                    self.q_matrix_msg.q_matrix.append(q_row)
                self.q_matrix_pub.publish(self.q_matrix_msg)
                current_state_num = next_state_num

                iteration += 1
                print("Training iteration:", iteration)
                if (iteration % (len(self.states) * len(self.actions))) == 0:
                    q_matrix_difference = 0
                    for s_index, state in enumerate(self.q_matrix):
                        for a_index, value in enumerate(state):
                            q_matrix_difference += abs(
                                value -
                                self.last_checked_q_matrix[s_index][a_index])
                    self.last_checked_q_matrix = copy.deepcopy(self.q_matrix)
                    print("New q_matrix_difference:", q_matrix_difference)
            else:  # no actions left
                current_state_num = 0
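Once training converges, the greedy policy is usually read off the Q-matrix by taking the argmax over actions in each state; routines like get_action_sequence in the examples above plausibly build on this, though their implementations are not shown. A minimal sketch:

import numpy as np

def extract_policy(q_matrix):
    """Greedy policy: for each state, the index of the highest-valued action."""
    return [int(np.argmax(row)) for row in np.asarray(q_matrix)]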