def reset(self):
    """Prepares the Agent for a new episode."""
    log_and_display('Initializing episode')
    self.env.environment_reset()
    self.current_state_id = self.env.get_current_state()
    self.total_explorations = 0
def execute_action(self, action_id):
    """Executes the given action against the environment and returns the result."""
    action = self.env.actions[action_id]
    if action[0] == self.env.action_type1:
        log_and_display('Action: Moving claw ' + str(action[1]))
        return self.env.move_arm(action[1], action_id)
    elif action[0] == self.env.action_type2:
        log_and_display('Action: Engaging/Disengaging claw ' + str(action[1]))
        return self.env.enable_grip(action[1], action_id)
def update_q_table(self, state, action, reward, state_new):
    """Routine to update the Q-table using the standard Q-learning update."""
    q_current = self.q_table[state, action]
    error = reward + self.discount * np.max(self.q_table[state_new]) - q_current
    self.q_table[state, action] = q_current + self.learn_rate * error
    msg = "Q-Value: S:{}, A:{}, R:{}, S`:{}, TE: {}, Q:{}, Q`:{}".format(
        state, action, reward, state_new, error, q_current, self.q_table[state, action])
    log_and_display(msg)
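# Illustrative sketch (not part of the agent): the update above is the standard
# tabular Q-learning rule
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)).
# The toy table and hyperparameter values below are assumptions chosen only to
# show the arithmetic on a standalone numpy array.
import numpy as np

q_table = np.zeros((3, 2))          # 3 states, 2 actions, all Q-values start at 0
alpha, gamma = 0.1, 0.9             # example learning rate and discount
s, a, r, s_new = 0, 1, 5.0, 2       # one observed transition

td_error = r + gamma * np.max(q_table[s_new]) - q_table[s, a]
q_table[s, a] += alpha * td_error   # Q(0, 1) becomes 0.5 after this update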
def select_action(self, current_state_id):
    """Returns an action for the current state: a mix of exploratory and
    exploitative choices depending on the epsilon value.
    """
    if np.random.uniform() < self.epsilon:
        log_and_display('Exploring...')
        self.total_explorations += 1
        action_id = np.random.choice(self.env.total_actions)
    else:
        log_and_display('Exploiting...')
        action_id = np.argmax(self.q_table[current_state_id])
    return action_id
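# Illustrative sketch (assumed names and values, not project code): with
# epsilon = 0.3 the policy above explores roughly 30% of the time and otherwise
# picks the greedy action. The final line mirrors how config.EPSILON_DECAY,
# which the training loop logs but this excerpt never applies, might be used at
# the end of each episode; that decay step is an assumption.
import numpy as np

epsilon, epsilon_decay = 0.3, 0.99
q_row = np.array([0.0, 1.5, 0.2])              # Q-values for one state, 3 actions

if np.random.uniform() < epsilon:
    action_id = np.random.choice(len(q_row))   # explore: any of the 3 actions
else:
    action_id = int(np.argmax(q_row))          # exploit: action 1 here

epsilon *= epsilon_decay                       # assumed per-episode decay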
def pre_populate_qtable(self):
    """Not used, but can be invoked to pre-populate the Q-table with known good values."""
    log_and_display("Pre-populating the Q-table with some known values - this helps it converge faster")
    for index, val in enumerate(self.q_table):
        if not self.env.states[index][3]:  # Object is not held
            # Encourage engaging the grip (action 0), discourage disengaging it (action 1)
            self.q_table[index][0] += config.REWARD_FIRST_SUCCESS
            self.q_table[index][1] -= config.REWARD_BAD_STEP
            if self.env.states[index][6] > 0.03:
                # Arm is above the grasp height: strongly discourage moves that stay above it
                for act in range(self.env.total_actions):
                    if self.env.actions[act][0] == self.env.action_type1 and self.env.actions[act][1][2] > 0.03:
                        self.q_table[index][act] = -100
        else:
            # Object is held: encourage disengaging the grip, discourage re-engaging it
            self.q_table[index][1] += config.REWARD_FIRST_SUCCESS
            self.q_table[index][0] -= config.REWARD_BAD_STEP
            if self.env.states[index][6] <= 0.07:
                # Arm is carrying the object low: strongly discourage moves that stay low
                for act in range(self.env.total_actions):
                    if self.env.actions[act][0] == self.env.action_type1 and self.env.actions[act][1][2] <= 0.07:
                        self.q_table[index][act] = -100
    log_and_display("Done")
def __get_canonical_state(self):
    """Fetches the positions of the arm and the object and the state of the gripper,
    finds the state id that those values correspond to, and returns it.
    """
    pos_obj = self.robot.get_position(self.robot.cylinder_handle)
    pos_arm = self.robot.get_position(self.robot.gripper_handle)
    object_held = self.robot.is_object_held()
    current_state_id = 0
    for state in self.states:
        if abs(state[0] - pos_obj[0]) < self.tolerance \
                and abs(state[1] - pos_obj[1]) < self.tolerance \
                and abs(state[2] - pos_obj[2]) < self.tolerance \
                and state[3] == object_held \
                and abs(state[4] - pos_arm[0]) < self.unit_step \
                and abs(state[5] - pos_arm[1]) < self.unit_step \
                and abs(state[6] - pos_arm[2]) < self.unit_step:
            return current_state_id
        current_state_id += 1
    log_and_display('State was invalid: ' + str(pos_obj) + str(object_held) + str(pos_arm))
    return self.invalid_states_index
if not os.path.exists(config.Q_TABLE_DIR):
    os.makedirs(config.Q_TABLE_DIR)

vrep_ip = '127.0.0.1'
vrep_port = 19997
env = Environment(vrep_ip, vrep_port)
agent = Agent(env, epsilon=config.EPSILON, q_init_val=config.Q_INIT_VAL,
              discount=config.DISCOUNT, learn_rate=config.LEARN_RATE)
episodes = config.NUM_EPISODES
agent.load_qtable()

log_and_display("%%%%%%%%%%%%%%%%%%%%%%%%% Main starts %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
log_and_display("Epsilon: " + str(config.EPSILON))
log_and_display("Epsilon Decay: " + str(config.EPSILON_DECAY))
log_and_display("Q Init: " + str(config.Q_INIT_VAL))
log_and_display("Discount: " + str(config.DISCOUNT))
log_and_display("Learning Rate: " + str(config.LEARN_RATE))
log_and_display("Max Episodes: " + str(config.NUM_EPISODES))
log_and_display("Max Actions/Episodes: " + str(config.NUM_MAX_ACTIONS))
log_and_display("Env Dimensions: " + str(config.ENV_DIMENSION))

if train_mode:
    # Here we train the agent and generate the Q-table.
    # The Q-table will be dumped in qtables/qtable.txt.npy
    while episodes > 0:
        log_and_display('=============================================> Episode ' + str(episodes))
        agent.reset()
def __init__(self, vrep_ip: str, vrep_port: int):
    """Prepares the actions, states and other environment variables."""
    self.robot = RobotArm(vrep_ip, vrep_port)
    self.tolerance = utility.rnd(config.TOLERANCE)
    self.unit_step = utility.rnd(config.UNIT_STEP_SIZE)
    dim = self.robot.get_env_dimensions()

    # Actions #########################################################
    # The actions the agent can take - either go to some x, y, z position
    # or engage/disengage the claw.
    x_range_actions = np.arange(dim[0][0], dim[0][1], self.unit_step)
    y_range_actions = np.arange(dim[1][0], dim[1][1], self.unit_step)
    z_range_actions = np.arange(dim[2][0], dim[2][1], self.unit_step)

    # Actions consist of
    # a) Gripper enable/disable
    # b) Goto location (x, y, z)
    self.action_type1 = 'move_gripper'
    self.action_type2 = 'engage_gripper'
    self.actions = []
    self.actions.append([self.action_type2, True])
    self.actions.append([self.action_type2, False])
    print(x_range_actions[1:-1])
    print(y_range_actions[1:-1])
    print(z_range_actions[1:-1])
    # Skip the first and last grid points so the arm never targets the very edge of the workspace
    for x in x_range_actions[1:-1]:
        for y in y_range_actions[1:-1]:
            for z in z_range_actions[1:-1]:
                self.actions.append([self.action_type1, [x, y, z]])
    self.total_actions = len(self.actions)

    # States ##########################################################
    # States consist of
    # a) Position of the object (x, y, z coordinates)
    # b) Whether it is held by the gripper or not
    # c) Position of the gripper (x, y, z coordinates)
    x_range = np.arange(dim[0][0], dim[0][1], self.tolerance)
    y_range = np.arange(dim[1][0], dim[1][1], self.tolerance)
    z_range = np.arange(dim[2][0], dim[2][1], self.tolerance)
    self.states = []
    self.invalid_state = config.INVALID_STATE
    for x in x_range:
        for y in y_range:
            for z in z_range:
                for b in [True, False]:
                    for xa in x_range_actions:
                        for ya in y_range_actions:
                            for za in z_range_actions:
                                self.states.append([x, y, z, b, xa, ya, za])
    # Invalid state, the last state. It indicates that the object is outside the environment.
    self.states.append(self.invalid_state)
    self.total_states = len(self.states)
    self.invalid_states_index = self.total_states - 1

    log_and_display("There are {0} actions.".format(self.total_actions))
    log_and_display("There are {0} states.".format(self.total_states))

    self.episode_object_gripped = False
    self.environment_breached = False
    self.is_success = False
    self.actionstate_prev = {}
    self.actionstate_curr = {}
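# Illustrative back-of-the-envelope check (assumed grid sizes, not project
# code): the loops above imply the counts below. The real numbers depend on
# config.ENV_DIMENSION, config.TOLERANCE and config.UNIT_STEP_SIZE; this only
# shows how quickly the tabular state space grows with the grid resolution.
n_action_grid = 5   # assumed points per axis for the action/gripper-position grid
n_state_grid = 5    # assumed points per axis for the object-position grid

total_actions = 2 + (n_action_grid - 2) ** 3                        # 2 gripper actions + 3*3*3 moves = 29
total_states = (n_state_grid ** 3) * 2 * (n_action_grid ** 3) + 1   # 125 * 2 * 125 + 1 = 31,251

print(total_actions, total_states)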
def calculate_reward(env):
    """Implements the reward strategy. Returns (reward, environment_breached, is_success)."""
    if not is_valid_state(env):
        log_and_display('Penalty: Reached invalid state, terminating')
        return config.REWARD_TERMINATION, True, False
    if not is_cylinder_standing(env):
        log_and_display('Penalty: Cylinder has fallen, terminating')
        return config.REWARD_TERMINATION, True, False
    if not is_bin_inplace(env):
        log_and_display('Penalty: Bin has shifted, terminating')
        return config.REWARD_TERMINATION, True, False
    if is_grip_engaged_with_no_object(env):
        log_and_display('Penalty: Claw is engaged but cylinder is not in claw')
        return config.REWARD_BAD_STEP, False, False
    if is_cylinder_not_dropped_in_bin(env):
        log_and_display('Penalty: Claw did not drop the cylinder in the bin')
        return config.REWARD_BAD_STEP, False, False
    if is_grip_holding_object(env):
        log_and_display('Reward: Claw grabbed the cylinder for the first time')
        return config.REWARD_FIRST_SUCCESS, False, False
    if is_object_in_bin(env):
        log_and_display('Reward: Cylinder in bucket. Objective achieved!')
        return config.REWARD_GOAL_ACHIEVED, True, True
    return config.REWARD_DEFAULT, False, False  # Default reward