def compute_reward(self, action, obs):
    """Shaped reward for grasping the object and carrying it to the target.

    Args:
        action: (4,) array; action[-1] is the gripper effort.
        obs: observation; obs[3] is gripper openness, obs[4:7] the object xyz.

    Returns:
        Tuple of (reward, tcp_to_obj, grasp_success, obj_to_target,
        object_grasped, in_place).
    """
    obj = obs[4:7]
    gripper = self.tcp_center

    obj_to_target = np.linalg.norm(obj - self._target_pos)
    tcp_to_obj = np.linalg.norm(obj - gripper)
    # Margin for the in-place term: the initial object-to-target distance
    in_place_margin = np.linalg.norm(self.obj_init_pos - self._target_pos)

    threshold = 0.03
    # floor is a 3D funnel centered on the initial object pos
    radius = np.linalg.norm(gripper[:2] - self.obj_init_pos[:2])
    if radius <= threshold:
        floor = 0.0
    else:
        # Logarithmic funnel wall: rises steeply just outside `threshold`
        floor = 0.015 * np.log(radius - threshold) + 0.15
    # prevent the hand from running into cliff edge by staying above floor
    above_floor = 1.0 if gripper[2] >= floor else reward_utils.tolerance(
        max(floor - gripper[2], 0.0),
        bounds=(0.0, 0.01),
        margin=0.02,
        sigmoid='long_tail',
    )
    object_grasped = self._gripper_caging_reward(
        action,
        obj,
        object_reach_radius=0.01,
        obj_radius=0.015,
        pad_success_thresh=0.02,
        xz_thresh=0.03,
        desired_gripper_effort=0.1,
        high_density=True)
    in_place = reward_utils.tolerance(obj_to_target,
                                      bounds=(0, 0.02),
                                      margin=in_place_margin,
                                      sigmoid='long_tail')
    reward = reward_utils.hamacher_product(object_grasped, in_place)

    near_object = tcp_to_obj < 0.04
    # NOTE(review): 0.33 presumably marks a gripper pinched shut on nothing
    # — confirm against the scaling of obs[3]
    pinched_without_obj = obs[3] < 0.33
    lifted = obj[2] - 0.02 > self.obj_init_pos[2]
    # Increase reward when properly grabbed obj
    grasp_success = near_object and lifted and not pinched_without_obj
    if grasp_success:
        reward += 1. + 5. * reward_utils.hamacher_product(
            in_place, above_floor)
    # Maximize reward on success
    if obj_to_target < self.TARGET_RADIUS:
        reward = 10.
    return (
        reward,
        tcp_to_obj,
        grasp_success,
        obj_to_target,
        object_grasped,
        in_place,
    )
def compute_reward(self, action, obs):
    """Shaped pick-and-place reward with a bounded "approach corridor" in
    front of the target.

    Returns:
        [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
        in_place].
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    obj = obs[4:7]
    tcp_opened = obs[3]
    target = self._target_pos

    obj_to_target = np.linalg.norm(obj - target)
    tcp_to_obj = np.linalg.norm(obj - tcp)
    in_place_margin = np.linalg.norm(self.obj_init_pos - target)

    in_place = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, _TARGET_RADIUS),
        margin=in_place_margin,
        sigmoid='long_tail',
    )
    object_grasped = self._gripper_caging_reward(action=action,
                                                 obj_pos=obj,
                                                 obj_radius=0.02,
                                                 pad_success_thresh=0.05,
                                                 object_reach_radius=0.01,
                                                 xz_thresh=0.01,
                                                 high_density=False)
    reward = reward_utils.hamacher_product(object_grasped, in_place)

    # Inside the box spanning 3 target radii in front of the goal (and below
    # z=0.24), scale `in_place` down the lower and closer-to-goal the object
    # is — discouraging a low, frontal approach.
    if (0.0 < obj[2] < 0.24 and
            (target[0] - 0.15 < obj[0] < target[0] + 0.15) and
            ((target[1] - 3 * _TARGET_RADIUS) < obj[1] < target[1])):
        z_scaling = (0.24 - obj[2]) / 0.24
        y_scaling = (obj[1] - (target[1] - 3 * _TARGET_RADIUS)) / (3 * _TARGET_RADIUS)
        bound_loss = reward_utils.hamacher_product(y_scaling, z_scaling)
        in_place = np.clip(in_place - bound_loss, 0.0, 1.0)

    # Zero the in-place term entirely once the object overshoots the target
    # in y while still low
    if ((0.0 < obj[2] < 0.24) and
            (target[0] - 0.15 < obj[0] < target[0] + 0.15) and
            (obj[1] > target[1])):
        in_place = 0.0

    # Bonus once grasped and lifted off the start position
    if tcp_to_obj < 0.025 and (tcp_opened > 0) and \
            (obj[2] - 0.01 > self.obj_init_pos[2]):
        reward += 1. + 5. * in_place
    # Success overrides all shaping
    if obj_to_target < _TARGET_RADIUS:
        reward = 10.
    return [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
            in_place]
def _gripper_caging_reward(self, action, obj_position):
    """Caging reward in [0, 1]: how well the two gripper pads straddle the
    object in Y while the TCP lines up with it in the X-Z plane, blended
    with the commanded gripper-close effort.

    Args:
        action: (4,) array; action[-1] is the gripper effort.
        obj_position: (3,) object xyz position.

    Returns:
        Scalar in [0, 1] — average of the caging term and the
        caging-and-gripping term.
    """
    pad_success_margin = 0.05
    x_z_success_margin = 0.005
    obj_radius = 0.015

    tcp = self.tcp_center
    left_pad = self.get_body_com('leftpad')
    right_pad = self.get_body_com('rightpad')

    # Signed Y gap between each pad and the object (positive when the pad
    # is on its own side of the object)
    delta_object_y_left_pad = left_pad[1] - obj_position[1]
    delta_object_y_right_pad = obj_position[1] - right_pad[1]
    right_caging_margin = abs(abs(obj_position[1] - self.init_right_pad[1])
                              - pad_success_margin)
    left_caging_margin = abs(abs(obj_position[1] - self.init_left_pad[1])
                             - pad_success_margin)

    right_caging = reward_utils.tolerance(delta_object_y_right_pad,
                                          bounds=(obj_radius, pad_success_margin),
                                          margin=right_caging_margin,
                                          sigmoid='long_tail',)
    left_caging = reward_utils.tolerance(delta_object_y_left_pad,
                                         bounds=(obj_radius, pad_success_margin),
                                         margin=left_caging_margin,
                                         sigmoid='long_tail',)
    y_caging = reward_utils.hamacher_product(left_caging, right_caging)

    # compute the tcp_obj distance in the x_z plane (Y components zeroed out)
    tcp_xz = tcp + np.array([0., -tcp[1], 0.])
    obj_position_x_z = np.copy(obj_position) + np.array([0., -obj_position[1], 0.])
    tcp_obj_norm_x_z = np.linalg.norm(tcp_xz - obj_position_x_z, ord=2)

    # used for computing the tcp to object object margin in the x_z plane
    init_obj_x_z = self.obj_init_pos + np.array([0., -self.obj_init_pos[1], 0.])
    init_tcp_x_z = self.init_tcp + np.array([0., -self.init_tcp[1], 0.])
    tcp_obj_x_z_margin = np.linalg.norm(init_obj_x_z - init_tcp_x_z,
                                        ord=2) - x_z_success_margin

    x_z_caging = reward_utils.tolerance(tcp_obj_norm_x_z,
                                        bounds=(0, x_z_success_margin),
                                        margin=tcp_obj_x_z_margin,
                                        sigmoid='long_tail',)

    # Gripper effort only counts once the object is almost fully caged
    gripper_closed = min(max(0, action[-1]), 1)
    caging = reward_utils.hamacher_product(y_caging, x_z_caging)
    gripping = gripper_closed if caging > 0.97 else 0.
    caging_and_gripping = reward_utils.hamacher_product(caging, gripping)
    caging_and_gripping = (caging_and_gripping + caging) / 2
    return caging_and_gripping
def compute_reward(self, actions, obs):
    """Reward for closing the box: grab the lid, lift it, place it on the box.

    Returns (reward, reward_grab, reward_ready, reward_lifted, success).
    """
    grab = SawyerBoxCloseEnvV2._reward_grab_effort(actions)
    quat = SawyerBoxCloseEnvV2._reward_quat(obs)
    steps = SawyerBoxCloseEnvV2._reward_pos(obs, self._target_pos)

    # 2 parts "gripping while ready to lift", 8 parts "lid placed"
    reward = 2.0 * reward_utils.hamacher_product(grab, steps[0]) \
        + 8.0 * steps[1]

    # Override reward on success
    success = np.linalg.norm(obs[4:7] - self._target_pos) < 0.08
    if success:
        reward = 10.0

    # STRONG emphasis on proper lid orientation to prevent reward hacking
    # (otherwise agent learns to kick-flip the lid onto the box)
    reward *= quat

    return (reward, grab, *steps, success)
def compute_reward(self, action, obs):
    """Grasp-and-place reward: cage the object, lift it, move it to the goal.

    Returns [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
    in_place].
    """
    goal_radius = 0.05
    gripper = self.tcp_center
    obj_pos = obs[4:7]
    gripper_open = obs[3]
    goal = self._target_pos

    dist_obj_goal = np.linalg.norm(obj_pos - goal)
    dist_grip_obj = np.linalg.norm(obj_pos - gripper)

    # Placement progress, normalized by the starting distance
    in_place = reward_utils.tolerance(
        dist_obj_goal,
        bounds=(0, goal_radius),
        margin=np.linalg.norm(self.obj_init_pos - goal),
        sigmoid='long_tail',
    )
    grasp = self._gripper_caging_reward(action, obj_pos)
    reward = reward_utils.hamacher_product(grasp, in_place)

    # Bonus once the object is held and lifted off its start height
    holding = (dist_grip_obj < 0.02
               and gripper_open > 0
               and obj_pos[2] - 0.01 > self.obj_init_pos[2])
    if holding:
        reward += 1. + 5. * in_place
    if dist_obj_goal < goal_radius:
        reward = 10.

    return [reward, dist_grip_obj, gripper_open, dist_obj_goal, grasp,
            in_place]
def compute_reward(self, action, obs):
    """Planar push reward: the goal's z is replaced by the object's own z,
    so only x/y progress toward the target is scored.

    Returns [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
    in_place].
    """
    goal_radius = 0.05
    gripper = self.tcp_center
    obj_pos = obs[4:7]
    gripper_open = obs[3]

    # Project the target onto the object's current height
    goal = np.array([self._target_pos[0], self._target_pos[1], obj_pos[2]])
    dist_obj_goal = np.linalg.norm(obj_pos - goal)
    dist_grip_obj = np.linalg.norm(obj_pos - gripper)

    in_place = reward_utils.tolerance(
        dist_obj_goal,
        bounds=(0, goal_radius),
        margin=np.linalg.norm(self.obj_init_pos - goal),
        sigmoid='long_tail',
    )
    grasp = self._gripper_caging_reward(action, obj_pos, self.OBJ_RADIUS)
    combined = reward_utils.hamacher_product(grasp, in_place)

    # Weighted blend: some credit for caging alone, more for caging + placing
    reward = 2 * grasp + 6 * combined
    if dist_obj_goal < goal_radius:
        reward = 10.

    return [reward, dist_grip_obj, gripper_open, dist_obj_goal, grasp,
            in_place]
def compute_reward(self, action, obs):
    """Reward for moving the object toward the target while holding it with
    a partially-closed gripper.

    Returns (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
    in_place).
    """
    obj_pos = obs[4:7]
    gripper_open = obs[3]

    dist_grip_obj = np.linalg.norm(obj_pos - self.tcp_center)
    dist_goal_obj = np.linalg.norm(obj_pos - self._target_pos)
    dist_goal_obj_init = np.linalg.norm(self.obj_init_pos - self._target_pos)

    in_place = reward_utils.tolerance(
        dist_goal_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=dist_goal_obj_init,
        sigmoid='long_tail',
    )
    grasp = self._gripper_caging_reward(action, obj_pos, self.OBJ_RADIUS)
    reward = reward_utils.hamacher_product(grasp, in_place)

    # Bonus when the hand is on the object, the gripper is partially closed,
    # and measurable progress has been made from the start
    made_progress = dist_goal_obj_init - dist_goal_obj > 0.01
    if dist_grip_obj < 0.01 and 0 < gripper_open < 0.55 and made_progress:
        reward += 1. + 5. * in_place
    if dist_goal_obj < self.TARGET_RADIUS:
        reward = 10.

    return (reward, dist_grip_obj, gripper_open, dist_goal_obj, grasp,
            in_place)
def compute_reward(self, action, obs):
    """Top-down button press: approach with a closed gripper, then press.

    Returns (reward, tcp_to_obj, tcp_opened, obj_to_target, near_button,
    button_pressed).
    """
    del action  # reward is purely state-based
    button = obs[4:7]
    gripper = self.tcp_center

    dist = np.linalg.norm(button - gripper)
    dist_init = np.linalg.norm(button - self.init_tcp)
    press_depth = abs(self._target_pos[2] - button[2])
    closed = 1 - obs[3]

    near_button = reward_utils.tolerance(
        dist,
        bounds=(0, 0.01),
        margin=dist_init,
        sigmoid='long_tail',
    )
    button_pressed = reward_utils.tolerance(
        press_depth,
        bounds=(0, 0.005),
        margin=self._obj_to_target_init,
        sigmoid='long_tail',
    )

    # Approach term; press term only counts once the hand is at the button
    reward = 5 * reward_utils.hamacher_product(closed, near_button)
    if dist <= 0.03:
        reward += 5 * button_pressed

    return (reward, dist, obs[3], press_depth, near_button, button_pressed)
def compute_reward(self, action, obs):
    """Reward for pulling the lever up to vertical.

    Returns:
        Tuple of (reward, shoulder_to_lever_distance, ready_to_lift,
        lever_error, lever_engagement).
    """
    gripper = obs[:3]
    lever = obs[4:7]

    # De-emphasize y error so that we get Sawyer's shoulder underneath the
    # lever prior to bumping on against
    scale = np.array([4., 1., 4.])
    # Offset so that we get the Sawyer's shoulder underneath the lever,
    # rather than its fingers
    offset = np.array([.0, .055, .07])

    shoulder_to_lever = (gripper + offset - lever) * scale
    shoulder_to_lever_init = (
        self.init_tcp + offset - self._lever_pos_init
    ) * scale

    # This `ready_to_lift` reward should be a *hint* for the agent, not an
    # end in itself. Make sure to devalue it compared to the value of
    # actually lifting the lever
    ready_to_lift = reward_utils.tolerance(
        np.linalg.norm(shoulder_to_lever),
        bounds=(0, 0.02),
        margin=np.linalg.norm(shoulder_to_lever_init),
        sigmoid='long_tail',
    )

    # The skill of the agent should be measured by its ability to get the
    # lever to point straight upward. This means we'll be measuring the
    # current angle of the lever's joint, and comparing with 90deg.
    lever_angle = -self.data.get_joint_qpos('LeverAxis')
    lever_angle_desired = np.pi / 2.0

    lever_error = abs(lever_angle - lever_angle_desired)

    # We'll set the margin to 15deg from horizontal. Angles below that will
    # receive some reward to incentivize exploration, but we don't want to
    # reward accidents too much. Past 15deg is probably intentional movement
    lever_engagement = reward_utils.tolerance(
        lever_error,
        bounds=(0, np.pi / 48.0),
        margin=(np.pi / 2.0) - (np.pi / 12.0),
        sigmoid='long_tail'
    )

    target = self._target_pos
    obj_to_target = np.linalg.norm(lever - target)
    in_place_margin = (np.linalg.norm(self._lever_pos_init - target))

    in_place = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, 0.04),
        margin=in_place_margin,
        sigmoid='long_tail',
    )

    # reward = 2.0 * ready_to_lift + 8.0 * lever_engagement
    # NOTE(review): `lever_engagement` is returned for logging only; the
    # reward uses the position-based `in_place` term instead (the angle-based
    # alternative above is kept for reference).
    reward = 10.0 * reward_utils.hamacher_product(ready_to_lift, in_place)
    return (
        reward,
        np.linalg.norm(shoulder_to_lever),
        ready_to_lift,
        lever_error,
        lever_engagement
    )
def compute_reward(self, action, obs):
    """Two-stage pick-and-place reward: first carry the object to a fixed
    midpoint (to clear an obstacle), then on to the target.

    Returns:
        [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
        in_place_part2].
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    obj = obs[4:7]
    tcp_opened = obs[3]
    # Waypoint the object must pass before heading to the target
    midpoint = np.array([self._target_pos[0], 0.77, 0.25])
    target = self._target_pos

    tcp_to_obj = np.linalg.norm(obj - tcp)

    # Z error is weighted 3x on the way to the midpoint (encourages lifting)
    in_place_scaling = np.array([1., 1., 3.])
    obj_to_midpoint = np.linalg.norm((obj - midpoint) * in_place_scaling)
    obj_to_midpoint_init = np.linalg.norm(
        (self.obj_init_pos - midpoint) * in_place_scaling)

    obj_to_target = np.linalg.norm(obj - target)
    obj_to_target_init = np.linalg.norm(self.obj_init_pos - target)

    # Stage 1: progress toward the midpoint
    in_place_part1 = reward_utils.tolerance(
        obj_to_midpoint,
        bounds=(0, _TARGET_RADIUS),
        margin=obj_to_midpoint_init,
        sigmoid='long_tail',
    )
    # Stage 2: progress toward the final target
    in_place_part2 = reward_utils.tolerance(obj_to_target,
                                            bounds=(0, _TARGET_RADIUS),
                                            margin=obj_to_target_init,
                                            sigmoid='long_tail')
    object_grasped = self._gripper_caging_reward(action=action,
                                                 obj_pos=obj,
                                                 obj_radius=0.015,
                                                 pad_success_thresh=0.05,
                                                 object_reach_radius=0.01,
                                                 xz_thresh=0.005,
                                                 high_density=False)
    in_place_and_object_grasped = reward_utils.hamacher_product(
        object_grasped, in_place_part1)
    reward = in_place_and_object_grasped

    # Once grasped and lifted, shape toward the midpoint; once past the
    # midpoint in y, shape toward the target instead
    if tcp_to_obj < 0.02 and (tcp_opened > 0) and (obj[2] - 0.015 >
                                                   self.obj_init_pos[2]):
        reward = in_place_and_object_grasped + 1. + 4. * in_place_part1
        if obj[1] > 0.75:
            reward = in_place_and_object_grasped + 1. + 4. + 3. * in_place_part2

    if obj_to_target < _TARGET_RADIUS:
        reward = 10.

    return [
        reward,
        tcp_to_obj,
        tcp_opened,
        np.linalg.norm(obj - target),
        object_grasped,
        in_place_part2
    ]
def compute_reward(self, action, obs):
    """Reward for rotating the dial to the target angle.

    Returns:
        Tuple of (reward, tcp_to_obj, tcp_opened, target_to_obj,
        object_grasped, in_place).
    """
    obj = self._get_pos_objects()
    # Point on the dial rim the fingertips should push against.
    # NOTE(review): this local shadows the attribute
    # `self.dial_push_position`, which holds the *initial* push position
    # used for the margins below — confirm the shadowing is intended.
    dial_push_position = self._get_pos_objects() + np.array([0.05, 0.02, 0.09])
    tcp = self.tcp_center
    target = self._target_pos.copy()

    target_to_obj = (obj - target)
    target_to_obj = np.linalg.norm(target_to_obj)
    target_to_obj_init = (self.dial_push_position - target)
    target_to_obj_init = np.linalg.norm(target_to_obj_init)

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=abs(target_to_obj_init - self.TARGET_RADIUS),
        sigmoid='long_tail',
    )

    dial_reach_radius = 0.005
    tcp_to_obj = np.linalg.norm(dial_push_position - tcp)
    tcp_to_obj_init = np.linalg.norm(self.dial_push_position - self.init_tcp)
    reach = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, dial_reach_radius),
        margin=abs(tcp_to_obj_init - dial_reach_radius),
        sigmoid='gaussian',
    )
    # The reach term only counts when the gripper is commanded closed
    gripper_closed = min(max(0, action[-1]), 1)
    reach = reward_utils.hamacher_product(reach, gripper_closed)

    # No meaningful gripper-width signal for this task
    tcp_opened = 0
    object_grasped = reach

    reward = 10 * reward_utils.hamacher_product(reach, in_place)
    return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
            in_place)
def compute_reward(self, action, obs):
    """Reward for reaching the handle with a closed gripper and moving it
    to the target.

    Returns (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
    in_place).
    """
    handle = obs[4:7]
    gripper = self.tcp_center
    goal = self._target_pos.copy()

    dist_goal = np.linalg.norm(handle - goal)
    dist_goal_init = np.linalg.norm(self.obj_init_pos - goal)
    in_place = reward_utils.tolerance(
        dist_goal,
        bounds=(0, self.TARGET_RADIUS),
        margin=abs(dist_goal_init - self.TARGET_RADIUS),
        sigmoid='long_tail',
    )

    reach_radius = 0.005
    dist_handle = np.linalg.norm(handle - gripper)
    dist_handle_init = np.linalg.norm(self.obj_init_pos - self.init_tcp)
    reach = reward_utils.tolerance(
        dist_handle,
        bounds=(0, reach_radius),
        margin=abs(dist_handle_init - reach_radius),
        sigmoid='gaussian',
    )
    # Reaching only counts with the gripper commanded closed
    effort = min(max(0, action[-1]), 1)
    reach = reward_utils.hamacher_product(reach, effort)

    reward = reward_utils.hamacher_product(reach, in_place)
    # Saturate once the handle is (nearly) at the target, then scale to 10
    if dist_goal <= self.TARGET_RADIUS + 0.015:
        reward = 1.
    reward *= 10

    # tcp_opened has no meaning here; object_grasped doubles as the reach term
    return (reward, dist_handle, 0, dist_goal, reach, in_place)
def compute_reward(self, action, obs):
    """Push/place reward with x/y errors weighted double.

    Returns (reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
    in_place).
    """
    obj_pos = obs[4:7]
    goal = self._target_pos.copy()

    # Emphasize X and Y errors
    axis_weights = np.array([2., 2., 1.])
    weighted_err = np.linalg.norm((obj_pos - goal) * axis_weights)
    weighted_err_init = np.linalg.norm(
        (self.obj_init_pos - goal) * axis_weights)

    in_place = reward_utils.tolerance(
        weighted_err,
        bounds=(0, 0.05),
        margin=weighted_err_init,
        sigmoid='long_tail',
    )

    gripper_open = obs[3]
    dist_grip_obj = np.linalg.norm(obj_pos - self.tcp_center)
    grasp = self._gripper_caging_reward(
        action,
        obj_pos,
        object_reach_radius=0.04,
        obj_radius=0.02,
        pad_success_thresh=0.05,
        xz_thresh=0.05,
        desired_gripper_effort=0.7,
        medium_density=True
    )

    reward = reward_utils.hamacher_product(grasp, in_place)
    if dist_grip_obj < 0.04 and gripper_open > 0:
        reward += 1. + 5. * in_place
    if weighted_err < 0.05:
        reward = 10.

    return (
        reward,
        dist_grip_obj,
        gripper_open,
        np.linalg.norm(obj_pos - goal),  # unweighted distance for logging
        grasp,
        in_place
    )
def compute_reward(self, action, obs):
    """Basketball reward: grasp the ball and raise it to the hoop's height.

    Returns (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
    in_place).
    """
    ball = obs[4:7]
    # Only the vertical gap to the (slightly raised) target matters
    goal = self._target_pos.copy()
    z_err = abs(goal[2] - ball[2])
    z_err_init = abs(goal[2] - self.obj_init_pos[2])

    in_place = reward_utils.tolerance(
        z_err,
        bounds=(0, self.TARGET_RADIUS),
        margin=z_err_init,
        sigmoid='long_tail',
    )
    grasp = self._gripper_caging_reward(
        action,
        ball,
        pad_success_thresh=0.05,
        obj_radius=0.022,
        object_reach_radius=0.01,
        xz_thresh=0.01,
        high_density=True,
    )
    reward = reward_utils.hamacher_product(grasp, in_place)

    gripper_open = obs[3]
    dist_grip_ball = np.linalg.norm(ball - self.tcp_center)
    holding = (dist_grip_ball < 0.035
               and gripper_open > 0
               and ball[2] - 0.01 > self.obj_init_pos[2])
    if holding:
        reward += 1. + 5. * in_place
    if z_err < self.TARGET_RADIUS:
        reward = 10.

    return (reward, dist_grip_ball, gripper_open, z_err, grasp, in_place)
def compute_reward(self, actions, obs):
    """Door-opening reward: combine grab effort with the two positional
    sub-rewards (ready-to-open and opened).

    Returns (reward, reward_grab, reward_ready, reward_opened).
    """
    hinge_angle = self.data.get_joint_qpos('doorjoint')
    grab = SawyerDoorEnvV2._reward_grab_effort(actions)
    steps = SawyerDoorEnvV2._reward_pos(obs, hinge_angle)

    # 2 parts "in position while gripping", 8 parts "door opened"
    reward = 2.0 * reward_utils.hamacher_product(steps[0], grab) \
        + 8.0 * steps[1]

    # Override reward once the door's x position reaches the target
    if abs(obs[4] - self._target_pos[0]) <= 0.08:
        reward = 10.0

    return (reward, grab, *steps)
def _reward_pos(obs, target_pos):
    """Positional sub-rewards for box-close: (ready_to_lift, lifted)."""
    hand = obs[:3]
    handle = obs[4:7] + np.array([.0, .0, .02])

    # "Floor": a log-shaped 3D funnel centered on the lid's handle that the
    # hand must stay above until it is within `funnel_radius` of the handle
    funnel_radius = 0.02
    planar_dist = np.linalg.norm(hand[:2] - handle[:2])
    if planar_dist > funnel_radius:
        floor = 0.04 * np.log(planar_dist - funnel_radius) + 0.4
    else:
        floor = 0.0

    # Penalize dipping below the floor (prevents premature contact)
    if hand[2] >= floor:
        above_floor = 1.0
    else:
        above_floor = reward_utils.tolerance(
            floor - hand[2],
            bounds=(0.0, 0.01),
            margin=floor / 2.0,
            sigmoid='long_tail',
        )

    # Reward for actually grabbing the lid's handle
    grab_handle = reward_utils.tolerance(
        np.linalg.norm(hand - handle),
        bounds=(0, 0.02),
        margin=0.5,
        sigmoid='long_tail',
    )
    ready_to_lift = reward_utils.hamacher_product(above_floor, grab_handle)

    # Now actually put the lid on the box; Z error emphasized 3x
    err = (target_pos - handle) * np.array([1., 1., 3.])
    lift_weight = 0.2   # just *trying* to lift the lid at all
    place_weight = 0.8  # placing the lid on the box
    lifted = lift_weight * float(handle[2] > 0.04) \
        + place_weight * reward_utils.tolerance(
            np.linalg.norm(err),
            bounds=(0, 0.05),
            margin=0.25,
            sigmoid='long_tail',
        )

    return ready_to_lift, lifted
def _reward_pos(obs, theta):
    """Positional sub-rewards for door-open: (ready_to_open, opened)."""
    hand = obs[:3]
    handle = obs[4:7] + np.array([-0.05, 0, 0])

    # "Floor": a log-shaped 3D funnel centered on the door handle that the
    # hand must stay above until it is within `funnel_radius` of the handle
    funnel_radius = 0.12
    planar_dist = np.linalg.norm(hand[:2] - handle[:2])
    if planar_dist > funnel_radius:
        floor = 0.04 * np.log(planar_dist - funnel_radius) + 0.4
    else:
        floor = 0.0

    # Penalize dipping below the floor (prevents premature handle contact)
    if hand[2] >= floor:
        above_floor = 1.0
    else:
        above_floor = reward_utils.tolerance(
            floor - hand[2],
            bounds=(0.0, 0.01),
            margin=floor / 2.0,
            sigmoid='long_tail',
        )

    # Aim for a spot between the handle and the main door body
    grip_err = np.linalg.norm(hand - handle - np.array([0.05, 0.03, -0.01]))
    in_place = reward_utils.tolerance(
        grip_err,
        bounds=(0, funnel_radius / 2.0),
        margin=0.5,
        sigmoid='long_tail',
    )
    ready_to_open = reward_utils.hamacher_product(above_floor, in_place)

    # Opening progress measured on the hinge angle
    door_angle = -theta
    try_weight = 0.2   # just *trying* to open the door at all
    open_weight = 0.8  # fully opening the door
    opened = try_weight * float(theta < -np.pi / 90.) \
        + open_weight * reward_utils.tolerance(
            np.pi / 2. + np.pi / 6 - door_angle,
            bounds=(0, 0.5),
            margin=np.pi / 3.,
            sigmoid='long_tail',
        )

    return ready_to_open, opened
def compute_reward(self, action, obs):
    """Button-press reward with two phases: open-handed approach, then press.

    Returns (reward, tcp_to_obj, tcp_opened, obj_to_target, near_button,
    button_pressed).
    """
    del action  # reward is purely state-based
    button = obs[4:7]
    gripper = self.tcp_center

    dist = np.linalg.norm(button - gripper)
    dist_init = np.linalg.norm(button - self.init_tcp)
    press_depth = abs(self._target_pos[1] - button[1])

    near_button = reward_utils.tolerance(
        dist,
        bounds=(0, 0.01),
        margin=dist_init,
        sigmoid='long_tail',
    )
    button_pressed = reward_utils.tolerance(
        press_depth,
        bounds=(0, 0.005),
        margin=self._obj_to_target_init,
        sigmoid='long_tail',
    )

    if dist > 0.07:
        # Approach phase: reward closing in with an open gripper
        openness = (1 - obs[3]) / 2.0
        reward = 2 * reward_utils.hamacher_product(openness, near_button)
    else:
        # Press phase: flat approach credit + closing + quadratic press bonus
        reward = 2
        reward += 2 * (1 + obs[3])
        reward += 4 * button_pressed ** 2

    return (reward, dist, obs[3], press_depth, near_button, button_pressed)
def compute_reward(self, action, obs):
    """Grasp-and-move reward with an extra-strong in-place bonus (7x).

    Returns (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
    in_place).
    """
    obj_pos = obs[4:7]

    dist_goal = np.linalg.norm(obj_pos - self._target_pos)
    dist_goal_init = np.linalg.norm(self.obj_init_pos - self._target_pos)
    in_place = reward_utils.tolerance(
        dist_goal,
        bounds=(0, self.TARGET_RADIUS),
        margin=dist_goal_init,
        sigmoid='long_tail',
    )

    grasp = self._gripper_caging_reward(
        action,
        obj_pos,
        object_reach_radius=0.01,
        obj_radius=0.015,
        pad_success_thresh=0.05,
        xz_thresh=0.005,
        high_density=True
    )
    reward = reward_utils.hamacher_product(grasp, in_place)

    gripper_open = obs[3]
    dist_grip_obj = np.linalg.norm(obj_pos - self.tcp_center)
    if dist_grip_obj < 0.02 and gripper_open > 0:
        reward += 1. + 7. * in_place
    if dist_goal < self.TARGET_RADIUS:
        reward = 10.

    return (reward, dist_grip_obj, gripper_open, dist_goal, grasp, in_place)
def compute_reward(self, actions, obs):
    """Reward for reaching low toward the object and pushing it to the target.

    Fix: removed `in_place_and_object_grasped`, a Hamacher product that was
    computed but never used in the returned reward (dead code).

    Args:
        actions: unused by the shaping terms (kept for interface parity).
        obs: observation; obs[3] is gripper openness, obs[4:7] the object xyz.

    Returns:
        [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
        in_place].
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    obj = obs[4:7]
    tcp_opened = obs[3]
    target = self._target_pos

    obj_to_target = np.linalg.norm(obj - target)
    in_place_margin = np.linalg.norm(self.obj_init_pos - target)
    in_place = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, _TARGET_RADIUS),
        margin=in_place_margin - _TARGET_RADIUS,
        sigmoid='long_tail',
    )

    tcp_to_obj = np.linalg.norm(tcp - obj)
    obj_grasped_margin = np.linalg.norm(self.init_tcp - self.obj_init_pos)
    object_grasped = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, _TARGET_RADIUS),
        margin=obj_grasped_margin - _TARGET_RADIUS,
        sigmoid='long_tail',
    )

    # Default shaping: get the hand to the object
    reward = 1.5 * object_grasped
    # Hand low and close to the object: switch to in-place shaping
    if tcp[2] <= 0.03 and tcp_to_obj < 0.07:
        reward = 2 + (7 * in_place)
    if obj_to_target < _TARGET_RADIUS:
        reward = 10.
    return [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
            in_place]
def compute_reward(self, actions, obs):
    """Reward for reaching the handle and moving it vertically to the target.

    Fixes:
      * removed `objPos = obs[4:7]`, a dead local that was never read (and
        violated the file's snake_case naming);
      * the vertical errors are scalars, so they are measured with `abs`
        instead of `np.linalg.norm` (identical value, clearer intent).

    Args:
        actions: unused (reward is state-based only).
        obs: observation array (handle position is re-queried from the sim
            via `self._get_pos_objects()` rather than read from obs).

    Returns:
        Tuple of (reward, tcp_to_obj, tcp_opened, target_to_obj,
        object_grasped, in_place).
    """
    del actions
    obj = self._get_pos_objects()
    tcp = self.tcp_center
    target = self._target_pos.copy()

    # Only vertical progress counts toward the target
    target_to_obj = abs(obj[2] - target[2])
    target_to_obj_init = abs(self._handle_init_pos[2] - target[2])
    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=abs(target_to_obj_init - self.TARGET_RADIUS),
        sigmoid='long_tail',
    )

    handle_radius = 0.02
    tcp_to_obj = np.linalg.norm(obj - tcp)
    tcp_to_obj_init = np.linalg.norm(self._handle_init_pos - self.init_tcp)
    reach = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, handle_radius),
        margin=abs(tcp_to_obj_init - handle_radius),
        sigmoid='long_tail',
    )

    tcp_opened = 0        # gripper width is irrelevant for this task
    object_grasped = reach

    reward = reward_utils.hamacher_product(reach, in_place)
    # Saturate once within the target radius, then scale to a max of 10
    reward = 1 if target_to_obj <= self.TARGET_RADIUS else reward
    reward *= 10
    return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
            in_place)
def compute_reward(self, action, obs):
    """Lock-press reward: bring the left pad onto the lock, then push it down.

    Returns (reward, tcp_to_obj, tcp_opened, obj_to_target, near_lock,
    lock_pressed).
    """
    del action  # reward is purely state-based
    lock = obs[4:7]
    pad = self.get_body_com('leftpad')

    # Weighted distance: x error discounted, z half-weighted
    axis_weights = np.array([0.25, 1., 0.5])
    dist = np.linalg.norm((lock - pad) * axis_weights)
    dist_init = np.linalg.norm((lock - self.init_left_pad) * axis_weights)
    press_depth = abs(self._target_pos[2] - lock[2])
    openness = max(obs[3], 0.0)

    near_lock = reward_utils.tolerance(
        dist,
        bounds=(0, 0.01),
        margin=dist_init,
        sigmoid='long_tail',
    )
    lock_pressed = reward_utils.tolerance(
        press_depth,
        bounds=(0, 0.005),
        margin=self._lock_length,
        sigmoid='long_tail',
    )

    # 2 parts "open hand near the lock", 8 parts "lock pressed down"
    reward = 2 * reward_utils.hamacher_product(openness, near_lock)
    reward += 8 * lock_pressed

    return (reward, dist, obs[3], press_depth, near_lock, lock_pressed)
def compute_reward(self, action, obs):
    """Bin-picking reward: lift the object out of one bin and place it in
    the other, keeping the hand above a pair of funnel-shaped "floors".

    Returns:
        Tuple of (reward, near_object, grasp_success, target_to_obj,
        object_grasped, in_place).
    """
    hand = obs[:3]
    obj = obs[4:7]

    target_to_obj = np.linalg.norm(obj - self._target_pos)
    # Lazily capture the initial object-to-target distance on first call
    if self._target_to_obj_init is None:
        self._target_to_obj_init = target_to_obj

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=self._target_to_obj_init,
        sigmoid='long_tail',
    )

    threshold = 0.03
    radii = [
        np.linalg.norm(hand[:2] - self.obj_init_pos[:2]),
        np.linalg.norm(hand[:2] - self._target_pos[:2])
    ]
    # floor is a *pair* of 3D funnels centered on (1) the object's initial
    # position and (2) the desired final position
    floor = min([
        0.02 * np.log(radius - threshold) + 0.2
        if radius > threshold else 0.0
        for radius in radii
    ])
    # prevent the hand from running into the edge of the bins by keeping
    # it above the "floor"
    above_floor = 1.0 if hand[2] >= floor else reward_utils.tolerance(
        max(floor - hand[2], 0.0),
        bounds=(0.0, 0.01),
        margin=0.05,
        sigmoid='long_tail',
    )
    object_grasped = self._gripper_caging_reward(
        action,
        obj,
        obj_radius=0.015,
        pad_success_thresh=0.05,
        object_reach_radius=0.01,
        xz_thresh=0.01,
        desired_gripper_effort=0.7,
        high_density=True,
    )
    reward = reward_utils.hamacher_product(object_grasped, in_place)

    near_object = np.linalg.norm(obj - hand) < 0.04
    # NOTE(review): 0.43 presumably separates "pinched shut on nothing"
    # from "closed around the object" — confirm against obs[3] scaling
    pinched_without_obj = obs[3] < 0.43
    lifted = obj[2] - 0.02 > self.obj_init_pos[2]
    # Increase reward when properly grabbed obj
    grasp_success = near_object and lifted and not pinched_without_obj

    if grasp_success:
        reward += 1. + 5. * reward_utils.hamacher_product(
            above_floor, in_place)
    # Maximize reward on success
    if target_to_obj < self.TARGET_RADIUS:
        reward = 10.

    return (reward, near_object, grasp_success, target_to_obj,
            object_grasped, in_place)
def _gripper_caging_reward(self, action, obj_position, obj_radius):
    """Caging-and-gripping reward in [0, 1].

    Combines (a) how well the pads straddle the object in Y, (b) how well
    the TCP lines up with the object in the X-Z plane, and (c) a tighter
    "gripping" version of the Y term that only counts once the object is
    almost fully caged.

    Args:
        action: (4,) array; action[-1] is the gripper effort.
        obj_position: (3,) object xyz position.
        obj_radius: radius of the object's bounding sphere.

    Returns:
        Scalar in [0, 1] — mean of the caging and gripping terms.
    """
    pad_success_margin = 0.05
    # Tighter Y band used for the "gripping" (as opposed to caging) terms
    grip_success_margin = obj_radius + 0.005
    x_z_success_margin = 0.01

    tcp = self.tcp_center
    left_pad = self.get_body_com('leftpad')
    right_pad = self.get_body_com('rightpad')
    # Signed Y gap between each pad and the object (positive when the pad
    # is on its own side of the object)
    delta_object_y_left_pad = left_pad[1] - obj_position[1]
    delta_object_y_right_pad = obj_position[1] - right_pad[1]
    right_caging_margin = abs(
        abs(obj_position[1] - self.init_right_pad[1]) - pad_success_margin)
    left_caging_margin = abs(
        abs(obj_position[1] - self.init_left_pad[1]) - pad_success_margin)

    right_caging = reward_utils.tolerance(
        delta_object_y_right_pad,
        bounds=(obj_radius, pad_success_margin),
        margin=right_caging_margin,
        sigmoid='long_tail',
    )
    left_caging = reward_utils.tolerance(
        delta_object_y_left_pad,
        bounds=(obj_radius, pad_success_margin),
        margin=left_caging_margin,
        sigmoid='long_tail',
    )

    right_gripping = reward_utils.tolerance(
        delta_object_y_right_pad,
        bounds=(obj_radius, grip_success_margin),
        margin=right_caging_margin,
        sigmoid='long_tail',
    )
    left_gripping = reward_utils.tolerance(
        delta_object_y_left_pad,
        bounds=(obj_radius, grip_success_margin),
        margin=left_caging_margin,
        sigmoid='long_tail',
    )

    # Internal sanity checks on [0, 1] ranges of the tolerance terms
    assert right_caging >= 0 and right_caging <= 1
    assert left_caging >= 0 and left_caging <= 1

    y_caging = reward_utils.hamacher_product(right_caging, left_caging)
    y_gripping = reward_utils.hamacher_product(right_gripping,
                                               left_gripping)

    assert y_caging >= 0 and y_caging <= 1

    # X-Z alignment: distances with the Y components zeroed out
    tcp_xz = tcp + np.array([0., -tcp[1], 0.])
    obj_position_x_z = np.copy(obj_position) + np.array(
        [0., -obj_position[1], 0.])
    tcp_obj_norm_x_z = np.linalg.norm(tcp_xz - obj_position_x_z, ord=2)

    init_obj_x_z = self.obj_init_pos + np.array(
        [0., -self.obj_init_pos[1], 0.])
    init_tcp_x_z = self.init_tcp + np.array([0., -self.init_tcp[1], 0.])

    tcp_obj_x_z_margin = np.linalg.norm(init_obj_x_z - init_tcp_x_z,
                                        ord=2) - x_z_success_margin
    x_z_caging = reward_utils.tolerance(
        tcp_obj_norm_x_z,
        bounds=(0, x_z_success_margin),
        margin=tcp_obj_x_z_margin,
        sigmoid='long_tail',
    )

    assert right_caging >= 0 and right_caging <= 1

    gripper_closed = min(max(0, action[-1]), 1)

    assert gripper_closed >= 0 and gripper_closed <= 1

    caging = reward_utils.hamacher_product(y_caging, x_z_caging)

    assert caging >= 0 and caging <= 1

    # The gripping term only counts once the object is almost fully caged
    if caging > 0.95:
        gripping = y_gripping
    else:
        gripping = 0.

    assert gripping >= 0 and gripping <= 1

    caging_and_gripping = (caging + gripping) / 2

    assert caging_and_gripping >= 0 and caging_and_gripping <= 1

    return caging_and_gripping
def compute_reward(self, action, obs):
    """Peg-insertion reward: grasp the peg, lift it, and slide its head into
    the hole while avoiding the collision boxes around the hole's mouth.

    Returns:
        [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
        in_place, collision_boxes, ip_orig].
    """
    tcp = self.tcp_center
    obj = obs[4:7]
    obj_head = self._get_site_pos('pegHead')
    tcp_opened = obs[3]
    target = self._target_pos
    tcp_to_obj = np.linalg.norm(obj - tcp)

    scale = np.array([1., 2., 2.])
    # force agent to pick up object then insert
    obj_to_target = np.linalg.norm((obj_head - target) * scale)

    in_place_margin = np.linalg.norm(
        (self.peg_head_pos_init - target) * scale)
    in_place = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, self.TARGET_RADIUS),
        margin=in_place_margin,
        sigmoid='long_tail',
    )
    # Keep the un-gated in-place term for logging
    ip_orig = in_place

    brc_col_box_1 = self._get_site_pos(
        'bottom_right_corner_collision_box_1')
    tlc_col_box_1 = self._get_site_pos('top_left_corner_collision_box_1')

    brc_col_box_2 = self._get_site_pos(
        'bottom_right_corner_collision_box_2')
    tlc_col_box_2 = self._get_site_pos('top_left_corner_collision_box_2')
    # Each term is 1 inside its safe prism and decays toward 0 at the walls
    collision_box_bottom_1 = reward_utils.rect_prism_tolerance(
        curr=obj_head, one=tlc_col_box_1, zero=brc_col_box_1)
    collision_box_bottom_2 = reward_utils.rect_prism_tolerance(
        curr=obj_head, one=tlc_col_box_2, zero=brc_col_box_2)
    collision_boxes = reward_utils.hamacher_product(
        collision_box_bottom_2, collision_box_bottom_1)
    # Gate the in-place progress by collision avoidance
    in_place = reward_utils.hamacher_product(in_place, collision_boxes)

    pad_success_margin = 0.03
    object_reach_radius = 0.01
    x_z_margin = 0.005
    obj_radius = 0.0075

    object_grasped = self._gripper_caging_reward(
        action,
        obj,
        object_reach_radius=object_reach_radius,
        obj_radius=obj_radius,
        pad_success_thresh=pad_success_margin,
        xz_thresh=x_z_margin,
        high_density=True)
    # Saturate the grasp term once the peg is held and lifted
    if tcp_to_obj < 0.08 and (tcp_opened > 0) and (obj[2] - 0.01 >
                                                   self.obj_init_pos[2]):
        object_grasped = 1.
    in_place_and_object_grasped = reward_utils.hamacher_product(
        object_grasped, in_place)
    reward = in_place_and_object_grasped

    # Same held-and-lifted condition: add the insertion-shaping bonus
    if tcp_to_obj < 0.08 and (tcp_opened > 0) and (obj[2] - 0.01 >
                                                   self.obj_init_pos[2]):
        reward += 1. + 5 * in_place

    if obj_to_target <= 0.07:
        reward = 10.

    return [
        reward,
        tcp_to_obj,
        tcp_opened,
        obj_to_target,
        object_grasped,
        in_place,
        collision_boxes,
        ip_orig
    ]
def _gripper_caging_reward(self,
                           action,
                           obj_pos,
                           obj_radius,
                           pad_success_thresh,
                           object_reach_radius,
                           xz_thresh,
                           desired_gripper_effort=1.0,
                           high_density=False,
                           medium_density=False):
    """Reward for agent grasping obj.

    Args:
        action(np.ndarray): (4,) array representing the action
            delta(x), delta(y), delta(z), gripper_effort
        obj_pos(np.ndarray): (3,) array representing the obj x,y,z
        obj_radius(float):radius of object's bounding sphere
        pad_success_thresh(float): successful distance of gripper_pad
            to object
        object_reach_radius(float): successful distance of gripper center
            to the object.
        xz_thresh(float): successful distance of gripper in x_z axis to the
            object. Y axis not included since the caging function handles
            successful grasping in the Y axis.
        desired_gripper_effort(float): positive effort at which the gripper
            term saturates to 1 (commanded effort is clipped and normalized
            by this value).
        high_density(bool): average the caging-and-gripping term with the
            raw caging term (denser shaping). Mutually exclusive with
            `medium_density`.
        medium_density(bool): average the caging-and-gripping term with a
            TCP-reach term instead. Mutually exclusive with `high_density`.

    Returns:
        Scalar caging/gripping reward in [0, 1].
    """
    if high_density and medium_density:
        raise ValueError("Can only be either high_density or medium_density")

    # MARK: Left-right gripper information for caging reward----------------
    left_pad = self.get_body_com('leftpad')
    right_pad = self.get_body_com('rightpad')

    # get current positions of left and right pads (Y axis)
    pad_y_lr = np.hstack((left_pad[1], right_pad[1]))
    # compare *current* pad positions with *current* obj position (Y axis)
    pad_to_obj_lr = np.abs(pad_y_lr - obj_pos[1])
    # compare *current* pad positions with *initial* obj position (Y axis)
    pad_to_objinit_lr = np.abs(pad_y_lr - self.stick_init_pos[1])

    caging_lr_margin = np.abs(pad_to_objinit_lr - pad_success_thresh)
    caging_lr = [reward_utils.tolerance(
        pad_to_obj_lr[i],  # "x" in the description above
        bounds=(obj_radius, pad_success_thresh),
        margin=caging_lr_margin[i],  # "margin" in the description above
        sigmoid='long_tail',
    ) for i in range(2)]
    caging_y = reward_utils.hamacher_product(*caging_lr)

    # MARK: X-Z gripper information for caging reward-----------------------
    tcp = self.tcp_center
    xz = [0, 2]

    caging_xz_margin = np.linalg.norm(self.stick_init_pos[xz]
                                      - self.init_tcp[xz])
    caging_xz_margin -= xz_thresh
    caging_xz = reward_utils.tolerance(
        np.linalg.norm(tcp[xz] - obj_pos[xz]),  # "x" in the description above
        bounds=(0, xz_thresh),
        margin=caging_xz_margin,  # "margin" in the description above
        sigmoid='long_tail',
    )

    # MARK: Closed-extent gripper information for caging reward-------------
    gripper_closed = min(max(0, action[-1]), desired_gripper_effort) \
        / desired_gripper_effort

    # MARK: Combine components----------------------------------------------
    caging = reward_utils.hamacher_product(caging_y, caging_xz)
    # Gripper effort only counts once the stick is almost fully caged
    gripping = gripper_closed if caging > 0.97 else 0.
    caging_and_gripping = reward_utils.hamacher_product(caging, gripping)

    if high_density:
        caging_and_gripping = (caging_and_gripping + caging) / 2
    if medium_density:
        tcp = self.tcp_center
        tcp_to_obj = np.linalg.norm(obj_pos - tcp)
        tcp_to_obj_init = np.linalg.norm(self.stick_init_pos
                                         - self.init_tcp)
        reach_margin = abs(tcp_to_obj_init - object_reach_radius)
        reach = reward_utils.tolerance(
            tcp_to_obj,
            bounds=(0, object_reach_radius),
            margin=reach_margin,
            sigmoid='long_tail',
        )
        caging_and_gripping = (caging_and_gripping + reach) / 2

    return caging_and_gripping
def compute_reward(self, action, obs):
    """Composite reward: grasp the stick, insert it, pull handle to target.

    Returns a list of (reward, tcp_to_stick, tcp_opened, handle_to_target,
    object_grasped, stick_in_place).
    """
    success_radius = 0.05
    gripper = self.tcp_center
    stick = obs[4:7]
    stick_tip = self._get_site_pos('stick_end')
    handle = obs[11:14]
    gripper_opening = obs[3]
    goal = self._target_pos

    # The container origin sits 5 cm along +x from the handle position.
    offset = np.array([0.05, 0., 0.])
    container = handle + offset
    container_start = self.obj_init_pos + offset

    def placement_score(dist, margin):
        # Shared long-tail tolerance used by all three placement terms.
        return reward_utils.tolerance(
            dist,
            bounds=(0, success_radius),
            margin=margin,
            sigmoid='long_tail',
        )

    gripper_to_stick = np.linalg.norm(stick - gripper)
    handle_to_goal = np.linalg.norm(handle - goal)

    # Weight the z axis twice as heavily when aligning stick to container.
    axis_weights = np.array([1., 1., 2.])
    stick_in_place = placement_score(
        np.linalg.norm((stick - container) * axis_weights),
        np.linalg.norm((self.stick_init_pos - container_start) * axis_weights),
    )
    stick_in_place_2 = placement_score(
        np.linalg.norm(stick - goal),
        np.linalg.norm(self.stick_init_pos - goal),
    )
    container_in_place = placement_score(
        np.linalg.norm(container - goal),
        np.linalg.norm(self.obj_init_pos - goal),
    )

    object_grasped = self._gripper_caging_reward(action=action,
                                                 obj_pos=stick,
                                                 obj_radius=0.014,
                                                 pad_success_thresh=0.05,
                                                 object_reach_radius=0.01,
                                                 xz_thresh=0.01,
                                                 high_density=True)
    grasp_success = (gripper_to_stick < 0.02
                     and gripper_opening > 0
                     and stick[2] - 0.01 > self.stick_init_pos[2])
    if grasp_success:
        # A confirmed grasp saturates the caging component.
        object_grasped = 1

    in_place_and_object_grasped = reward_utils.hamacher_product(
        object_grasped, stick_in_place)
    reward = in_place_and_object_grasped

    if grasp_success:
        reward = 1. + in_place_and_object_grasped + 5. * stick_in_place
        if self._stick_is_inserted(handle, stick_tip):
            reward = (1. + in_place_and_object_grasped + 5.
                      + 2. * stick_in_place_2 + 1. * container_in_place)
            # Maximize reward once the handle reaches the target.
            if handle_to_goal <= 0.12:
                reward = 10.

    return [
        reward, gripper_to_stick, gripper_opening, handle_to_goal,
        object_grasped, stick_in_place
    ]
def _gripper_caging_reward(self,
                           action,
                           obj_pos,
                           obj_radius,
                           pad_success_thresh,
                           object_reach_radius,
                           xz_thresh,
                           desired_gripper_effort=1.0,
                           high_density=False,
                           medium_density=False):
    """Reward for the agent caging (and gripping) the object.

    Args:
        action(np.ndarray): (4,) array [delta(x), delta(y), delta(z),
            gripper_effort].
        obj_pos(np.ndarray): (3,) array with the object's x, y, z.
        obj_radius(float): radius of the object's bounding sphere.
        pad_success_thresh(float): gripper-pad-to-object distance that
            counts as success.
        object_reach_radius(float): gripper-center-to-object distance that
            counts as success (only used with ``medium_density``).
        xz_thresh(float): gripper-to-object distance in the x-z plane that
            counts as success. The Y axis is excluded because the caging
            term below already handles grasping along Y.
        desired_gripper_effort(float): effort at which the closed-gripper
            term saturates.
        high_density(bool): blend the combined term with the caging term.
        medium_density(bool): blend the combined term with a reach term.
    """
    if high_density and medium_density:
        raise ValueError(
            "Can only be either high_density or medium_density")

    # MARK: Left-right gripper information for caging reward----------------
    pads_y = np.hstack((self.get_body_com('leftpad')[1],
                        self.get_body_com('rightpad')[1]))
    # Pad gaps to the object now vs. at the object's initial position (Y).
    pad_to_obj_lr = np.abs(pads_y - obj_pos[1])
    pad_to_objinit_lr = np.abs(pads_y - self.obj_init_pos[1])

    # The left/right caging reward is nearly binary before contact: far
    # from the object the gap sits ~pad_success_thresh outside the margin
    # (tiny reward); once within pad_success_thresh it sits well inside
    # the margin, giving near-max reward as long as the pad stays outside
    # obj_radius. After the object is grasped and carried away, the gap
    # stays roughly constant while the margin grows — this penalizes
    # moving *back* toward obj_init_pos but offers no incentive to leave
    # it; that incentive is each environment's own responsibility.
    caging_lr_margin = np.abs(pad_to_objinit_lr - pad_success_thresh)
    caging_lr = [
        reward_utils.tolerance(
            pad_to_obj_lr[side],
            bounds=(obj_radius, pad_success_thresh),
            margin=caging_lr_margin[side],
            sigmoid='long_tail',
        )
        for side in range(2)
    ]
    caging_y = reward_utils.hamacher_product(*caging_lr)

    # MARK: X-Z gripper information for caging reward-----------------------
    # Simpler than caging_y: the margin is constant and the distance
    # shrinks as the gripper approaches, staying near zero after pickup.
    xz = [0, 2]
    caging_xz_margin = (np.linalg.norm(self.obj_init_pos[xz]
                                       - self.init_tcp[xz]) - xz_thresh)
    caging_xz = reward_utils.tolerance(
        np.linalg.norm(self.tcp_center[xz] - obj_pos[xz]),
        bounds=(0, xz_thresh),
        margin=caging_xz_margin,
        sigmoid='long_tail',
    )

    # MARK: Closed-extent gripper information for caging reward-------------
    # Clamp effort to [0, desired_gripper_effort] and normalize.
    gripper_closed = (max(0, min(action[-1], desired_gripper_effort))
                      / desired_gripper_effort)

    # MARK: Combine components----------------------------------------------
    caging = reward_utils.hamacher_product(caging_y, caging_xz)
    # Only credit closing effort once the object is (nearly) caged.
    gripping = gripper_closed if caging > 0.97 else 0.
    caging_and_gripping = reward_utils.hamacher_product(caging, gripping)

    # The two density flags are mutually exclusive (checked above), so an
    # elif is equivalent to the original pair of independent ifs.
    if high_density:
        caging_and_gripping = (caging_and_gripping + caging) / 2
    elif medium_density:
        tcp_to_obj = np.linalg.norm(obj_pos - self.tcp_center)
        tcp_to_obj_init = np.linalg.norm(self.obj_init_pos - self.init_tcp)
        # Subtracting object_reach_radius from the margin keeps the reach
        # term's starting value at roughly 0.1.
        reach = reward_utils.tolerance(
            tcp_to_obj,
            bounds=(0, object_reach_radius),
            margin=abs(tcp_to_obj_init - object_reach_radius),
            sigmoid='long_tail',
        )
        caging_and_gripping = (caging_and_gripping + reach) / 2

    return caging_and_gripping