def compute_reward(self, action, obs):
    """Shaped reward for pressing a button downward along the Z axis.

    Combines a "closed gripper near the button" term with a "button
    displaced to its target height" term.

    Returns:
        (reward, tcp_to_obj, tcp_open, obj_to_target, near_button,
        button_pressed) — the trailing values are for logging/info.
    """
    del action  # reward is computed from state only
    obj = obs[4:7]  # button position (assumed obs layout — confirm)
    tcp = self.tcp_center
    tcp_to_obj = np.linalg.norm(obj - tcp)
    tcp_to_obj_init = np.linalg.norm(obj - self.init_tcp)
    # Only the vertical (Z) displacement counts as "pressed"
    obj_to_target = abs(self._target_pos[2] - obj[2])
    tcp_closed = 1 - obs[3]  # obs[3] presumably gripper openness — confirm
    near_button = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, 0.01),
        margin=tcp_to_obj_init,
        sigmoid='long_tail',
    )
    button_pressed = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, 0.005),
        margin=self._obj_to_target_init,
        sigmoid='long_tail',
    )
    reward = 5 * reward_utils.hamacher_product(tcp_closed, near_button)
    # Only award the pressing term once the gripper is near the button
    if tcp_to_obj <= 0.03:
        reward += 5 * button_pressed
    return (reward, tcp_to_obj, obs[3], obj_to_target, near_button,
            button_pressed)
def compute_reward(self, action, obs):
    """Shaped reward for caging a handle and pulling it open to the target.

    Returns:
        (reward, gripper_to_handle_dist, tcp_open, handle_error,
        reward_for_caging, reward_for_opening)
    """
    gripper = obs[:3]
    handle = obs[4:7]
    handle_error = np.linalg.norm(handle - self._target_pos)
    reward_for_opening = reward_utils.tolerance(
        handle_error,
        bounds=(0, 0.02),
        margin=self.maxDist,
        sigmoid='long_tail'
    )
    # Handle's initial position, reconstructed from the target and the
    # maximum travel distance along Y
    handle_pos_init = self._target_pos + np.array([.0, self.maxDist, .0])
    # Emphasize XY error so that gripper is able to drop down and cage
    # handle without running into it. By doing this, we are assuming
    # that the reward in the Z direction is small enough that the agent
    # will be willing to explore raising a finger above the handle, hook it,
    # and drop back down to re-gain Z reward
    scale = np.array([3., 3., 1.])
    gripper_error = (handle - gripper) * scale
    gripper_error_init = (handle_pos_init - self.init_tcp) * scale
    reward_for_caging = reward_utils.tolerance(
        np.linalg.norm(gripper_error),
        bounds=(0, 0.01),
        margin=np.linalg.norm(gripper_error_init),
        sigmoid='long_tail')
    reward = reward_for_caging + reward_for_opening
    reward *= 5.0
    return (reward, np.linalg.norm(handle - gripper), obs[3], handle_error,
            reward_for_caging, reward_for_opening)
def compute_reward(self, actions, obs):
    """Shaped reward for bringing the object and the hand to the target.

    Combines an object-to-target term (weight 6) with a hand-to-target
    term (weight 3); both use Gaussian tolerance curves.

    Returns:
        [reward, obj_to_target, hand_in_place] — note: a 3-element list,
        unlike the 6-tuples returned by sibling reward functions.
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    obj = obs[4:7]
    target = self._target_pos

    tcp_to_target = np.linalg.norm(tcp - target)
    obj_to_target = np.linalg.norm(obj - target)

    in_place_margin = np.linalg.norm(self.obj_init_pos - target)
    in_place = reward_utils.tolerance(obj_to_target,
                                      bounds=(0, _TARGET_RADIUS),
                                      margin=in_place_margin,
                                      sigmoid='gaussian',)

    # +0.1 keeps the margin strictly positive even if the hand starts
    # right at the object
    hand_margin = np.linalg.norm(self.hand_init_pos - obj) + 0.1
    hand_in_place = reward_utils.tolerance(tcp_to_target,
                                           bounds=(0, 0.25 * _TARGET_RADIUS),
                                           margin=hand_margin,
                                           sigmoid='gaussian',)

    reward = 3 * hand_in_place + 6 * in_place

    # Saturate the reward on success
    if obj_to_target < _TARGET_RADIUS:
        reward = 10

    return [reward, obj_to_target, hand_in_place]
def compute_reward(self, action, obs):
    """Shaped reward for lifting a lever into place.

    Returns:
        (reward, shoulder_to_lever_dist, ready_to_lift, lever_error,
        lever_engagement)
    """
    gripper = obs[:3]
    lever = obs[4:7]

    # De-emphasize y error so that we get Sawyer's shoulder underneath the
    # lever prior to bumping on against
    scale = np.array([4., 1., 4.])
    # Offset so that we get the Sawyer's shoulder underneath the lever,
    # rather than its fingers
    offset = np.array([.0, .055, .07])

    shoulder_to_lever = (gripper + offset - lever) * scale
    shoulder_to_lever_init = (
        self.init_tcp + offset - self._lever_pos_init
    ) * scale

    # This `ready_to_lift` reward should be a *hint* for the agent, not an
    # end in itself. Make sure to devalue it compared to the value of
    # actually lifting the lever
    ready_to_lift = reward_utils.tolerance(
        np.linalg.norm(shoulder_to_lever),
        bounds=(0, 0.02),
        margin=np.linalg.norm(shoulder_to_lever_init),
        sigmoid='long_tail',
    )

    # The skill of the agent should be measured by its ability to get the
    # lever to point straight upward. This means we'll be measuring the
    # current angle of the lever's joint, and comparing with 90deg.
    lever_angle = -self.data.get_joint_qpos('LeverAxis')
    lever_angle_desired = np.pi / 2.0

    lever_error = abs(lever_angle - lever_angle_desired)

    # We'll set the margin to 15deg from horizontal. Angles below that will
    # receive some reward to incentivize exploration, but we don't want to
    # reward accidents too much. Past 15deg is probably intentional movement
    lever_engagement = reward_utils.tolerance(
        lever_error,
        bounds=(0, np.pi / 48.0),
        margin=(np.pi / 2.0) - (np.pi / 12.0),
        sigmoid='long_tail'
    )

    target = self._target_pos
    obj_to_target = np.linalg.norm(lever - target)
    in_place_margin = (np.linalg.norm(self._lever_pos_init - target))

    in_place = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, 0.04),
        margin=in_place_margin,
        sigmoid='long_tail',
    )

    # NOTE(review): earlier formulation kept for reference
    # reward = 2.0 * ready_to_lift + 8.0 * lever_engagement
    reward = 10.0 * reward_utils.hamacher_product(ready_to_lift, in_place)
    return (reward, np.linalg.norm(shoulder_to_lever), ready_to_lift,
            lever_error, lever_engagement)
def compute_reward(self, action, obs):
    """Shaped reward for grasping an object and moving it to the target,
    while keeping the hand above a logarithmic "funnel" floor centered on
    the object's initial position.

    Returns:
        (reward, tcp_to_obj, grasp_success, obj_to_target, object_grasped,
        in_place)
    """
    obj = obs[4:7]
    gripper = self.tcp_center

    obj_to_target = np.linalg.norm(obj - self._target_pos)
    tcp_to_obj = np.linalg.norm(obj - gripper)
    in_place_margin = np.linalg.norm(self.obj_init_pos - self._target_pos)

    threshold = 0.03
    # floor is a 3D funnel centered on the initial object pos
    radius = np.linalg.norm(gripper[:2] - self.obj_init_pos[:2])
    if radius <= threshold:
        floor = 0.0
    else:
        floor = 0.015 * np.log(radius - threshold) + 0.15
    # prevent the hand from running into cliff edge by staying above floor
    above_floor = 1.0 if gripper[2] >= floor else reward_utils.tolerance(
        max(floor - gripper[2], 0.0),
        bounds=(0.0, 0.01),
        margin=0.02,
        sigmoid='long_tail',
    )
    object_grasped = self._gripper_caging_reward(
        action,
        obj,
        object_reach_radius=0.01,
        obj_radius=0.015,
        pad_success_thresh=0.02,
        xz_thresh=0.03,
        desired_gripper_effort=0.1,
        high_density=True)
    in_place = reward_utils.tolerance(obj_to_target,
                                      bounds=(0, 0.02),
                                      margin=in_place_margin,
                                      sigmoid='long_tail')
    reward = reward_utils.hamacher_product(object_grasped, in_place)

    near_object = tcp_to_obj < 0.04
    # obs[3] presumably gripper openness; < 0.33 means fingers closed on
    # nothing — confirm against obs layout
    pinched_without_obj = obs[3] < 0.33
    lifted = obj[2] - 0.02 > self.obj_init_pos[2]
    # Increase reward when properly grabbed obj
    grasp_success = near_object and lifted and not pinched_without_obj
    if grasp_success:
        reward += 1. + 5. * reward_utils.hamacher_product(
            in_place, above_floor)
    # Maximize reward on success
    if obj_to_target < self.TARGET_RADIUS:
        reward = 10.
    return (
        reward,
        tcp_to_obj,
        grasp_success,
        obj_to_target,
        object_grasped,
        in_place,
    )
def compute_reward(self, action, obs):
    """Two-stage shaped reward: carry the object to a fixed midpoint, then
    on to the target.

    Returns:
        [reward, tcp_to_obj, tcp_opened, obj_to_target_dist,
        object_grasped, in_place_part2]
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    obj = obs[4:7]
    tcp_opened = obs[3]
    # Waypoint at the object's current height; X/Y are fixed constants
    midpoint = np.array([-0.05, 0.77, obj[2]])
    target = self._target_pos

    tcp_to_obj = np.linalg.norm(obj - tcp)

    # Emphasize X error when scoring distance to the midpoint
    in_place_scaling = np.array([3., 1., 1.])
    obj_to_midpoint = np.linalg.norm((obj - midpoint) * in_place_scaling)
    obj_to_midpoint_init = np.linalg.norm(
        (self.obj_init_pos - midpoint) * in_place_scaling)

    obj_to_target = np.linalg.norm(obj - target)
    obj_to_target_init = np.linalg.norm(self.obj_init_pos - target)

    in_place_part1 = reward_utils.tolerance(obj_to_midpoint,
                                            bounds=(0, _TARGET_RADIUS),
                                            margin=obj_to_midpoint_init,
                                            sigmoid='long_tail',
                                            )
    in_place_part2 = reward_utils.tolerance(obj_to_target,
                                            bounds=(0, _TARGET_RADIUS),
                                            margin=obj_to_target_init,
                                            sigmoid='long_tail'
                                            )
    object_grasped = self._gripper_caging_reward(
        action,
        obj,
        object_reach_radius=0.01,
        obj_radius=0.015,
        pad_success_thresh=0.05,
        xz_thresh=0.005,
        high_density=True
    )
    reward = 2 * object_grasped

    # Stage gating: grasped -> heading to midpoint -> past y=0.75 -> target
    if tcp_to_obj < 0.02 and tcp_opened > 0:
        reward = 2 * object_grasped + 1. + 4. * in_place_part1
        if obj[1] > 0.75:
            reward = 2 * object_grasped + 1. + 4. + 3. * in_place_part2
    if obj_to_target < _TARGET_RADIUS:
        reward = 10.
    return [
        reward,
        tcp_to_obj,
        tcp_opened,
        np.linalg.norm(obj - target),
        object_grasped,
        in_place_part2
    ]
def _gripper_caging_reward(self, action, obj_position):
    """Reward the gripper for caging the object between its pads.

    Combines (a) a Y-axis term scoring each pad's distance to the object,
    (b) an XZ-plane term scoring the TCP's alignment with the object, and
    (c) the commanded gripper-closing effort (action[-1]).

    Returns:
        A scalar in [0, 1]: the average of the caging score and the
        caging-and-gripping score.
    """
    pad_success_margin = 0.05
    x_z_success_margin = 0.005
    obj_radius = 0.015
    tcp = self.tcp_center
    left_pad = self.get_body_com('leftpad')
    right_pad = self.get_body_com('rightpad')
    # Signed Y distances: positive when the pad is on its own side of the
    # object
    delta_object_y_left_pad = left_pad[1] - obj_position[1]
    delta_object_y_right_pad = obj_position[1] - right_pad[1]
    right_caging_margin = abs(
        abs(obj_position[1] - self.init_right_pad[1]) - pad_success_margin)
    left_caging_margin = abs(
        abs(obj_position[1] - self.init_left_pad[1]) - pad_success_margin)

    right_caging = reward_utils.tolerance(
        delta_object_y_right_pad,
        bounds=(obj_radius, pad_success_margin),
        margin=right_caging_margin,
        sigmoid='long_tail',)
    left_caging = reward_utils.tolerance(
        delta_object_y_left_pad,
        bounds=(obj_radius, pad_success_margin),
        margin=left_caging_margin,
        sigmoid='long_tail',)

    y_caging = reward_utils.hamacher_product(left_caging, right_caging)

    # compute the tcp_obj distance in the x_z plane
    tcp_xz = tcp + np.array([0., -tcp[1], 0.])
    obj_position_x_z = np.copy(obj_position) + np.array(
        [0., -obj_position[1], 0.])
    tcp_obj_norm_x_z = np.linalg.norm(tcp_xz - obj_position_x_z, ord=2)

    # used for computing the tcp to object object margin in the x_z plane
    init_obj_x_z = self.obj_init_pos + np.array(
        [0., -self.obj_init_pos[1], 0.])
    init_tcp_x_z = self.init_tcp + np.array([0., -self.init_tcp[1], 0.])
    tcp_obj_x_z_margin = np.linalg.norm(
        init_obj_x_z - init_tcp_x_z, ord=2) - x_z_success_margin

    x_z_caging = reward_utils.tolerance(
        tcp_obj_norm_x_z,
        bounds=(0, x_z_success_margin),
        margin=tcp_obj_x_z_margin,
        sigmoid='long_tail',)

    # Clamp commanded gripper effort to [0, 1]
    gripper_closed = min(max(0, action[-1]), 1)
    caging = reward_utils.hamacher_product(y_caging, x_z_caging)

    # Only credit the gripping effort once the cage is nearly perfect
    gripping = gripper_closed if caging > 0.97 else 0.
    caging_and_gripping = reward_utils.hamacher_product(caging, gripping)
    caging_and_gripping = (caging_and_gripping + caging) / 2
    return caging_and_gripping
def compute_reward(self, action, obs):
    """Shaped reward for grasping the object and moving it to the target.

    Returns:
        (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
        in_place)
    """
    obj = obs[4:7]
    tcp_opened = obs[3]
    tcp_to_obj = np.linalg.norm(obj - self.tcp_center)
    target_to_obj = np.linalg.norm(obj - self._target_pos)
    target_to_obj_init = np.linalg.norm(self.obj_init_pos - self._target_pos)

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=target_to_obj_init,
        sigmoid='long_tail',
    )
    object_grasped = self._gripper_caging_reward(action, obj,
                                                 object_reach_radius=0.01,
                                                 obj_radius=0.015,
                                                 pad_success_thresh=0.05,
                                                 xz_thresh=0.005,
                                                 high_density=True)
    reward = 2 * object_grasped

    if tcp_to_obj < 0.02 and tcp_opened > 0:
        # NOTE(review): `+= 1. + reward + ...` doubles the grasp term
        # (net: 1 + 4*object_grasped + 5*in_place) — looks deliberate
        # but worth confirming against sibling envs
        reward += 1. + reward + 5. * in_place
    if target_to_obj < self.TARGET_RADIUS:
        reward = 10.
    return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
            in_place)
def compute_reward(self, action, obs):
    """Shaped reward for pushing the object to the target, with a penalty
    band discouraging the object from drifting sideways past the goal line.

    Returns:
        (reward, tcp_to_obj, tcp_opened, obj_to_target_dist,
        object_grasped, in_place)
    """
    obj = obs[4:7]
    tcp_opened = obs[3]
    # Emphasize X error when scoring distance to target
    x_scaling = np.array([3., 1., 1.])
    tcp_to_obj = np.linalg.norm(obj - self.tcp_center)
    target_to_obj = np.linalg.norm((obj - self._target_pos) * x_scaling)
    # NOTE(review): margin is the (scaled) distance the object has moved
    # from its initial position, not init-to-target — confirm intended
    target_to_obj_init = np.linalg.norm(
        (obj - self.obj_init_pos) * x_scaling)

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=target_to_obj_init,
        sigmoid='long_tail',
    )

    goal_line = (self._target_pos[1] - 0.1)
    # Past the goal line but laterally off-target: scale the placement
    # reward down the further the object overshoots
    if obj[1] > goal_line and abs(obj[0] - self._target_pos[0]) > 0.10:
        in_place = np.clip(
            in_place - 2 * ((obj[1] - goal_line) / (1 - goal_line)), 0., 1.)

    object_grasped = self._gripper_caging_reward(action, obj,
                                                 self.OBJ_RADIUS)

    reward = (3 * object_grasped) + (6.5 * in_place)

    if target_to_obj < self.TARGET_RADIUS:
        reward = 10.
    return (reward, tcp_to_obj, tcp_opened,
            np.linalg.norm(obj - self._target_pos), object_grasped,
            in_place)
def compute_reward(self, action, obs):
    """Shaped reward for grasping, lifting, and placing the object at the
    target.

    Returns:
        [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
        in_place]
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    obj = obs[4:7]
    tcp_opened = obs[3]
    target = self._target_pos

    obj_to_target = np.linalg.norm(obj - target)
    tcp_to_obj = np.linalg.norm(obj - tcp)
    in_place_margin = (np.linalg.norm(self.obj_init_pos - target))

    in_place = reward_utils.tolerance(obj_to_target,
                                      bounds=(0, _TARGET_RADIUS),
                                      margin=in_place_margin,
                                      sigmoid='long_tail',)

    object_grasped = self._gripper_caging_reward(action, obj)
    in_place_and_object_grasped = reward_utils.hamacher_product(
        object_grasped, in_place)
    reward = in_place_and_object_grasped

    # Bonus once the object is grasped and lifted off its start height
    if tcp_to_obj < 0.02 and (tcp_opened > 0) and (
            obj[2] - 0.01 > self.obj_init_pos[2]):
        reward += 1. + 5. * in_place
    if obj_to_target < _TARGET_RADIUS:
        reward = 10.
    return [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
            in_place]
def _reward_pos(wrench_center, target_pos):
    """Positional reward for placing the wrench onto a peg.

    Returns:
        (in_place, success) — the shaped placement reward and a boolean
        flag set when the wrench is aligned over and hooked on the peg.
    """
    pos_error = target_pos - wrench_center

    radius = np.linalg.norm(pos_error[:2])
    aligned = radius < 0.02
    hooked = pos_error[2] > 0.0
    success = aligned and hooked

    # Target height is a 3D funnel centered on the peg.
    # use the success flag to widen the bottleneck once the agent
    # learns to place the wrench on the peg -- no reason to encourage
    # tons of alignment accuracy if task is already solved
    threshold = 0.02 if success else 0.01
    target_height = 0.0
    if radius > threshold:
        target_height = 0.02 * np.log(radius - threshold) + 0.2

    # Replace the Z error with distance to the funnel surface
    pos_error[2] = target_height - wrench_center[2]

    # Emphasize Z error
    scale = np.array([1., 1., 3.])
    a = 0.1  # Relative importance of just *trying* to lift the wrench
    b = 0.9  # Relative importance of placing the wrench on the peg
    lifted = wrench_center[2] > 0.02 or radius < threshold
    in_place = a * float(lifted) + b * reward_utils.tolerance(
        np.linalg.norm(pos_error * scale),
        bounds=(0, 0.02),
        margin=0.4,
        sigmoid='long_tail',
    )

    return in_place, success
def compute_reward(self, action, obs):
    """Shaped reward for grasping the object and moving it toward the
    target.

    Returns:
        (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
        in_place)
    """
    obj = obs[4:7]
    tcp_opened = obs[3]
    tcp_to_obj = np.linalg.norm(obj - self.tcp_center)
    target_to_obj = np.linalg.norm(obj - self._target_pos)
    target_to_obj_init = np.linalg.norm(self.obj_init_pos - self._target_pos)

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=target_to_obj_init,
        sigmoid='long_tail',
    )

    object_grasped = self._gripper_caging_reward(action, obj,
                                                 self.OBJ_RADIUS)
    reward = reward_utils.hamacher_product(object_grasped, in_place)

    # Bonus only when near the object, gripper partially closed, and the
    # object has measurably moved toward the target
    if (tcp_to_obj < 0.01) and (0 < tcp_opened < 0.55) and \
            (target_to_obj_init - target_to_obj > 0.01):
        reward += 1. + 5. * in_place
    if target_to_obj < self.TARGET_RADIUS:
        reward = 10.
    return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
            in_place)
def compute_reward(self, action, obs):
    """Shaped reward for pushing the object to the target in the XY plane.

    The target's Z is replaced with the object's current Z, so only
    horizontal error is penalized.

    Returns:
        [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
        in_place]
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    obj = obs[4:7]
    tcp_opened = obs[3]
    # Planar target: X/Y from the goal, Z from the object itself
    target = np.array([self._target_pos[0], self._target_pos[1], obj[2]])

    obj_to_target = np.linalg.norm(obj - target)
    tcp_to_obj = np.linalg.norm(obj - tcp)
    in_place_margin = np.linalg.norm(self.obj_init_pos - target)

    in_place = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, _TARGET_RADIUS),
        margin=in_place_margin,
        sigmoid='long_tail',
    )

    object_grasped = self._gripper_caging_reward(action, obj,
                                                 self.OBJ_RADIUS)
    in_place_and_object_grasped = reward_utils.hamacher_product(
        object_grasped, in_place)
    reward = (2 * object_grasped) + (6 * in_place_and_object_grasped)

    if obj_to_target < _TARGET_RADIUS:
        reward = 10.
    return [
        reward,
        tcp_to_obj,
        tcp_opened,
        obj_to_target,
        object_grasped,
        in_place
    ]
def compute_reward(self, action, obs):
    """Shaped reward for grasping a stick and using it to bring a
    container to the target.

    Returns:
        [reward, tcp_to_stick, tcp_opened, container_to_target,
        object_grasped, stick_in_place]
    """
    _TARGET_RADIUS = 0.12
    tcp = self.tcp_center
    # Offset to the stick's grasp point along X
    stick = obs[4:7] + np.array([.015, .0, .0])
    container = obs[11:14]
    tcp_opened = obs[3]
    target = self._target_pos
    tcp_to_stick = np.linalg.norm(stick - tcp)
    stick_to_target = np.linalg.norm(stick - target)
    stick_in_place_margin = (
        np.linalg.norm(self.stick_init_pos - target)) - _TARGET_RADIUS
    stick_in_place = reward_utils.tolerance(
        stick_to_target,
        bounds=(0, _TARGET_RADIUS),
        margin=stick_in_place_margin,
        sigmoid='long_tail',)

    container_to_target = np.linalg.norm(container - target)
    container_in_place_margin = np.linalg.norm(
        self.obj_init_pos - target) - _TARGET_RADIUS
    container_in_place = reward_utils.tolerance(
        container_to_target,
        bounds=(0, _TARGET_RADIUS),
        margin=container_in_place_margin,
        sigmoid='long_tail',)

    object_grasped = self._gripper_caging_reward(
        action=action,
        obj_pos=stick,
        obj_radius=0.04,
        pad_success_thresh=0.05,
        object_reach_radius=0.01,
        xz_thresh=0.01,
        high_density=True
    )

    reward = object_grasped

    # Once the stick is grasped and lifted, switch to placement shaping
    if tcp_to_stick < 0.02 and (tcp_opened > 0) and \
            (stick[2] - 0.01 > self.stick_init_pos[2]):
        object_grasped = 1  # report a full grasp in the info tuple
        reward = 2. + 5. * stick_in_place + 3. * container_in_place

        if container_to_target <= _TARGET_RADIUS:
            reward = 10.
    return [reward, tcp_to_stick, tcp_opened, container_to_target,
            object_grasped, stick_in_place]
def compute_reward(self, action, obs):
    """Shaped reward for pushing a lock sideways (along X) to its target.

    Returns:
        (reward, shoulder_to_lock_dist, tcp_open, obj_to_target,
        ready_to_push, pushed)
    """
    del action  # reward is computed from state only
    gripper = obs[:3]
    lock = obs[4:7]

    # Add offset to track gripper's shoulder, rather than fingers
    offset = np.array([.0, .055, .07])

    # De-emphasize X error; emphasize Y so the shoulder lines up first
    scale = np.array([0.25, 1., 0.5])
    shoulder_to_lock = (gripper + offset - lock) * scale
    shoulder_to_lock_init = (
        self.init_tcp + offset - self.obj_init_pos
    ) * scale

    # This `ready_to_push` reward should be a *hint* for the agent, not an
    # end in itself. Make sure to devalue it compared to the value of
    # actually unlocking the lock
    ready_to_push = reward_utils.tolerance(
        np.linalg.norm(shoulder_to_lock),
        bounds=(0, 0.02),
        margin=np.linalg.norm(shoulder_to_lock_init),
        sigmoid='long_tail',
    )

    # Only horizontal (X) displacement counts as "pushed"
    obj_to_target = abs(self._target_pos[0] - lock[0])
    pushed = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, 0.005),
        margin=self._lock_length,
        sigmoid='long_tail',
    )

    reward = 2 * ready_to_push + 8 * pushed
    return (
        reward,
        np.linalg.norm(shoulder_to_lock),
        obs[3],
        obj_to_target,
        ready_to_push,
        pushed
    )
def compute_reward(self, action, obs):
    """Shaped reward for reaching a handle with a closing gripper and
    moving it to the target.

    Returns:
        (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
        in_place) — tcp_opened is reported as a constant 0 here.
    """
    obj = obs[4:7]
    tcp = self.tcp_center
    target = self._target_pos.copy()

    target_to_obj = (obj - target)
    target_to_obj = np.linalg.norm(target_to_obj)
    target_to_obj_init = (self.obj_init_pos - target)
    target_to_obj_init = np.linalg.norm(target_to_obj_init)

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=abs(target_to_obj_init - self.TARGET_RADIUS),
        sigmoid='long_tail',
    )

    handle_reach_radius = 0.005
    tcp_to_obj = np.linalg.norm(obj - tcp)
    tcp_to_obj_init = np.linalg.norm(self.obj_init_pos - self.init_tcp)
    reach = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, handle_reach_radius),
        margin=abs(tcp_to_obj_init - handle_reach_radius),
        sigmoid='gaussian',
    )
    # Clamp commanded gripper effort to [0, 1] and require it alongside
    # the reach
    gripper_closed = min(max(0, action[-1]), 1)
    reach = reward_utils.hamacher_product(reach, gripper_closed)
    tcp_opened = 0
    object_grasped = reach

    reward = reward_utils.hamacher_product(reach, in_place)
    # Slightly widened success band (+0.015) before scaling to [0, 10]
    if target_to_obj <= self.TARGET_RADIUS + 0.015:
        reward = 1.

    reward *= 10

    return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
            in_place)
def compute_reward(self, action, obs):
    """Shaped reward for turning a dial: reach a push point offset from
    the dial with a closing gripper, then rotate the dial to the target.

    Returns:
        (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
        in_place) — tcp_opened is reported as a constant 0 here.
    """
    obj = self._get_pos_objects()
    # Point on the dial rim the fingers should push against
    dial_push_position = self._get_pos_objects() + np.array(
        [0.05, 0.02, 0.09])
    tcp = self.tcp_center
    target = self._target_pos.copy()

    target_to_obj = (obj - target)
    target_to_obj = np.linalg.norm(target_to_obj)
    target_to_obj_init = (self.dial_push_position - target)
    target_to_obj_init = np.linalg.norm(target_to_obj_init)

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=abs(target_to_obj_init - self.TARGET_RADIUS),
        sigmoid='long_tail',
    )

    dial_reach_radius = 0.005
    tcp_to_obj = np.linalg.norm(dial_push_position - tcp)
    tcp_to_obj_init = np.linalg.norm(
        self.dial_push_position - self.init_tcp)
    reach = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, dial_reach_radius),
        margin=abs(tcp_to_obj_init - dial_reach_radius),
        sigmoid='gaussian',
    )
    # Clamp commanded gripper effort to [0, 1]
    gripper_closed = min(max(0, action[-1]), 1)

    reach = reward_utils.hamacher_product(reach, gripper_closed)
    tcp_opened = 0
    object_grasped = reach

    reward = 10 * reward_utils.hamacher_product(reach, in_place)
    return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
            in_place)
def _reward_pos(obs, target_pos):
    """Positional reward for grabbing a lid's handle and placing the lid
    on a box.

    Returns:
        (ready_to_lift, lifted) — "hand is caging the handle" and "lid is
        lifted/placed" shaped terms.
    """
    hand = obs[:3]
    # Offset up to the lid's handle
    lid = obs[4:7] + np.array([.0, .0, .02])

    threshold = 0.02
    # floor is a 3D funnel centered on the lid's handle
    radius = np.linalg.norm(hand[:2] - lid[:2])
    if radius <= threshold:
        floor = 0.0
    else:
        floor = 0.04 * np.log(radius - threshold) + 0.4
    # prevent the hand from running into the handle prematurely by keeping
    # it above the "floor"
    above_floor = 1.0 if hand[2] >= floor else reward_utils.tolerance(
        floor - hand[2],
        bounds=(0.0, 0.01),
        margin=floor / 2.0,
        sigmoid='long_tail',
    )
    # grab the lid's handle
    in_place = reward_utils.tolerance(
        np.linalg.norm(hand - lid),
        bounds=(0, 0.02),
        margin=0.5,
        sigmoid='long_tail',
    )
    ready_to_lift = reward_utils.hamacher_product(above_floor, in_place)

    # now actually put the lid on the box
    pos_error = target_pos - lid
    error_scale = np.array([1., 1., 3.])  # Emphasize Z error
    a = 0.2  # Relative importance of just *trying* to lift the lid at all
    b = 0.8  # Relative importance of placing the lid on the box
    lifted = a * float(lid[2] > 0.04) + b * reward_utils.tolerance(
        np.linalg.norm(pos_error * error_scale),
        bounds=(0, 0.05),
        margin=0.25,
        sigmoid='long_tail',
    )

    return ready_to_lift, lifted
def compute_reward(self, action, obs):
    """Shaped reward for grasping, lifting, and placing the object at the
    target, with penalty bands that discourage approaching the goal from
    the wrong side.

    Returns:
        [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
        in_place]
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    obj = obs[4:7]
    tcp_opened = obs[3]
    target = self._target_pos

    obj_to_target = np.linalg.norm(obj - target)
    tcp_to_obj = np.linalg.norm(obj - tcp)
    in_place_margin = np.linalg.norm(self.obj_init_pos - target)

    in_place = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, _TARGET_RADIUS),
        margin=in_place_margin,
        sigmoid='long_tail',
    )

    object_grasped = self._gripper_caging_reward(action=action,
                                                 obj_pos=obj,
                                                 obj_radius=0.02,
                                                 pad_success_thresh=0.05,
                                                 object_reach_radius=0.01,
                                                 xz_thresh=0.01,
                                                 high_density=False)
    reward = reward_utils.hamacher_product(object_grasped, in_place)

    # Inside the approach corridor just short of the target: fade the
    # placement reward the lower and deeper the object is
    if (0.0 < obj[2] < 0.24 and \
            (target[0] - 0.15 < obj[0] < target[0] + 0.15) and \
            ((target[1] - 3 * _TARGET_RADIUS) < obj[1] < target[1])):
        z_scaling = (0.24 - obj[2]) / 0.24
        y_scaling = (obj[1] - (target[1] - 3 * _TARGET_RADIUS)) / (
            3 * _TARGET_RADIUS)
        bound_loss = reward_utils.hamacher_product(y_scaling, z_scaling)
        in_place = np.clip(in_place - bound_loss, 0.0, 1.0)

    # Overshot the target in Y while still low: no placement reward
    if ((0.0 < obj[2] < 0.24) and \
            (target[0] - 0.15 < obj[0] < target[0] + 0.15) and \
            (obj[1] > target[1])):
        in_place = 0.0

    if tcp_to_obj < 0.025 and (tcp_opened > 0) and \
            (obj[2] - 0.01 > self.obj_init_pos[2]):
        reward += 1. + 5. * in_place
    if obj_to_target < _TARGET_RADIUS:
        reward = 10.

    return [
        reward,
        tcp_to_obj,
        tcp_opened,
        obj_to_target,
        object_grasped,
        in_place
    ]
def _reward_pos(obs, theta):
    """Positional reward for gripping a door handle and swinging the door
    open.

    Args:
        obs: observation; obs[:3] is the hand position, obs[4:7] the
            handle position.
        theta: door hinge angle (radians); negative when opening.

    Returns:
        (ready_to_open, opened) shaped terms.
    """
    hand = obs[:3]
    door = obs[4:7] + np.array([-0.05, 0, 0])

    threshold = 0.12
    # floor is a 3D funnel centered on the door handle
    radius = np.linalg.norm(hand[:2] - door[:2])
    if radius <= threshold:
        floor = 0.0
    else:
        floor = 0.04 * np.log(radius - threshold) + 0.4
    # prevent the hand from running into the handle prematurely by keeping
    # it above the "floor"
    above_floor = 1.0 if hand[2] >= floor else reward_utils.tolerance(
        floor - hand[2],
        bounds=(0.0, 0.01),
        margin=floor / 2.0,
        sigmoid='long_tail',
    )
    # move the hand to a position between the handle and the main door body
    in_place = reward_utils.tolerance(
        np.linalg.norm(hand - door - np.array([0.05, 0.03, -0.01])),
        bounds=(0, threshold / 2.0),
        margin=0.5,
        sigmoid='long_tail',
    )
    ready_to_open = reward_utils.hamacher_product(above_floor, in_place)

    # now actually open the door
    door_angle = -theta
    a = 0.2  # Relative importance of just *trying* to open the door at all
    b = 0.8  # Relative importance of fully opening the door
    opened = a * float(theta < -np.pi / 90.) + b * reward_utils.tolerance(
        np.pi / 2. + np.pi / 6 - door_angle,
        bounds=(0, 0.5),
        margin=np.pi / 3.,
        sigmoid='long_tail',
    )

    return ready_to_open, opened
def compute_reward(self, action, obs):
    """Shaped reward for pressing a button along the Y axis.

    Returns:
        (reward, tcp_to_obj, tcp_open, obj_to_target, near_button,
        button_pressed)
    """
    del action  # reward is computed from state only
    obj = obs[4:7]
    tcp = self.tcp_center

    tcp_to_obj = np.linalg.norm(obj - tcp)
    tcp_to_obj_init = np.linalg.norm(obj - self.init_tcp)
    # Only the Y displacement counts as "pressed"
    obj_to_target = abs(self._target_pos[1] - obj[1])

    near_button = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, 0.01),
        margin=tcp_to_obj_init,
        sigmoid='long_tail',
    )
    button_pressed = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, 0.005),
        margin=self._obj_to_target_init,
        sigmoid='long_tail',
    )

    reward = 0.0
    if tcp_to_obj > 0.07:
        # Far from the button: reward closing the gripper while nearing it
        tcp_status = (1 - obs[3]) / 2.0
        reward = 2 * reward_utils.hamacher_product(tcp_status, near_button)
    else:
        # Near the button: flat bonus plus open-gripper and press terms
        reward = 2
        reward += 2 * (1 + obs[3])
        reward += 4 * button_pressed ** 2
    return (
        reward,
        tcp_to_obj,
        obs[3],
        obj_to_target,
        near_button,
        button_pressed
    )
def compute_reward(self, action, obs):
    """Shaped reward for reaching a faucet handle and turning it to the
    target.

    Returns:
        (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
        in_place) — tcp_opened is reported as a constant 0 here.
    """
    del action  # reward is computed from state only
    # Offset to the handle's graspable knob
    obj = obs[4:7] + np.array([-.04, .0, .03])
    tcp = self.tcp_center
    target = self._target_pos.copy()

    target_to_obj = (obj - target)
    target_to_obj = np.linalg.norm(target_to_obj)
    target_to_obj_init = (self.obj_init_pos - target)
    target_to_obj_init = np.linalg.norm(target_to_obj_init)

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self._target_radius),
        margin=abs(target_to_obj_init - self._target_radius),
        sigmoid='long_tail',
    )

    faucet_reach_radius = 0.01
    tcp_to_obj = np.linalg.norm(obj - tcp)
    tcp_to_obj_init = np.linalg.norm(self.obj_init_pos - self.init_tcp)
    reach = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, faucet_reach_radius),
        margin=abs(tcp_to_obj_init - faucet_reach_radius),
        sigmoid='gaussian',
    )

    tcp_opened = 0
    object_grasped = reach

    reward = 2 * reach + 3 * in_place
    reward *= 2
    # Saturate on success
    reward = 10 if target_to_obj <= self._target_radius else reward

    return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
            in_place)
def compute_reward(self, actions, obs):
    """Shaped reward for approaching the object low to the table and
    sliding it to the target.

    Returns:
        [reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
        in_place]
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    obj = obs[4:7]
    tcp_opened = obs[3]
    target = self._target_pos

    obj_to_target = np.linalg.norm(obj - target)
    in_place_margin = np.linalg.norm(self.obj_init_pos - target)
    in_place = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, _TARGET_RADIUS),
        margin=in_place_margin - _TARGET_RADIUS,
        sigmoid='long_tail',
    )

    tcp_to_obj = np.linalg.norm(tcp - obj)
    obj_grasped_margin = np.linalg.norm(self.init_tcp - self.obj_init_pos)
    object_grasped = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, _TARGET_RADIUS),
        margin=obj_grasped_margin - _TARGET_RADIUS,
        sigmoid='long_tail',
    )
    # Removed an unused `hamacher_product(object_grasped, in_place)`
    # computation that never contributed to the reward.

    reward = 1.5 * object_grasped

    # Once the hand is low (near the table) and close to the object,
    # switch to placement shaping
    if tcp[2] <= 0.03 and tcp_to_obj < 0.07:
        reward = 2 + (7 * in_place)

    if obj_to_target < _TARGET_RADIUS:
        reward = 10.
    return [
        reward,
        tcp_to_obj,
        tcp_opened,
        obj_to_target,
        object_grasped,
        in_place
    ]
def _reward_pos(hammer_head, target_pos):
    """Shaped positional reward for driving the hammer head to the target.

    A small bonus (0.1) rewards merely lifting the hammer off the table;
    the remaining weight (0.9) rewards closing in on the nail.
    """
    lift_bonus = 0.1 if hammer_head[2] > 0.02 else 0.0
    hit_term = reward_utils.tolerance(
        np.linalg.norm(target_pos - hammer_head),
        bounds=(0, 0.02),
        margin=0.2,
        sigmoid='long_tail',
    )
    return lift_bonus + 0.9 * hit_term
def _reward_pos(wrench_center, target_pos):
    """Shaped positional reward for holding the wrench above the peg.

    The goal point hovers 0.1 above `target_pos`. A small bonus (0.1)
    rewards merely lifting the wrench; the remaining weight (0.9) rewards
    closing the distance to the hover point.
    """
    hover_point = target_pos + np.array([.0, .0, .1])
    lift_bonus = 0.1 if wrench_center[2] > 0.02 else 0.0
    place_term = reward_utils.tolerance(
        np.linalg.norm(hover_point - wrench_center),
        bounds=(0, 0.02),
        margin=0.2,
        sigmoid='long_tail',
    )
    return lift_bonus + 0.9 * place_term
def compute_reward(self, actions, obs):
    """Shaped reward for reaching a handle and pressing/pulling it to the
    target height.

    Returns:
        (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
        in_place) — tcp_opened is reported as a constant 0 here.
    """
    del actions  # reward is computed from state only
    # Removed unused local `objPos = obs[4:7]`; the handle position comes
    # from `_get_pos_objects()` instead.
    obj = self._get_pos_objects()
    tcp = self.tcp_center
    target = self._target_pos.copy()

    # Only the vertical (Z) displacement counts toward completion
    target_to_obj = (obj[2] - target[2])
    target_to_obj = np.linalg.norm(target_to_obj)
    target_to_obj_init = (self._handle_init_pos[2] - target[2])
    target_to_obj_init = np.linalg.norm(target_to_obj_init)

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, self.TARGET_RADIUS),
        margin=abs(target_to_obj_init - self.TARGET_RADIUS),
        sigmoid='long_tail',
    )

    handle_radius = 0.02
    tcp_to_obj = np.linalg.norm(obj - tcp)
    tcp_to_obj_init = np.linalg.norm(self._handle_init_pos - self.init_tcp)
    reach = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, handle_radius),
        margin=abs(tcp_to_obj_init - handle_radius),
        sigmoid='long_tail',
    )
    tcp_opened = 0
    object_grasped = reach

    reward = reward_utils.hamacher_product(reach, in_place)
    # Saturate on success, then scale to [0, 10]
    reward = 1 if target_to_obj <= self.TARGET_RADIUS else reward
    reward *= 10
    return (reward, tcp_to_obj, tcp_opened, target_to_obj, object_grasped,
            in_place)
def compute_reward(self, action, obs):
    """Shaped reward for pressing a lock down (along Z) with the left
    gripper pad.

    Returns:
        (reward, tcp_to_obj, tcp_open, obj_to_target, near_lock,
        lock_pressed)
    """
    del action  # reward is computed from state only
    obj = obs[4:7]
    # Track the left finger pad, which does the pressing
    tcp = self.get_body_com('leftpad')

    # De-emphasize X error; emphasize Y so the pad lines up first
    scale = np.array([0.25, 1., 0.5])
    tcp_to_obj = np.linalg.norm((obj - tcp) * scale)
    tcp_to_obj_init = np.linalg.norm((obj - self.init_left_pad) * scale)

    # Only the vertical (Z) displacement counts as "pressed"
    obj_to_target = abs(self._target_pos[2] - obj[2])

    tcp_opened = max(obs[3], 0.0)
    near_lock = reward_utils.tolerance(
        tcp_to_obj,
        bounds=(0, 0.01),
        margin=tcp_to_obj_init,
        sigmoid='long_tail',
    )
    lock_pressed = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, 0.005),
        margin=self._lock_length,
        sigmoid='long_tail',
    )

    reward = 2 * reward_utils.hamacher_product(tcp_opened, near_lock)
    reward += 8 * lock_pressed

    return (
        reward,
        tcp_to_obj,
        obs[3],
        obj_to_target,
        near_lock,
        lock_pressed
    )
def compute_reward(self, actions, obs):
    """Reward for reaching: move the TCP to the target position.

    Removed unused locals (`obj`, `tcp_opened`, `obj_to_target`) that
    were read from `obs` but never contributed to the reward.

    Returns:
        [reward, tcp_to_target, in_place] — a 3-element list, unlike the
        6-tuples returned by sibling reward functions.
    """
    _TARGET_RADIUS = 0.05
    tcp = self.tcp_center
    target = self._target_pos

    tcp_to_target = np.linalg.norm(tcp - target)

    in_place_margin = np.linalg.norm(self.hand_init_pos - target)
    in_place = reward_utils.tolerance(tcp_to_target,
                                      bounds=(0, _TARGET_RADIUS),
                                      margin=in_place_margin,
                                      sigmoid='long_tail',)

    return [10 * in_place, tcp_to_target, in_place]
def compute_reward(self, action, obs):
    """Shaped reward for grasping the object and pushing it back to the
    target, with X/Y error emphasized over Z.

    Returns:
        (reward, tcp_to_obj, tcp_opened, obj_to_target_dist,
        object_grasped, in_place)
    """
    obj = obs[4:7]
    target = self._target_pos.copy()

    # Emphasize X and Y errors
    scale = np.array([2., 2., 1.])
    target_to_obj = (obj - target) * scale
    target_to_obj = np.linalg.norm(target_to_obj)
    target_to_obj_init = (self.obj_init_pos - target) * scale
    target_to_obj_init = np.linalg.norm(target_to_obj_init)

    in_place = reward_utils.tolerance(
        target_to_obj,
        bounds=(0, 0.05),
        margin=target_to_obj_init,
        sigmoid='long_tail',
    )
    tcp_opened = obs[3]
    tcp_to_obj = np.linalg.norm(obj - self.tcp_center)

    object_grasped = self._gripper_caging_reward(
        action,
        obj,
        object_reach_radius=0.04,
        obj_radius=0.02,
        pad_success_thresh=0.05,
        xz_thresh=0.05,
        desired_gripper_effort=0.7,
        medium_density=True
    )

    reward = reward_utils.hamacher_product(object_grasped, in_place)

    if tcp_to_obj < 0.04 and tcp_opened > 0:
        reward += 1. + 5. * in_place
    if target_to_obj < 0.05:
        reward = 10.
    return (
        reward,
        tcp_to_obj,
        tcp_opened,
        np.linalg.norm(obj - target),  # recompute to avoid `scale` above
        object_grasped,
        in_place
    )
def compute_reward(self, action, obs):
    """Shaped reward for grasping the object, sliding it along X, and
    bringing it to the target.

    Returns:
        (reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
        in_place, grasp_success)
    """
    tcp = self.tcp_center
    obj = obs[4:7]
    tcp_opened = obs[3]
    target = self._target_pos

    tcp_to_obj = np.linalg.norm(obj - tcp)
    obj_to_target = np.linalg.norm(obj - target)

    pad_success_margin = 0.05
    object_reach_radius = 0.01
    x_z_margin = 0.005
    obj_radius = 0.025

    object_grasped = self._gripper_caging_reward(
        action,
        obj,
        object_reach_radius=object_reach_radius,
        obj_radius=obj_radius,
        pad_success_thresh=pad_success_margin,
        xz_thresh=x_z_margin,
        desired_gripper_effort=0.8,
        high_density=True)
    in_place_margin = np.linalg.norm(self.obj_init_pos - target)

    in_place = reward_utils.tolerance(
        obj_to_target,
        bounds=(0, 0.05),
        margin=in_place_margin,
        sigmoid='long_tail',
    )
    # Grasped = gripper partially open around the object AND the object
    # displaced along X from its start
    grasp_success = (tcp_opened > 0.5 and
                     (obj[0] - self.obj_init_pos[0] > 0.015))

    reward = 2 * object_grasped

    if grasp_success and tcp_to_obj < 0.035:
        reward = 1 + 2 * object_grasped + 5 * in_place

    if obj_to_target <= 0.05:
        reward = 10.

    return (reward, tcp_to_obj, tcp_opened, obj_to_target, object_grasped,
            in_place, float(grasp_success))