def _compute_reward(self):
    reward = np.float32(0.0)
    robot_obs, _ = self._robot.get_observation()
    world_obs, _ = self._world.get_observation()

    # d1: hand-to-object distance, d2: object-to-target distance
    d1 = goal_distance(np.array(robot_obs[:3]), np.array(world_obs[:3]))
    d2 = goal_distance(np.array(world_obs[:3]), np.array(self._tg_pose))

    if self._reward_type == 0:
        # dense negative-distance reward plus a success bonus
        reward = -d1 - d2
        if d2 <= self._target_dist_min:
            reward += np.float32(1000.0)

    elif self._reward_type == 1:
        # normalized reward: a reach term, plus a push term once the hand
        # is close enough to the object
        rew1 = 0.125
        rew2 = 0.25
        if d1 > 0.1:
            reward = rew1 * (1 - d1 / self._init_dist_hand_obj)
        else:
            reward = rew1 * (1 - d1 / self._init_dist_hand_obj) + \
                     rew2 * (1 - d2 / self._max_dist_obj_tg)
        if d2 <= self._target_dist_min:
            reward += np.float32(1000.0)

    return reward
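# The reward function above relies on a goal_distance helper defined
# elsewhere in the repository. A minimal sketch, assuming it computes the
# Euclidean norm between two position vectors (an assumption based on how it
# is called here, not the repository's actual implementation):
import numpy as np

def goal_distance(goal_a, goal_b):
    # Euclidean (L2) distance between two equally-shaped position arrays
    assert goal_a.shape == goal_b.shape
    return np.linalg.norm(goal_a - goal_b, axis=-1)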
def _compute_reward(self):
    robot_obs, _ = self._robot.get_observation()
    world_obs, _ = self._world.get_observation()

    d1 = goal_distance(np.array(robot_obs[:3]), np.array(world_obs[:3]))
    d2 = goal_distance(np.array(world_obs[:3]), np.array(self._target_pose))

    # dense negative-distance reward; on success it is replaced by a large
    # bonus that grows as the final object-to-target distance shrinks
    reward = -d1 - d2
    if d2 <= self._target_dist_min:
        reward = np.float32(1000.0) + (100 - d2 * 80)

    return reward
def reset(self):
    self.reset_simulation()

    # --- sample target pose --- #
    world_obs, _ = self._world.get_observation()
    self._tg_pose = self.sample_tg_pose(world_obs[:3])

    self.debug_gui()

    robot_obs, _ = self._robot.get_observation()
    world_obs, _ = self._world.get_observation()

    # store the initial hand-to-object and object-to-target distances,
    # used to normalize the reward terms in _compute_reward
    self._init_dist_hand_obj = goal_distance(np.array(robot_obs[:3]), np.array(world_obs[:3]))
    self._max_dist_obj_tg = goal_distance(np.array(world_obs[:3]), np.array(self._tg_pose))

    obs = self.get_goal_observation()

    # scale the raw observation into the bounds of the observation space
    scaled_obs = scale_gym_data(self.observation_space['observation'], obs['observation'])
    obs['observation'] = scaled_obs

    return obs
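# reset() scales the raw observation before returning it. A minimal sketch of
# scale_gym_data, assuming it linearly maps each component of a gym Box space
# from [low, high] to [-1, 1] (this is an assumption; the actual helper is
# defined elsewhere in the repository and may use different target bounds):
def scale_gym_data(space, data):
    # linearly rescale data from the Box bounds [low, high] to [-1, 1]
    low, high = space.low, space.high
    return 2.0 * (data - low) / (high - low) - 1.0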
def _compute_reward(self):
    robot_obs, _ = self._robot.get_observation()
    world_obs, _ = self._world.get_observation()

    # dense reaching reward: negative hand-to-object distance, with a large
    # bonus once the hand is within the success threshold
    d = goal_distance(np.array(robot_obs[:3]), np.array(world_obs[:3]))
    reward = -d
    if d <= self._target_dist_min:
        reward = np.float32(1000.0) + (100 - d * 80)

    return reward
def _termination(self):
    world_obs, _ = self._world.get_observation()
    d = goal_distance(np.array(world_obs[:3]), np.array(self._target_pose))

    if d <= self._target_dist_min:
        self.terminated = 1
        print('------------->>> success!')
        print('final reward:', self._compute_reward())
        return np.float32(1.0)

    # also terminate on timeout or if a previous step already set the flag
    if self.terminated or self._env_step_counter > self._max_steps:
        return np.float32(1.0)

    return np.float32(0.0)
def compute_reward(self, achieved_goal, goal, info):
    # Sparse reward on the distance between the goal and the achieved goal:
    # 0 when within the success threshold, -1 otherwise.
    d = goal_distance(achieved_goal[:3], goal[:3])
    return -(d > self._target_dist_min).astype(np.float32)
def _is_success(self, achieved_goal, goal):
    # The episode is successful when the achieved goal lies within the
    # success threshold of the desired goal.
    d = goal_distance(achieved_goal[:3], goal[:3])
    return d <= self._target_dist_min
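# Illustration (hypothetical values): compute_reward and _is_success agree by
# construction, since the sparse reward is 0.0 exactly when the success test
# passes. Assuming _target_dist_min = 0.05 and a Euclidean goal_distance:
#
#   achieved = np.array([0.50, 0.10, 0.20])
#   desired  = np.array([0.50, 0.10, 0.23])   # 0.03 away from achieved
#   env.compute_reward(achieved, desired, {})  # -> 0.0 (within threshold)
#   env._is_success(achieved, desired)         # -> True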