# NOTE: this excerpt assumes module-level imports/definitions that appear
# above it in the original repo: os, numpy as np, xml.etree.ElementTree,
# FileLock (from filelock), mujoco_env, utils (gym), Robot, MujocoSimRobot,
# euler2quat, angle_difference, and the Task enum / WHICH_TASK constant.


class BaodingEnv(mujoco_env.MujocoEnv, utils.EzPickle):

    def __init__(
            self,
            n_jnt=24,
            frame_skip=40,
    ):

        # init vars
        self.which_task = Task(WHICH_TASK)
        self.time = 0
        self.counter = 0
        self.grasp_sid = 0
        self.object1_bid = 0
        self.object2_bid = 0
        self.target1_sid = 0
        self.target2_sid = 0
        self.new_task_sid = 0
        self.obs_dict = {}
        self.reward_dict = {}
        self.frame_skip = frame_skip
        self.skip = frame_skip

        # dims
        assert n_jnt > 0
        self.n_jnt = n_jnt
        self.n_obj_dofs = 2 * 7  # two free joints, each w/ 7 dof (3 pos, 4 quat)
        self.n_dofs = self.n_jnt + self.n_obj_dofs

        # xml file paths, based on task
        if self.which_task == Task.MOVE_TO_LOCATION:
            self.xml_path = os.path.join(os.path.dirname(__file__), 'assets',
                                         'shadowhand_baoding_1visible.xml')
        else:
            self.xml_path = os.path.join(os.path.dirname(__file__), 'assets',
                                         'shadowhand_baoding_2.xml')

        # init desired trajectory, for baoding
        self.x_radius = 0.02
        self.y_radius = 0.02 * 1.5 * 1.2
        self.center_pos = [0.005, 0.055]

        # balls start at these angles
        #   1 = yellow = bottom right
        #   2 = pink = top left
        self.ball_1_starting_angle = 3 * np.pi / 4.0
        self.ball_2_starting_angle = -np.pi / 4.0

        # Make robot
        self.robot = Robot(
            n_jnt=self.n_jnt,
            n_obj=self.n_obj_dofs,
            n_dofs=self.n_dofs,
            pos_bounds=[[-40, 40]] * self.n_dofs,  # dummy
            vel_bounds=[[-3, 3]] * self.n_dofs,
        )

        # Initialize mujoco env
        # (startup/initializing guard the step/reward logic during model init)
        self.startup = True
        self.initializing = True
        super().__init__(
            self.xml_path,
            frame_skip=frame_skip,
            camera_settings=dict(
                distance=0.7,
                azimuth=-60,
                elevation=-50,
            ),
        )
        utils.EzPickle.__init__(self)
        self.startup = False
        self.initializing = False

        # init target and body sites
        self.grasp_sid = self.sim.model.site_name2id('S_grasp')
        self.object1_bid = self.sim.model.body_name2id('ball1')
        self.object2_bid = self.sim.model.body_name2id('ball2')
        self.target1_sid = self.model.site_name2id('target1_site')
        self.target2_sid = self.model.site_name2id('target2_site')
        if self.which_task == Task.MOVE_TO_LOCATION:
            self.new_task_sid = self.model.site_name2id('new_task_site')

        # reset position
        self.init_qpos = self.sim.model.key_qpos[0].copy()

        # action ranges (asymmetric, centered on the keyframe pose)
        self.act_mid = self.init_qpos[:self.n_jnt].copy()
        self.upper_rng = 0.9 * (self.model.actuator_ctrlrange[:, 1] - self.act_mid)
        self.lower_rng = 0.9 * (self.act_mid - self.model.actuator_ctrlrange[:, 0])

        # indices
        if not self.which_task == Task.MOVE_TO_LOCATION:
            self.duplicateData_switchObjs = True
            self.objInfo_start1 = self.n_jnt
            self.objInfo_start2 = self.n_jnt + 3 + 3
            self.targetInfo_start1 = -4
            self.targetInfo_start2 = -2

        # ball size randomization bounds (geom 'size', set via set_size below)
        self.domain_low = 0.026
        self.domain_high = 0.027
        self.test_domain = 0.07
        self.xml_location1 = os.path.join(os.path.dirname(__file__), 'assets',
                                          'baoding_ball_1.xml')
        self.xml_location2 = os.path.join(os.path.dirname(__file__), 'assets',
                                          'baoding_ball_2.xml')
        self.mode = 'train'

    def get_reward(self, observations, actions):
        """get rewards of a given (observations, actions) pair

        Args:
            observations: (batchsize, obs_dim) or (obs_dim,)
            actions: (batchsize, ac_dim) or (ac_dim,)

        Return:
            r_total: reward for that pair: (batchsize,1) or (1,)
            done: True if env reaches terminal state: (batchsize,1) or (1,)
        """

        # initialize and reshape as needed, for batch mode
        self.reward_dict = {}
        if len(observations.shape) == 1:
            observations = np.expand_dims(observations, axis=0)
            actions = np.expand_dims(actions, axis=0)
            batch_mode = False
        else:
            batch_mode = True

        # obs layout:
        #   self.obs_dict['robot_pos'],     # 24
        #   self.obs_dict['object1_pos'],   # 3
        #   self.obs_dict['object1_velp'],  # 3
        #   self.obs_dict['object2_pos'],   # 3
        #   self.obs_dict['object2_velp'],  # 3
        #   self.obs_dict['target1_pos'],   # 2
        #   self.obs_dict['target2_pos']    # 2
        # get vars
        joint_pos = observations[:, :self.n_jnt]
        wrist_angle = observations[:, 1]
        obj1_pos = observations[:, self.n_jnt:self.n_jnt + 3]
        obj2_pos = observations[:, self.n_jnt + 6:self.n_jnt + 6 + 3]
        target1_pos = observations[:, -4:-2]
        target2_pos = observations[:, -2:]
        zeros = np.zeros(wrist_angle.shape)

        # position difference from the desired
        pos_dist_1 = np.linalg.norm(obj1_pos[:, :2] - target1_pos, axis=1)
        pos_dist_2 = np.linalg.norm(obj2_pos[:, :2] - target2_pos, axis=1)
        self.reward_dict['pos_dist_1'] = -pos_dist_1
        self.reward_dict['pos_dist_2'] = -pos_dist_2

        # height
        # (use copies: assigning through the shared `zeros` array in place
        # would alias is_fall_1, is_fall_2, and wrist_too_high together)
        is_fall_1 = zeros.copy()
        is_fall_2 = zeros.copy()
        if not self.startup:
            height_threshold = 0.06
            is_fall_1[obj1_pos[:, 2] < height_threshold] = 1
            is_fall_2[obj2_pos[:, 2] < height_threshold] = 1
        if self.which_task == Task.MOVE_TO_LOCATION:
            is_fall = is_fall_1  # only care about ball #1
        else:
            is_fall = np.logical_or(is_fall_1, is_fall_2)  # keep both balls up
        self.reward_dict['drop_penalty'] = -500 * is_fall

        # done based on is_fall
        dones = (is_fall == 1) if not self.startup else zeros

        # penalize wrist angle for lifting up (positive) too much
        wrist_threshold = 0.15
        wrist_too_high = zeros.copy()
        wrist_too_high[wrist_angle > wrist_threshold] = 1
        self.reward_dict['wrist_angle'] = -10 * wrist_too_high

        # total rewards
        if self.which_task == Task.MOVE_TO_LOCATION:
            self.reward_dict['r_total'] = 5 * self.reward_dict['pos_dist_1'] \
                + self.reward_dict['drop_penalty']
        else:
            self.reward_dict['r_total'] = 5 * self.reward_dict['pos_dist_1'] \
                + 5 * self.reward_dict['pos_dist_2'] \
                + self.reward_dict['drop_penalty'] \
                + self.reward_dict['wrist_angle']

        # return
        if not batch_mode:
            return self.reward_dict['r_total'][0], dones[0]
        return self.reward_dict['r_total'], dones

    def get_score(self, obs):
        if self.startup:
            score = 0
        else:
            # get vars
            joint_pos = obs[:self.n_jnt]
            obj1_pos = obs[self.n_jnt:self.n_jnt + 3]
            obj2_pos = obs[self.n_jnt + 6:self.n_jnt + 6 + 3]
            target1_pos = obs[-4:-2]
            target2_pos = obs[-2:]

            if self.which_task == Task.MOVE_TO_LOCATION:
                score = -np.linalg.norm(obj1_pos[:2] - target1_pos)
            else:
                pos_dist_1 = np.linalg.norm(obj1_pos[:2] - target1_pos)
                pos_dist_2 = np.linalg.norm(obj2_pos[:2] - target2_pos)
                score = -pos_dist_1 - pos_dist_2
        return score
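    # ------------------------------------------------------------------
    # step() contract (editor's summary of the method below): actions in
    # [-1, 1] are rescaled asymmetrically around the keyframe pose
    # (act_mid) via upper_rng / lower_rng, the goal for this timestep is
    # read from the pre-generated self.goal trajectory at self.counter,
    # and a trailing "catastrophe" flag (1.0 if a ball dropped) is
    # appended to the returned observation.
    # ------------------------------------------------------------------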
    def step(self, a):

        # set the goal as the next entry of self.goal
        if self.startup:
            if self.which_task == Task.MOVE_TO_LOCATION:
                self.desired_pose = np.array([0, 0, 0])
            else:
                self.desired_pose = np.array([0, 0])
        else:
            self.desired_pose = self.goal[self.counter].copy()

        if self.which_task == Task.MOVE_TO_LOCATION:
            if not self.startup:
                # move the target to the right location (visualization)
                self.model.site_pos[self.new_task_sid, 0] = self.desired_pose[0]
                self.model.site_pos[self.new_task_sid, 1] = self.desired_pose[1]
                self.model.site_pos[self.new_task_sid, 2] = self.desired_pose[2] + 0.02

                # move the baoding targets out of the way
                self.model.site_pos[self.target1_sid, 0] = 0
                self.model.site_pos[self.target1_sid, 2] = 0.05
                self.model.site_pos[self.target2_sid, 0] = 0
                self.model.site_pos[self.target2_sid, 2] = 0.05
                self.model.site_pos[self.target1_sid, 1] = 0
                self.model.site_pos[self.target2_sid, 1] = 0
        else:
            # set target position for visualization
            desired_angle_wrt_palm = np.array([0, 0])

            # shift angle by the starting ball position
            if not self.startup:
                desired_angle_wrt_palm = self.desired_pose.copy()
                desired_angle_wrt_palm[0] = desired_angle_wrt_palm[0] + self.ball_1_starting_angle
                desired_angle_wrt_palm[1] = desired_angle_wrt_palm[1] + self.ball_2_starting_angle

            # palm is x right, y down, z up
            # angle convention (world frame) is x right, y up, z out
            # (so switch our y for palm z)
            desired_positions_wrt_palm = [0, 0, 0, 0]
            desired_positions_wrt_palm[0] = self.x_radius * np.cos(desired_angle_wrt_palm[0]) + self.center_pos[0]
            desired_positions_wrt_palm[1] = self.y_radius * np.sin(desired_angle_wrt_palm[0]) + self.center_pos[1]
            desired_positions_wrt_palm[2] = self.x_radius * np.cos(desired_angle_wrt_palm[1]) + self.center_pos[0]
            desired_positions_wrt_palm[3] = self.y_radius * np.sin(desired_angle_wrt_palm[1]) + self.center_pos[1]

            if not self.startup:
                # self.model.site_pos is in the palm frame
                # self.data.site_xpos is in the world frame (populated after a forward call)
                self.model.site_pos[self.target1_sid, 0] = desired_positions_wrt_palm[0]
                self.model.site_pos[self.target1_sid, 2] = desired_positions_wrt_palm[1]
                self.model.site_pos[self.target2_sid, 0] = desired_positions_wrt_palm[2]
                self.model.site_pos[self.target2_sid, 2] = desired_positions_wrt_palm[3]

                # move upward, to be seen
                self.model.site_pos[self.target1_sid, 1] = -0.07
                self.model.site_pos[self.target2_sid, 1] = -0.07

        # clip and scale action
        a = np.clip(a, -1.0, 1.0)
        if not self.startup:
            # mean center and scale (asymmetric range around act_mid)
            a[a > 0] = self.act_mid[a > 0] + a[a > 0] * self.upper_rng[a > 0]
            a[a <= 0] = self.act_mid[a <= 0] + a[a <= 0] * self.lower_rng[a <= 0]

        # take the action
        self.robot.step(self, a, step_duration=self.skip * self.model.opt.timestep)
        self.counter += 1

        # get obs/rew/done/score
        obs = self._get_obs()
        reward, done = self.get_reward(obs, a)
        # compute the score before appending the catastrophe flag, since
        # get_score indexes the target entries from the end of obs
        score = self.get_score(obs)
        catastrophe = done
        obs = np.concatenate((obs, [float(catastrophe)]))
        env_info = {
            'time': self.time,
            'obs_dict': self.obs_dict,
            'rewards': self.reward_dict,
            'score': score
        }
        return obs, reward, done, env_info

    def _get_obs(self):
        self.robot.get_obs(self, robot_noise_ratio=0, object_noise_ratio=0)
        t, qp, qv, qp_obj, qv_obj = self.robot.get_obs_from_cache(self, -1)
        self.time = t

        # hand joint positions
        self.obs_dict = {}
        self.obs_dict['robot_pos'] = qp.copy()

        # object positions
        self.obs_dict['object1_pos'] = qp_obj[:3]
        self.obs_dict['object2_pos'] = qp_obj[3 + 4:3 + 4 + 3]

        # object translational velocities
        self.obs_dict['object1_velp'] = qv_obj[:3]
        self.obs_dict['object2_velp'] = qv_obj[3 + 4:3 + 4 + 3]

        # site locations in world frame, populated after the step/forward call
        if self.which_task == Task.MOVE_TO_LOCATION:
            target_1_sid_use = self.new_task_sid
        else:
            target_1_sid_use = self.target1_sid
        self.obs_dict['target1_pos'] = np.array([
            self.data.site_xpos[target_1_sid_use][0],
            self.data.site_xpos[target_1_sid_use][1]
        ])
        self.obs_dict['target2_pos'] = np.array([
            self.data.site_xpos[self.target2_sid][0],
            self.data.site_xpos[self.target2_sid][1]
        ])

        return np.concatenate([
            self.obs_dict['robot_pos'],     # 24
            self.obs_dict['object1_pos'],   # 3
            self.obs_dict['object1_velp'],  # 3
            self.obs_dict['object2_pos'],   # 3
            self.obs_dict['object2_velp'],  # 3
            self.obs_dict['target1_pos'],   # 2
            self.obs_dict['target2_pos']    # 2
        ])

    def reset_model(self, mode="train"):
        self.reset_pose = self.init_qpos.copy()
        self.reset_vel = self.init_qvel.copy()
        self.reset_goal = self.create_goal_trajectory()
        return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal,
                             mode=mode)
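    # ------------------------------------------------------------------
    # Domain randomization via XML editing (editor's note): set_size()
    # and set_weight() rewrite the ball asset XMLs on disk (under a file
    # lock, so parallel workers don't clobber each other) and then
    # rebuild the simulation from the modified model.
    # ------------------------------------------------------------------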
    def set_size(self, size):
        lock = FileLock(self.xml_path + '.lock')  # concurrency protection

        def modify_ball_xml(ball_xml_file):
            et = xml.etree.ElementTree.parse(ball_xml_file)
            et.find('body').find('geom').set('size', "%0.5f" % size)  # changing ball size
            et.write(ball_xml_file)

        with lock:
            modify_ball_xml(self.xml_location1)
            modify_ball_xml(self.xml_location2)
            self.sim_robot = MujocoSimRobot(
                self.xml_path,
                camera_settings=dict(
                    distance=0.7,
                    azimuth=-60,
                    elevation=-50,
                ))
            self.sim = self.sim_robot.sim
            self.model = self.sim_robot.model
            self.data = self.sim_robot.data
        if self.sim_robot.renderer._onscreen_renderer:
            import ipdb
            ipdb.set_trace()  # debug hook: should not trigger in headless runs
        self.close()
        self.render()

    def set_weight(self, weight):
        lock = FileLock(self.xml_path + '.lock')  # concurrency protection

        def modify_ball_xml(ball_xml_file):
            et = xml.etree.ElementTree.parse(ball_xml_file)
            et.find('body').find('geom').set('mass', "%0.3f" % weight)  # changing ball weight
            et.write(ball_xml_file)

        with lock:
            modify_ball_xml(self.xml_location1)
            modify_ball_xml(self.xml_location2)
            self.sim_robot = MujocoSimRobot(
                self.xml_path,
                camera_settings=dict(
                    distance=0.7,
                    azimuth=-60,
                    elevation=-50,
                ))
            self.sim = self.sim_robot.sim
            self.model = self.sim_robot.model
            self.data = self.sim_robot.data
        self.close()
        if self.sim_robot.renderer._onscreen_renderer:
            import ipdb
            ipdb.set_trace()  # debug hook: should not trigger in headless runs
        self.render()

    def do_reset(self, reset_pose, reset_vel, reset_goal=None, mode="train"):

        if mode == 'train':
            # sample a new ball size for this episode (note: despite the
            # name, ball_weights is passed to set_size, not set_weight)
            self.ball_weights = np.random.uniform(self.domain_low, self.domain_high)
            print(self.ball_weights)
            # self.set_weight(self.ball_weights)
            self.set_size(self.ball_weights)
        elif self.mode != 'test' and mode == 'test':
            # starting adaptation
            self.ball_weights = self.test_domain
            self.mode = mode
            # self.set_weight(self.test_domain)
            self.set_size(self.test_domain)

        #### reset counters
        self.counter = 0
        self.doesntSee1 = 0
        self.doesntSee2 = 0

        #### reset goal
        if reset_goal is None:
            self.goal = self.create_goal_trajectory()
        else:
            self.goal = reset_goal.copy()

        #### reset hand and objects
        self.robot.reset(self, reset_pose, reset_vel)
        self.sim.forward()
        return np.concatenate((self._get_obs(), [0.]))

    def create_goal_trajectory(self):

        len_of_goals = 1000

        # populate go-to task with a target location
        if self.which_task == Task.MOVE_TO_LOCATION:
            goal_pos = np.random.randint(4)
            desired_position = []
            if goal_pos == 0:
                desired_position.append(0.01)   # x
                desired_position.append(0.04)   # y
                desired_position.append(0.2)    # z
            elif goal_pos == 1:
                desired_position.append(0)
                desired_position.append(-0.06)
                desired_position.append(0.24)
            elif goal_pos == 2:
                desired_position.append(-0.02)
                desired_position.append(-0.02)
                desired_position.append(0.2)
            else:
                desired_position.append(0.03)
                desired_position.append(-0.02)
                desired_position.append(0.2)
            goal_traj = np.tile(desired_position, (len_of_goals, 1))

        # populate baoding task with a trajectory of goals to hit
        else:
            reward_option = 0
            period = 60
            goal_traj = []
            if self.which_task == Task.BAODING_CW:
                sign = -1
            if self.which_task == Task.BAODING_CCW:
                sign = 1

            ### Reward option: continuous circle
            if reward_option == 0:
                t = 0
                while t < len_of_goals:
                    angle_before_shift = sign * 2 * np.pi * (t / period)
                    goal_traj.append(
                        np.array([angle_before_shift, angle_before_shift]))
                    t += 1

            #### Reward option: increment in fourths
            else:
                angle_before_shift = 0
                t = 0
                while t < len_of_goals:
                    if t > 0 and t % 15 == 0:
                        angle_before_shift += np.pi / 2.0
                    goal_traj.append(
                        np.array([angle_before_shift, angle_before_shift]))
                    t += 1
            goal_traj = np.array(goal_traj)

        return goal_traj
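# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original file): a minimal random-action
# rollout of BaodingEnv, assuming the gym-style reset()/step() API inherited
# from mujoco_env.MujocoEnv, that WHICH_TASK selects a baoding task, and that
# the action dimension equals the number of actuated hand joints (n_jnt).
# ---------------------------------------------------------------------------
def _demo_baoding_rollout(num_steps=200):
    env = BaodingEnv()
    obs = env.reset()  # obs has a trailing catastrophe flag appended
    total_reward = 0.0
    for _ in range(num_steps):
        # actions are clipped to [-1, 1] inside step() before rescaling
        action = np.random.uniform(-1.0, 1.0, size=env.n_jnt)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:  # a ball fell below the height threshold
            break
    return total_reward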
class DClawTurnEnv(mujoco_env.MujocoEnv, utils.EzPickle):

    def __init__(self, n_jnt=9, n_obj=1, frame_skip=40):

        # init vars
        self.target_bid = 0
        self.object_qp = 0  # object joint angle
        self.object_qp_init = 0
        self.object_qv_init = 0
        self.goal = [0]  # target joint angle
        self.obs_dict = {}
        self.reward_dict = {}
        self.time = 0

        assert n_jnt > 0
        assert n_obj >= 0
        self.n_jnt = n_jnt
        self.n_obj = n_obj
        self.n_dofs = n_jnt + n_obj

        xml_path = os.path.join(os.path.dirname(__file__), 'assets',
                                'dclaw_valve3.xml')

        # Make robot
        self.robot = Robot(
            n_jnt=n_jnt,
            n_obj=n_obj,
            n_dofs=self.n_dofs,
            pos_bounds=[[-np.pi / 2.0, np.pi / 2.0]] * 10,
            vel_bounds=[[-5, 5]] * 10,
        )

        # Initialize mujoco env
        self.initializing = True
        super().__init__(
            xml_path,
            frame_skip=frame_skip,
            camera_settings=dict(
                distance=1.0,
                azimuth=140,
                elevation=-50,
            ),
        )
        utils.EzPickle.__init__(self)
        self.initializing = False
        self.skip = self.frame_skip

        # init target and action ranges
        self.target_bid = self.sim.model.body_name2id("target")
        self.act_mid = np.mean(self.model.actuator_ctrlrange, axis=1)
        self.act_rng = 0.5 * (self.model.actuator_ctrlrange[:self.n_jnt, 1] -
                              self.model.actuator_ctrlrange[:self.n_jnt, 0])

        # starting pose of dclaw
        self.init_qpos[[0, 3, 6]] = 0
        self.init_qpos[[1, 4, 7]] = -1.0
        self.init_qpos[[2, 5, 8]] = 1.35

    def get_reward(self, observations, actions):
        """get rewards of a given (observations, actions) pair

        Args:
            observations: (batchsize, obs_dim) or (obs_dim,)
            actions: (batchsize, ac_dim) or (ac_dim,)

        Return:
            r_total: reward for that pair: (batchsize,1) or (1,)
            done: True if env reaches terminal state: (batchsize,1) or (1,)
        """

        # initialize and reshape as needed, for batch mode
        self.reward_dict = {}
        if len(observations.shape) == 1:
            observations = np.expand_dims(observations, axis=0)
            actions = np.expand_dims(actions, axis=0)
            batch_mode = False
        else:
            batch_mode = True

        # get vars
        target_pos = observations[:, -1]
        screw_pos = observations[:, -3]
        joint_pos = observations[:, :self.n_jnt]
        joint_vel = observations[:, self.n_jnt:2 * self.n_jnt]
        zeros = np.zeros(target_pos.shape)
        dones = zeros.copy()  # this task is never terminated

        # screw position (wrapped angular distance to the target)
        screw_dist = np.abs(angle_difference(screw_pos, target_pos))

        # reward for proximity to the target
        # (use copies of `zeros`: in-place assignment through the shared
        # array would alias bonus_small and bonus_big together)
        self.reward_dict['r_target_dist'] = -10 * screw_dist
        bonus_small = zeros.copy()
        bonus_big = zeros.copy()
        bonus_small[screw_dist < 0.25] = 1
        bonus_big[screw_dist < 0.1] = 10
        self.reward_dict['bonus_small'] = bonus_small
        self.reward_dict['bonus_big'] = bonus_big

        # total reward
        self.reward_dict['r_total'] = self.reward_dict['r_target_dist'] + \
                                      self.reward_dict['bonus_small'] + \
                                      self.reward_dict['bonus_big']

        # return
        if not batch_mode:
            return self.reward_dict['r_total'][0], dones[0]
        return self.reward_dict['r_total'], dones

    def get_score(self, obs):
        target_pos = obs[-1]
        screw_pos = obs[-3]
        score = -1 * np.abs(angle_difference(screw_pos, target_pos))
        return score

    def step(self, a):

        a = np.clip(a, -1.0, 1.0)  # clamp the actions to limits
        if not self.initializing:
            a = self.act_mid + a * self.act_rng  # mean center and scale

        # apply actions and step
        self.robot.step(self, a[:self.n_jnt],
                        step_duration=self.frame_skip * self.model.opt.timestep)

        # obs/rew/done/score
        obs = self._get_obs()
        reward, done = self.get_reward(obs, a)
        score = self.get_score(obs)

        # finalize step
        env_info = {
            'time': self.time,
            'obs_dict': self.obs_dict,
            'rewards': self.reward_dict,
            'score': score
        }
        return obs, reward, done, env_info
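    # ------------------------------------------------------------------
    # Observation layout returned by _get_obs() below:
    #   hand_qp (9) | hand_qv (9) | lid_qp (1) | lid_qv (1) | goal (1)
    # so obs[-1] is the target angle and obs[-3] is the current valve
    # angle, matching the indexing in get_reward/get_score above.
    # ------------------------------------------------------------------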
    def _get_obs(self):

        # get obs
        self.robot.get_obs(self, robot_noise_ratio=0, object_noise_ratio=0)
        time, hand_qp, hand_qv, lid_qp, lid_qv = self.robot.get_obs_from_cache(self, -1)
        self.time = time

        # update target pose
        desired_orien = np.zeros(3)
        desired_orien[2] = self.goal[0]

        # update target visualization
        self.model.body_quat[self.target_bid] = euler2quat(desired_orien)

        # populate obs dictionary
        self.obs_dict = {}
        self.obs_dict['hand_qp'] = hand_qp
        self.obs_dict['hand_qv'] = hand_qv
        self.obs_dict['lid_qp'] = lid_qp
        self.obs_dict['lid_qv'] = lid_qv
        self.obs_dict['goal'] = self.goal

        return np.concatenate([
            self.obs_dict['hand_qp'],
            self.obs_dict['hand_qv'],
            self.obs_dict['lid_qp'],
            self.obs_dict['lid_qv'],
            self.obs_dict['goal']
        ])

    def reset_model(self):

        # valve target angle
        rand_angle = self.np_random.uniform(low=np.pi / 2, high=np.pi)
        rand_sign = self.np_random.randint(2)  # use the seeded RNG for reproducibility
        if rand_sign == 0:
            self.goal = [rand_angle]
        else:
            self.goal = [-rand_angle]

        # valve start angle
        self.object_qp = self.np_random.uniform(low=-np.pi / 4, high=np.pi / 4)

        # reset the joints and screw to set location/velocity
        self.reset_pose = np.append(self.init_qpos[:self.n_jnt], self.object_qp)
        self.reset_vel = np.append(self.init_qvel[:self.n_jnt], self.object_qv_init)
        self.reset_goal = self.goal.copy()

        return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal)

    def do_reset(self, reset_pose, reset_vel, reset_goal):

        # reset hand and object
        self.robot.reset(self, reset_pose, reset_vel)
        self.sim.forward()

        # reset target
        self.goal = reset_goal.copy()

        # return
        return self._get_obs()
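# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original file): the reward above depends
# on a project-local angle_difference() helper imported elsewhere in this
# repo. A common implementation wraps the signed difference into [-pi, pi),
# e.g.:
# ---------------------------------------------------------------------------
def _angle_difference_sketch(a, b):
    """Signed smallest difference between two angles, wrapped to [-pi, pi)."""
    return np.mod((a - b) + np.pi, 2 * np.pi) - np.pi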
class CubeEnv(mujoco_env.MujocoEnv, utils.EzPickle):

    def __init__(
            self,
            n_jnt=24,
            frame_skip=40,
    ):

        # init vars
        self.time = 0
        self.counter = 0
        self.target_x = 0
        self.target_y = 0
        self.target_z = 0
        self.grasp_sid = 0
        self.obs_dict = {}
        self.reward_dict = {}
        self.frame_skip = frame_skip
        self.skip = frame_skip

        # dims
        assert n_jnt > 0
        self.n_jnt = n_jnt
        self.n_obj_dofs = 9  # 6 for the cube alone, 9 for cube + target joints
        self.n_dofs = self.n_jnt + self.n_obj_dofs

        self.xml_path = os.path.join(os.path.dirname(__file__), 'assets',
                                     'shadowhand_hand_cube.xml')

        # Make robot
        self.robot = Robot(
            n_jnt=self.n_jnt,
            n_obj=self.n_obj_dofs,
            n_dofs=self.n_dofs,
            pos_bounds=[[-40, 40]] * self.n_dofs,  # dummy
            vel_bounds=[[-3, 3]] * self.n_dofs,
        )

        # Initialize mujoco env
        self.startup = True
        self.initializing = True
        super().__init__(
            self.xml_path,
            frame_skip=frame_skip,
            camera_settings=dict(
                distance=0.9,
                azimuth=-40,
                elevation=-50,
            ),
        )
        utils.EzPickle.__init__(self)
        self.startup = False
        self.initializing = False

        # init joint and site ids
        self.target_x = self.sim.model.joint_name2id('targetRx')
        self.target_y = self.sim.model.joint_name2id('targetRy')
        self.target_z = self.sim.model.joint_name2id('targetRz')
        self.grasp_sid = self.sim.model.site_name2id('S_finger_grasp')

        # reset position
        self.init_qpos = self.sim.model.key_qpos[0].copy()

        # action ranges (asymmetric, centered on the keyframe pose)
        self.act_mid = self.init_qpos[:self.n_jnt].copy()
        self.upper_rng = self.model.actuator_ctrlrange[:, 1] - self.act_mid
        self.lower_rng = self.act_mid - self.model.actuator_ctrlrange[:, 0]

        # cube mass randomization bounds (kg)
        self.xml_location = os.path.join(os.path.dirname(__file__), 'assets',
                                         'object_dice.xml')
        self.domain_low = 0.045
        self.domain_high = 0.055
        self.test_domain = 0.05
        self.mode = 'train'

    def set_weight(self, weight):
        lock = FileLock(self.xml_location + '.lock')  # concurrency protection

        def modify_dice_xml(dice_xml_file):
            et = xml.etree.ElementTree.parse(dice_xml_file)
            et.find('body').find('inertial').set('mass', "%0.4f" % weight)  # changing dice weight
            et.write(dice_xml_file)

        with lock:
            modify_dice_xml(self.xml_location)
            self.sim_robot = MujocoSimRobot(
                self.xml_path,
                camera_settings=dict(
                    distance=0.9,
                    azimuth=-40,
                    elevation=-50,
                ))
            self.sim = self.sim_robot.sim
            self.model = self.sim_robot.model
            self.data = self.sim_robot.data
        if self.sim_robot.renderer._onscreen_renderer:
            import ipdb
            ipdb.set_trace()  # debug hook: should not trigger in headless runs
        self.close()
        self.render()

    def get_reward(self, observations, actions):
        """get rewards of a given (observations, actions) pair

        Args:
            observations: (batchsize, obs_dim) or (obs_dim,)
            actions: (batchsize, ac_dim) or (ac_dim,)

        Return:
            r_total: reward for that pair: (batchsize,1) or (1,)
            done: True if env reaches terminal state: (batchsize,1) or (1,)
        """

        # initialize and reshape as needed, for batch mode
        self.reward_dict = {}
        if len(observations.shape) == 1:
            observations = np.expand_dims(observations, axis=0)
            actions = np.expand_dims(actions, axis=0)
            batch_mode = False
        else:
            batch_mode = True

        # obs layout:
        #   self.obs_dict['robot_pos'],            # 24
        #   self.obs_dict['object_position'],      # 3
        #   self.obs_dict['object_orientation'],   # 3
        #   self.obs_dict['object_velp'],          # 3
        #   self.obs_dict['object_velr'],          # 3
        #   self.obs_dict['desired_orientation'],  # 3

        # get vars
        obj_pos = observations[:, 24:24 + 3]
        obj_orientation = observations[:, 24 + 3:24 + 3 + 3]
        desired_orientation = observations[:, -3:]
        obj_height = observations[:, 24 + 2]
        zeros = np.zeros(obj_height.shape)

        # orientation
        angle_diffs = np.linalg.norm(obj_orientation - desired_orientation, axis=1)

        # fall
        is_fall = zeros.copy()
        is_fall[obj_height < -0.1] = 1

        # done based on is_fall
        dones = (is_fall == 1) if not self.startup else zeros

        # rewards
        self.reward_dict['ori_dist'] = -7 * angle_diffs
        self.reward_dict['drop_penalty'] = -1000 * is_fall
        self.reward_dict['r_total'] = self.reward_dict['ori_dist'] + \
                                      self.reward_dict['drop_penalty']

        # return
        if not batch_mode:
            return self.reward_dict['r_total'][0], dones[0]
        return self.reward_dict['r_total'], dones
    def get_score(self, obs_dict):
        return -1.0 * np.linalg.norm(obs_dict['object_orientation'] -
                                     obs_dict['desired_orientation'])

    def step(self, a):

        # set the goal as the next entry of self.goal
        if self.startup:
            self.desired_pose = np.array([0, 0, 0])
        else:
            self.desired_pose = self.goal[self.counter].copy()

        #############
        ### ACTIONS
        #############

        # clip and scale action
        a = np.clip(a, -1.0, 1.0)
        if not self.startup:
            # mean center and scale (asymmetric range around act_mid)
            a[a > 0] = self.act_mid[a > 0] + a[a > 0] * self.upper_rng[a > 0]
            a[a <= 0] = self.act_mid[a <= 0] + a[a <= 0] * self.lower_rng[a <= 0]

        # take the action
        self.robot.step(self, a, step_duration=self.skip * self.model.opt.timestep)

        ##################
        ### OBS AND REWARD
        ##################

        # get obs/rew/done/score
        obs = self._get_obs()
        reward, done = self.get_reward(obs, a)
        catastrophe = done
        obs = np.concatenate((obs, [float(catastrophe)]))
        # score is computed from the obs dict, so it is unaffected by the
        # catastrophe flag appended above
        score = self.get_score(self.obs_dict)

        # return
        env_info = {
            'time': self.time,
            'obs_dict': self.obs_dict,
            'rewards': self.reward_dict,
            'score': score
        }
        self.counter += 1
        return obs, reward, done, env_info

    def _get_obs(self):

        # update target pose visualization
        self.data.qpos[self.target_x] = self.desired_pose[0]
        self.data.qpos[self.target_y] = self.desired_pose[1]
        self.data.qpos[self.target_z] = self.desired_pose[2]

        # get obs
        self.robot.get_obs(self, robot_noise_ratio=0, object_noise_ratio=0)
        t, qp_hand, qv_hand, qp_obj, qv_obj = self.robot.get_obs_from_cache(self, -1)
        self.time = t

        self.obs_dict = {}
        self.obs_dict['robot_pos'] = qp_hand.copy()                       # 24
        self.obs_dict['object_position'] = qp_obj[:3].copy()              # 3 (x/y/z) translational pos of cube
        self.obs_dict['object_orientation'] = qp_obj[3:6].copy()          # 3 (r/p/y) rotational pos of cube
        self.obs_dict['object_velp'] = qv_obj[:3].copy()                  # 3 translational vel of cube
        self.obs_dict['object_velr'] = qv_obj[3:6].copy()                 # 3 rotational vel of cube
        self.obs_dict['desired_orientation'] = self.desired_pose.copy()   # 3 desired (r/p/y) orientation of cube

        return np.concatenate([
            self.obs_dict['robot_pos'],            # 24
            self.obs_dict['object_position'],      # 3
            self.obs_dict['object_orientation'],   # 3
            self.obs_dict['object_velp'],          # 3
            self.obs_dict['object_velr'],          # 3
            self.obs_dict['desired_orientation'],  # 3
        ])

    def reset_model(self, mode='train'):
        self.reset_pose = self.init_qpos.copy()
        self.reset_vel = self.init_qvel.copy()
        self.reset_goal = self.create_goal_trajectory()
        return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal,
                             mode=mode)

    def do_reset(self, reset_pose, reset_vel, reset_goal=None, mode="train"):

        if mode == 'train':
            # sample a new cube mass for this episode
            self.cube_weight = np.random.uniform(self.domain_low, self.domain_high)
            print(self.cube_weight)
            self.set_weight(self.cube_weight)
        elif self.mode != 'test' and mode == 'test':
            # starting adaptation
            self.mode = mode
            self.set_weight(self.test_domain)

        # reset counts
        self.counter = 0

        #### reset goal
        if reset_goal is None:
            self.goal = self.create_goal_trajectory()
        else:
            self.goal = reset_goal.copy()

        # reset hand and objects
        self.robot.reset(self, reset_pose, reset_vel)
        self.sim.forward()
        return np.concatenate((self._get_obs(), [0.]))

    def create_goal_trajectory(self):

        len_of_goals = 1000
        #####################################
        # goals are (r/p/y) euler targets for the cube, e.g. 1.5 rad about
        # the x axis (up/down) or the z axis (left/right); 0.35 rad ~ 20 deg
        #####################################
        left = [0, 0, -1.5]
        right = [0, 0, 1.5]
        up = [1.5, 0, 0]
        down = [-1.5, 0, 0]
        half_up = [0.7, 0, 0]
        half_down = [-0.7, 0, 0]
        half_left = [0, 0, -0.7]
        half_right = [0, 0, 0.7]
        slight_up = [0.35, 0, 0]
        slight_down = [-0.35, 0, 0]
        slight_left = [0, 0, -0.35]
        slight_right = [0, 0, 0.35]

        #####################################
        # CHOOSE one of these goal options here:
        # goal_options = [half_up, half_down, half_left, half_right, slight_right, slight_left, slight_down, slight_up, left, right, up, down]
        # goal_options = [half_up, half_down, half_left, half_right]
        # goal_options = [half_up, half_down, half_left, half_right, slight_right, slight_left, slight_down, slight_up]
        # goal_options = [left, right]
        # goal_options = [up, down]
        goal_options = [left, right, up, down]
        #####################################

        # A single rollout consists of alternating between 2 (same or diff) goals:
        same_goals = True
        if same_goals:
            goal_selected1 = np.random.randint(len(goal_options))
            goal_selected2 = goal_selected1
        else:
            goal_selected1 = np.random.randint(len(goal_options))
            goal_selected2 = np.random.randint(len(goal_options))
        goals = [goal_options[goal_selected1], goal_options[goal_selected2]]

        # Create list of these goals, switching every time_per_goal steps
        time_per_goal = 35
        step_num = 0
        curr_goal_num = 0
        goal_traj = []
        while step_num < len_of_goals:
            goal_traj.append(np.tile(goals[curr_goal_num], (time_per_goal, 1)))
            if curr_goal_num == 0:
                curr_goal_num = 1
            else:
                curr_goal_num = 0
            step_num += time_per_goal

        goal_traj = np.concatenate(goal_traj)
        return goal_traj
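# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original file): inspecting the alternating
# goal schedule produced by CubeEnv.create_goal_trajectory. Constructing the
# env requires the MuJoCo assets referenced above, since __init__ loads the
# model.
# ---------------------------------------------------------------------------
def _demo_cube_goal_schedule():
    env = CubeEnv()
    traj = env.create_goal_trajectory()
    # traj is a (1015, 3) array of (r/p/y) targets: the 1000-step horizon is
    # rounded up to a whole number of time_per_goal=35 blocks, and the goal
    # switches every 35 steps (with same_goals=True both goals coincide)
    print(traj.shape, traj[0], traj[35])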