def eval_policy(policy, env_name, seed, eval_episodes=10):
    """Run the policy for `eval_episodes` episodes on a fresh, fixed-seed
    environment and return the average return (success rate is also printed)."""
    eval_env = gym.make(env_name)
    if "SawyerPush" in env_name:
        eval_env = SawyerECWrapper(eval_env, env_name)
        eval_env._max_episode_steps = 70
    eval_env.seed(seed + 100)

    avg_reward = 0.
    success_rate = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        state = flatten_state(state)
        while not done:
            action = policy.select_action(np.array(flatten_state(state)))
            state, reward, done, info = eval_env.step(action)
            if ("first_success" in info.keys() and info["first_success"]):
                success_rate += 1
            avg_reward += reward

    avg_reward /= eval_episodes
    success_rate /= eval_episodes

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}, {success_rate:.3f}")
    print("---------------------------------------")
    return avg_reward
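# Both eval_policy and the training loop below rely on a flatten_state helper
# defined elsewhere in the repo. The following is a hypothetical sketch of what
# such a helper might do, assuming dict (goal-conditioned) observations are
# concatenated into a single vector; the real implementation may differ.
import numpy as np


def flatten_state(state):
    # Dict observations: concatenate all values into one flat vector.
    if isinstance(state, dict):
        return np.concatenate([np.asarray(v).ravel() for v in state.values()])
    # Already-flat observations pass through, so repeated calls are idempotent.
    return np.asarray(state).ravel()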
def main(args):
    file_name = f"{args.policy}_{args.env}_{args.seed}"
    print("---------------------------------------")
    print(f"Policy: {args.policy}, Env: {args.env}, Seed: {args.seed}")
    print("---------------------------------------")

    log_path = safe_path(os.path.join(args.log_root, '{}_base'.format(args.env)))
    result_path = safe_path(os.path.join(log_path, 'results'))
    model_path = safe_path(os.path.join(log_path, 'models'))

    '''
    ### s2r hacks
    s2r_parser = argparse.ArgumentParser()
    s2r_parser.add_argument("--encoder_type", default="mlp")
    s2r_parser.add_argument("--end_effector", default=True)
    s2r_parser.add_argument("--screen_width", type=int, default=480)
    s2r_parser.add_argument("--screen_height", type=int, default=480)
    s2r_parser.add_argument("--action_repeat", type=int, default=1)
    s2r_parser.add_argument("--puck_friction", type=float, default=2.0)
    s2r_parser.add_argument("--puck_mass", type=float, default=0.01)
    s2r_parser.add_argument("--unity", default=False)
    s2r_parser.add_argument("--unity_editor", default=False)
    s2r_parser.add_argument("--virtual_display", default=None)
    s2r_parser.add_argument("--port", default=1050)
    s2r_parser.add_argument("--absorbing_state", default=False)
    s2r_parser.add_argument("--dr", default=False)
    s2r_parser.add_argument("--env", default=None)
    s2r_args = s2r_parser.parse_args()
    import ipdb; ipdb.set_trace()
    env = make_s2r_env(args.env, s2r_args, env_type="real")
    '''

    env = gym.make(args.env)
    if "SawyerPush" in args.env:
        env = SawyerECWrapper(env, args.env)
        env._max_episode_steps = 70

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    try:
        state_dim = env.observation_space.shape[0]
    except:
        state_dim = 16  # env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    success = False
    reach_reward = 0
    push_reward = 0
    cylinder_to_target = 100

    for t in range(int(args.max_timesteps)):
        state = flatten_state(state)
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(state))
                + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, info = env.step(action)
        next_state = flatten_state(next_state)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0
        if ("first_success" in info.keys() and info["first_success"]):
            success = True
        # reach_reward += info["reward_reach"]
        # push_reward += info["reward_push"]
        # cylinder_to_target = min(cylinder_to_target, info["cylinder_to_target"])

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            # reach_reward /= episode_timesteps
            # push_reward /= episode_timesteps
            # Reach Reward: {reach_reward:.3f} Push Reward: {push_reward:.3f} cylinder_to_target: {cylinder_to_target:.3f}
            print(
                f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f} Success: {success}")
            # Reset environment
            success = False
            state, done = env.reset(), False
            episode_reward = 0
            reach_reward, push_reward = 0, 0
            cylinder_to_target = 100
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaluations.append(eval_policy(policy, args.env, args.seed))
            np.save(os.path.join(result_path, '{}'.format(file_name)), evaluations)
            if args.save_model:
                policy.save(os.path.join(model_path, '{}'.format(file_name)))
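# main(args) reads a number of hyperparameter fields from args, but the script's
# entry point / argument parser is not shown in this section. The block below is
# a hypothetical sketch of such an entry point: the flag names mirror the fields
# main() actually reads, while the default values (and the example env id) are
# assumptions borrowed from the reference TD3 setup, not taken from this file.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--policy", default="TD3")
    parser.add_argument("--env", default="SawyerPush-v0")          # assumed example env id
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument("--log_root", default="logs")              # assumed default
    parser.add_argument("--start_timesteps", default=25000, type=int)  # random-action warmup
    parser.add_argument("--eval_freq", default=5000, type=int)
    parser.add_argument("--max_timesteps", default=1000000, type=int)
    parser.add_argument("--expl_noise", default=0.1, type=float)
    parser.add_argument("--batch_size", default=256, type=int)
    parser.add_argument("--discount", default=0.99, type=float)
    parser.add_argument("--tau", default=0.005, type=float)
    parser.add_argument("--policy_noise", default=0.2, type=float)
    parser.add_argument("--noise_clip", default=0.5, type=float)
    parser.add_argument("--policy_freq", default=2, type=int)
    parser.add_argument("--save_model", action="store_true")
    main(parser.parse_args())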
class CrossImgPolicy:
    def __init__(self, opt):
        self.opt = opt
        self.env_name = opt.env
        self.policy_path = os.path.join(
            opt.log_root, '{}_base/models/TD3_{}_0_actor'.format(opt.env, opt.env))
        self.state_dim = opt.state_dim1
        self.action_dim = opt.action_dim1
        self.max_action = 1
        print(self.env_name, self.state_dim, self.action_dim)
        self.policy = TD3(self.policy_path, self.state_dim,
                          self.action_dim, self.max_action, self.opt)
        self.env = gym.make(self.env_name)
        if "SawyerPush" in self.opt.env:
            self.env = SawyerECWrapper(self.env, opt.env)
            self.env._max_episode_steps = 70
        self.env.seed(100)

    def eval_policy(self, iter, gxmodel=None, axmodel=None, imgpath=None,
                    eval_episodes=10):
        eval_env = self.env
        state_buffer = []
        action_buffer = []
        avg_reward, new_reward = 0., 0.
        success_rate = 0.
        save_flag = False
        if imgpath is not None:
            if not os.path.exists(imgpath):
                os.mkdir(imgpath)
            save_flag = True

        for i in tqdm(range(eval_episodes)):
            state, done = eval_env.reset(), False
            if save_flag:
                episode_path = os.path.join(
                    imgpath, 'iteration_{}_episode_{}.mp4'.format(iter, i))
                frames = []
                count = 0
            while not done:
                state = np.array(flatten_state(state))
                img, depth = self.env.sim.render(mode='offscreen', width=100,
                                                 height=100, depth=True)
                with torch.no_grad():
                    action = self.policy.select_cross_action(img, gxmodel, axmodel)
                state_buffer.append(state)
                action_buffer.append(action)
                state, reward, done, info = eval_env.step(action)
                state = flatten_state(state)
                if ("first_success" in info.keys() and info["first_success"] == 1):
                    success_rate += 1
                elif ("episode_success" in info.keys() and info["episode_success"] == True):
                    success_rate += 1
                avg_reward += reward
                if save_flag:
                    img = eval_env.sim.render(mode='offscreen', camera_name='track',
                                              width=500, height=500)
                    frames.append(img[::-1, :, :])
                    count += 1
            if save_flag:
                self._save_video(episode_path, frames)
                if i >= 3:
                    save_flag = False

        avg_reward /= eval_episodes
        success_rate /= eval_episodes
        print("-----------------------------------------------")
        print("Evaluation over {} episodes: {:.3f}, {:.3f}".format(
            eval_episodes, avg_reward, success_rate))
        print("-----------------------------------------------")
        return avg_reward, success_rate

    def _save_video(self, fname, frames, fps=15.0):
        """Saves @frames into a video with file name @fname."""
        def f(t):
            frame_length = len(frames)
            new_fps = 1.0 / (1.0 / fps + 1.0 / frame_length)
            idx = min(int(t * new_fps), frame_length - 1)
            return frames[idx]

        video = mpy.VideoClip(f, duration=len(frames) / fps + 2)
        video.write_videofile(fname, fps, verbose=False)
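# CrossImgPolicy expects an options namespace with at least env, log_root,
# state_dim1 and action_dim1, and evaluates the frozen TD3 actor through the
# cross-domain image/action translation models (gxmodel / axmodel). The block
# below is an illustrative usage sketch only: the option values, the env id and
# the None placeholders for the translation models are assumptions, not values
# confirmed by this repo.
if __name__ == "__main__":
    from argparse import Namespace

    opt = Namespace(env="SawyerPush-v0",   # assumed example env id
                    log_root="logs",       # assumed log directory
                    state_dim1=16,
                    action_dim1=2)
    evaluator = CrossImgPolicy(opt)

    # gxmodel / axmodel would normally be the trained translation networks loaded
    # elsewhere; what select_cross_action does when they are None is not specified here.
    avg_reward, success_rate = evaluator.eval_policy(
        iter=0, gxmodel=None, axmodel=None,
        imgpath=os.path.join(opt.log_root, "eval_videos"),  # first few rollouts saved as .mp4
        eval_episodes=10)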
class CycleData:
    def __init__(self, opt):
        self.opt = opt
        self.env = gym.make(opt.env)
        if "SawyerPush" in self.opt.env:
            self.env = SawyerECWrapper(self.env, opt.env)
            self.env._max_episode_steps = 70
        self.env.seed(0)
        random.seed(0)
        try:
            self.state_dim = self.env.observation_space.shape[0]
        except:
            self.state_dim = 16  # self.env.observation_space.shape[0]
        try:
            self.action_dim = self.env.action_space.shape[0]
        except:
            self.action_dim = 2
        self.max_action = float(self.env.action_space.high[0])
        self.log_root = opt.log_root
        self.episode_n = opt.episode_n
        self.policy_path = os.path.join(
            opt.log_root, '{}_base/models/TD3_{}_0_actor'.format(opt.env, opt.env))
        if opt.load_policy != "":
            print(self.policy_path)
            self.policy = TD3(opt.load_policy, self.state_dim,
                              self.action_dim, self.max_action)
        self.setup(opt)
        self.create_data()
        print('----------- Dataset initialized ---------------')
        print('-----------------------------------------------\n')

    def setup(self, opt):
        self.episode_n = opt.episode_n
        self.env_logs = safe_path(
            os.path.join(self.log_root, '{}_data'.format(self.opt.env)))
        self.data_root = safe_path(
            os.path.join(self.env_logs,
                         '{}_{}'.format(self.opt.data_type, self.opt.data_id)))
        self.img_path = safe_path(os.path.join(self.data_root, 'imgs'))

    def create_data(self):
        self.reset_buffer()
        total_samples = 0
        i_episode = 0
        while total_samples < self.episode_n:
            observation, done, t = self.env.reset(), False, 0
            observation = flatten_state(observation)
            self.add_observation(observation)
            # episode_path = os.path.join(self.img_path, 'episode-{}'.format(i_episode))
            # if not os.path.exists(episode_path):
            #     os.mkdir(episode_path)
            # path = os.path.join(episode_path, 'img_{}_{}.jpg'.format(i_episode, 0))
            # self.check_and_save(path)
            i_episode += 1
            while not done:
                if self.opt.load_policy != "":
                    action = self.policy.select_action(observation)
                else:
                    action = self.env.action_space.sample()
                observation, reward, done, info = self.env.step(action)
                observation = flatten_state(observation)
                self.add_action(action)
                self.add_observation(observation)
                # path = os.path.join(episode_path, 'img_{}_{}.jpg'.format(i_episode, t + 1))
                # self.check_and_save(path)
                t += 1
                if done:
                    print("Episode {} finished after {} timesteps".format(
                        i_episode, t))
                    total_samples += t
                    break
            self.merge_buffer()
        print("{} total samples collected".format(total_samples))
        self.collect_data()

    def check_and_save(self, path):
        img = self.env.sim.render(mode='offscreen', camera_name='track',
                                  width=256, height=256, depth=False)
        img = Image.fromarray(img[::-1, :, :])
        img.save(path)

    def collect_data(self):
        self.env.close()
        self.norm_state()
        self.pair_n = self.now_state.shape[0]
        assert (self.pair_n == self.next_state.shape[0])
        assert (self.pair_n == self.action.shape[0])
        self.save_npy()

    def norm_state(self):
        self.now_state = np.vstack(self.now_state)
        self.next_state = np.vstack(self.next_state)
        self.action = np.vstack(self.action)

    def save_npy(self):
        np.save(os.path.join(self.data_root, 'now_state.npy'), self.now_state)
        np.save(os.path.join(self.data_root, 'next_state.npy'), self.next_state)
        np.save(os.path.join(self.data_root, 'action.npy'), self.action)

    def reset_buffer(self):
        self.joint_pose_buffer = []
        self.achieved_goal_buffer = []
        self.goal_pos_buffer = []
        self.action_buffer = []
        self.now_state = []
        self.next_state = []
        self.action = []

    def add_observation(self, observation):
        self.joint_pose_buffer.append(observation)

    def add_action(self, action):
        self.action_buffer.append(action)

    def merge_buffer(self):
        self.now_state += self.joint_pose_buffer[:-1]
        self.next_state += self.joint_pose_buffer[1:]
        self.action += self.action_buffer
        self.joint_pose_buffer = []
        self.achieved_goal_buffer = []
        self.goal_pos_buffer = []
        self.action_buffer = []
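# CycleData.save_npy writes now_state.npy, next_state.npy and action.npy under
# data_root (setup() lays this out as <log_root>/<env>_data/<data_type>_<data_id>/).
# The snippet below is a hypothetical sketch of how a downstream consumer might
# load and sanity-check those arrays; the concrete path shown is an assumed example.
if __name__ == "__main__":
    import os
    import numpy as np

    data_root = os.path.join("logs", "SawyerPush-v0_data", "base_0")  # assumed example path
    now_state = np.load(os.path.join(data_root, "now_state.npy"))
    next_state = np.load(os.path.join(data_root, "next_state.npy"))
    action = np.load(os.path.join(data_root, "action.npy"))

    # The three arrays are aligned row-wise as (s_t, a_t, s_{t+1}) transition tuples.
    assert now_state.shape[0] == next_state.shape[0] == action.shape[0]
    print(now_state.shape, action.shape)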