def evaluate_policy(env, policy, args, eval_episodes=10):
    """Roll out the policy without exploration noise and return the average episode reward."""
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs = env.reset()
        if 'RNN' in args.policy_name:
            # Recurrent policies act on a window of the last seq_len observations.
            obs_vec = np.dot(np.ones((args.seq_len, 1)), obs.reshape((1, -1)))
        done = False
        while not done:
            if 'RNN' in args.policy_name:
                action = policy.select_action(np.array(obs_vec))
            else:
                action = policy.select_action(np.array(obs))
            if 'IM' in args.policy_name:
                # Impedance policies output impedance parameters that are converted to joint torques.
                action_im = np.copy(action)
                action = utils.calc_torque_from_impedance(action_im, np.asarray(obs)[8:-2])
            obs, reward, done, _ = env.step(action)
            if 'RNN' in args.policy_name:
                obs_vec = utils.fifo_data(obs_vec, obs)
            avg_reward += reward
    avg_reward /= eval_episodes
    return avg_reward
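# The helper utils.fifo_data is used throughout this file to maintain a sliding window of
# observations. Its real implementation lives in utils.py; the sketch below is only an
# illustration of the behaviour assumed by the calls above (it is not part of the project
# code): drop the oldest entry and append the newest one, so the most recent observation
# ends up in the last row (cal_true_value below relies on that by writing obs_vec[-1]).
def _fifo_data_sketch(data_vec, new_data):
    data_vec[:-1] = np.copy(data_vec[1:])  # shift the window forward by one step
    data_vec[-1] = new_data                # newest sample goes into the last slot
    return data_vec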
def eval_only(self, is_reset=True):
    """Load every saved policy matching the current settings and replay it, optionally recording video."""
    video_dir = '{}/video_all/{}_{}_{}'.format(self.result_path, self.args.policy_name,
                                               self.args.env_name, self.args.reward_name)
    if not os.path.exists(video_dir):
        os.makedirs(video_dir)
    model_path_vec = glob.glob(self.result_path + '/{}/{}_{}_{}_seed*'.format(
        self.args.log_path, self.args.policy_name, self.args.env_name, self.args.reward_name))
    print(model_path_vec)
    for model_path in model_path_vec:
        self.policy.load("%s" % (self.file_name + self.args.load_policy_idx), directory=model_path)
        for _ in range(1):
            video_name = video_dir + '/{}_{}_{}.mp4'.format(
                datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
                self.file_name, self.args.load_policy_idx)
            if self.args.save_video:
                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                out_video = cv2.VideoWriter(video_name, fourcc, 60.0, self.args.video_size)
            obs = self.env.reset()
            if 'RNN' in self.args.policy_name:
                obs_vec = np.dot(np.ones((self.args.seq_len, 1)), obs.reshape((1, -1)))
            obs_mat = np.asarray(obs)
            done = False
            while not done:
                if 'RNN' in self.args.policy_name:
                    action = self.policy.select_action(np.array(obs_vec))
                else:
                    action = self.policy.select_action(np.array(obs))
                if 'IM' in self.args.policy_name:
                    action_im = np.copy(action)
                    action = utils.calc_torque_from_impedance(action_im, np.asarray(obs)[8:-2])
                obs, reward, done, _ = self.env.step(action)
                if 'RNN' in self.args.policy_name:
                    obs_vec = utils.fifo_data(obs_vec, obs)
                if 0 != self.args.state_noise:
                    # Perturb the observed joint states to test robustness to sensor noise.
                    obs[8:20] += np.random.normal(0, self.args.state_noise,
                                                  size=obs[8:20].shape[0]).clip(-1, 1)
                obs_mat = np.c_[obs_mat, np.asarray(obs)]
                if self.args.save_video:
                    img = self.env.render(mode='rgb_array')
                    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                    out_video.write(img)
                elif self.args.render:
                    self.env.render(mode='human')
            if not self.args.render:
                utils.write_table(video_name + '_state', np.transpose(obs_mat))
            if self.args.save_video:
                out_video.release()
    if is_reset:
        self.env.reset()
def update_gait_reward(self, new_obs, reward):
    """Track foot contacts and add a gait-based reward to the replay buffer once per completed gait cycle."""
    self.foot_contact_vec = utils.fifo_data(self.foot_contact_vec, new_obs[-2])
    if 0 == np.std(self.foot_contact_vec):
        # The contact signal has been stable for the whole window; accept it as the current state.
        self.foot_contact = np.mean(self.foot_contact_vec)
    if 1 == (self.foot_contact - self.pre_foot_contact):
        # A rising edge of the foot contact marks the end of a gait cycle.
        if self.gait_state_mat.shape[0] > int(100 / self.env_timeStep):
            self.gait_num += 1
            if self.gait_num >= 2:
                coefficient, cross_gait_reward_str = utils.calc_cross_gait_reward(
                    self.gait_state_mat[:-self.delay_num + 1, :-2],
                    self.gait_state_mat[:-self.delay_num + 1, -2],
                    self.args.reward_name)
                self.reward_str_list += cross_gait_reward_str
                self.reward_str_list = []
                self.replay_buffer.add_final_reward(
                    coefficient, self.gait_state_mat.shape[0] - self.delay_num,
                    delay=self.delay_num)
                reward_steps = min(int(2000 / self.env_timeStep), len(self.reward_angle))
                if 'r_n' in self.args.reward_name:
                    self.reward_str_list.append('r_n')
                    self.replay_buffer.add_specific_reward(
                        self.reward_angle[-reward_steps:], self.idx_angle[-reward_steps:])
                self.idx_angle = np.r_[self.idx_angle, self.gait_state_mat[:-self.delay_num, -1]]
                self.reward_angle = np.r_[
                    self.reward_angle,
                    0.05 * np.ones(self.gait_state_mat[:-self.delay_num, -1].shape[0])]
            self.gait_state_mat = self.gait_state_mat[-self.delay_num:]
        self.pre_foot_contact = self.foot_contact
    # Append the current gait state: 6 joint angles, 2 foot contacts, forward velocity, timestep index.
    gait_state = np.zeros((1, 10))
    gait_state[0, 0:6] = new_obs[8:20:2]
    gait_state[0, 6:-2] = new_obs[-2:]
    gait_state[0, -2] = new_obs[3]
    gait_state[0, -1] = self.total_timesteps
    self.gait_state_mat = np.r_[self.gait_state_mat, gait_state]
    return reward
def cal_true_value(env, policy, replay_buffer, args, eval_episodes=1000):
    """Estimate the true Q-value by Monte-Carlo rollouts from states sampled out of the replay buffer."""
    true_Q_val_vec = []
    init_state_vec, _, _, _, _ = replay_buffer.sample(eval_episodes)
    for i in range(eval_episodes):
        env.reset()
        if 'RNN' in args.policy_name:
            # For recurrent policies the sampled state is a sequence; reset the robot to its latest frame.
            obs, obs_error = env.set_robot(init_state_vec[i][-1])
            obs_vec = np.copy(init_state_vec[i])
            obs_vec[-1] = np.copy(obs)
        else:
            obs, obs_error = env.set_robot(init_state_vec[i])
        true_Q_value = 0.
        if obs_error > 1e-3:
            print('Error of resetting robot: {},\n input obs: {},\n output obs: {}'.format(
                obs_error, init_state_vec[i], obs))
            continue
        done = False
        dis_gamma = 1.
        while not done:
            if 'RNN' in args.policy_name:
                action = policy.select_action(np.array(obs_vec))
            else:
                action = policy.select_action(np.array(obs))
            if 'IM' in args.policy_name:
                action_im = np.copy(action)
                action = utils.calc_torque_from_impedance(
                    action_im, np.asarray(obs)[8:-2])
            obs, reward, done, _ = env.step(action)
            reward -= 0.5
            # Accumulate the discounted return: Q = sum_t gamma^t * r_t.
            true_Q_value += dis_gamma * reward
            dis_gamma *= args.discount
            if 'RNN' in args.policy_name:
                obs_vec = utils.fifo_data(obs_vec, obs)
        true_Q_val_vec.append(true_Q_value)
    return np.mean(np.asarray(true_Q_val_vec))
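# Small self-contained sanity check (not used by the training code) for the discounted-return
# accumulation in cal_true_value: with rewards [1, 1, 1] and discount 0.5 the loop should
# return 1 + 0.5 + 0.25 = 1.75.
def _discounted_return_sketch(rewards, discount):
    total, dis_gamma = 0.0, 1.0
    for r in rewards:
        total += dis_gamma * r
        dis_gamma *= discount
    return total

assert abs(_discounted_return_sketch([1.0, 1.0, 1.0], 0.5) - 1.75) < 1e-12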
def evaluate_policy(env, policy, args, eval_episodes=1):
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs = env.reset()
        if 'seq' in args.method_name:
            obs_vec = np.dot(np.ones((args.seq_len, 1)), obs.reshape((1, -1)))
        done = False
        while not done:
            if 'seq' in args.method_name:
                action = policy.select_action(np.array(obs_vec))
            else:
                action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            if 'seq' in args.method_name:
                obs_vec = utils.fifo_data(obs_vec, obs)
                # print('obs_vec: ', obs_vec)
            avg_reward += reward
    avg_reward /= eval_episodes
    # print("---------------------------------------")
    # print("Evaluation over %d episodes: %f" % (eval_episodes, avg_reward))
    # print("---------------------------------------")
    return avg_reward
def main(env, method_name='', policy_name='TD3', state_noise=0.0, seed=0):
    parser = argparse.ArgumentParser()
    parser.add_argument("--policy_name", default=policy_name)             # Policy name
    parser.add_argument("--env_name", default="Webots_Atlas")             # OpenAI gym environment name
    parser.add_argument("--log_path", default='runs/ATD3_Atlas')
    parser.add_argument("--eval_only", default=True)
    parser.add_argument("--save_video", default=False)
    parser.add_argument("--method_name", default=method_name,
                        help='Name of your method (default: )')            # Name of the method
    parser.add_argument("--seq_len", default=2, type=int)
    parser.add_argument("--ini_seed", default=0, type=int)                 # Offset added to the seed
    parser.add_argument("--seed", default=seed, type=int)                  # Sets Gym, PyTorch and Numpy seeds
    parser.add_argument("--start_timesteps", default=1e4, type=int)        # How many time steps purely random policy is run for
    parser.add_argument("--eval_freq", default=2e4, type=float)            # How often (time steps) we evaluate
    parser.add_argument("--max_timesteps", default=1e6, type=float)        # Max time steps to run environment for
    parser.add_argument("--save_models", default=True)                     # Whether or not models are saved
    parser.add_argument("--expl_noise", default=0.2, type=float)           # Std of Gaussian exploration noise
    parser.add_argument("--state_noise", default=state_noise, type=float)  # Std of Gaussian noise added to the observed joint states
    parser.add_argument("--batch_size", default=100, type=int)             # Batch size for both actor and critic
    parser.add_argument("--discount", default=0.99, type=float)            # Discount factor
    parser.add_argument("--tau", default=0.005, type=float)                # Target network update rate
    parser.add_argument("--policy_noise", default=0.2, type=float)         # Noise added to target policy during critic update
    parser.add_argument("--noise_clip", default=0.5, type=float)           # Range to clip target policy noise
    parser.add_argument("--policy_freq", default=2, type=int)              # Frequency of delayed policy updates
    args = parser.parse_args()

    args.seed += args.ini_seed
    args.seed = args.seed % 10
    file_name = "TD3_%s_%s_%s" % (args.env_name, args.seed, args.method_name)
    print("---------------------------------------")
    print("Settings: %s" % (file_name))
    print("---------------------------------------")

    result_path = project_path + "results"
    video_dir = '{}/video/{}_{}'.format(result_path, args.env_name, args.method_name)
    model_dir = '{}/models/TD3/{}_{}'.format(result_path, args.env_name, args.method_name)
    if args.save_models and not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if args.save_video and not os.path.exists(video_dir):
        os.makedirs(video_dir)

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if 'TD3' == args.policy_name:
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif 'ATD3' == args.policy_name:
        policy = ATD3.ATD3(state_dim, action_dim, max_action)
    elif 'ATD3_CNN' == args.policy_name:
        policy = ATD3_CNN.ATD3_CNN(state_dim, action_dim, max_action, args.seq_len)
    elif 'ATD3_RNN' == args.policy_name:
        policy = ATD3_RNN.ATD3_RNN(state_dim, action_dim, max_action)
    elif 'TD3_RNN' == args.policy_name:
        policy = TD3_RNN.TD3_RNN(state_dim, action_dim, max_action)

    if not args.eval_only:
        log_dir = '{}/{}/seed_{}_{}_{}_{}_{}'.format(
            result_path, args.log_path, args.seed,
            datetime.datetime.now().strftime("%d_%H-%M-%S"),
            args.policy_name, args.env_name, args.method_name)
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        replay_buffer = utils.ReplayBuffer()
        # Evaluate untrained policy
        evaluations = [evaluate_policy(env, policy, args)]

        total_timesteps = 0
        pre_num_steps = total_timesteps
        if 'human_angle' in args.method_name:
            # Reference joint-angle trajectories of human walking, used for the similarity reward.
            human_joint_angle = utils.read_table(file_name=project_path + 'data/joint_angle.xls')
            pre_foot_contact = 1
            foot_contact = 1
            foot_contact_vec = np.asarray([1, 1, 1])
            delay_num = foot_contact_vec.shape[0] - 1
            gait_num = 0
            gait_state_mat = np.zeros((0, 9))
            idx_angle = np.zeros(0)
            reward_angle = np.zeros(0)
        if 'still_steps' in args.method_name:
            still_steps = 0
        timesteps_since_eval = 0
        episode_num = 0
        done = True
        pbar = tqdm(total=args.max_timesteps, initial=total_timesteps, position=0, leave=True)
        best_reward = 0.0

        # TensorboardX
        writer = SummaryWriter(logdir=log_dir)

        while total_timesteps < args.max_timesteps:
            if done:
                if len(replay_buffer.storage) > env.frame:
                    replay_buffer.add_final_reward(env.episode_reward / 1000.0, env.frame)
                pbar.update(total_timesteps - pre_num_steps)
                pre_num_steps = total_timesteps
                if total_timesteps != 0:
                    writer.add_scalar('ave_reward/train', episode_reward, total_timesteps)
                    if args.policy_name == "TD3":
                        policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                     args.discount, args.tau, args.policy_noise,
                                     args.noise_clip, args.policy_freq)
                    else:
                        policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                     args.discount, args.tau)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval %= args.eval_freq
                    avg_reward = evaluate_policy(env, policy, args)
                    evaluations.append(avg_reward)
                    writer.add_scalar('ave_reward/test', avg_reward, total_timesteps)
                    if best_reward < avg_reward:
                        best_reward = avg_reward
                        print(("Best reward! Total T: %d Episode Num: %d Episode T: %d Reward: %f")
                              % (total_timesteps, episode_num, episode_timesteps, avg_reward))
                        if args.save_models:
                            policy.save(file_name, directory=model_dir)
                        policy.save(file_name, directory=log_dir)
                    np.save(log_dir + "/test_accuracy", evaluations)
                    utils.write_table(log_dir + "/test_accuracy", np.asarray(evaluations))

                # Reset environment
                obs = env.reset()
                done = False
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1
                if 'seq' in args.method_name:
                    obs_vec = np.dot(np.ones((args.seq_len, 1)), obs.reshape((1, -1)))
                if 'human_angle' in args.method_name:
                    pre_foot_contact = 1
                    foot_contact = 1
                    foot_contact_vec = np.asarray([1, 1, 1])
                    gait_num = 0
                    gait_state_mat = np.zeros((0, 9))
                    idx_angle = np.zeros(0)
                    reward_angle = np.zeros(0)
                if 'still_steps' in args.method_name:
                    still_steps = 0

            # Select action randomly or according to policy
            if total_timesteps < args.start_timesteps:
                action = env.action_space.sample()
            else:
                if 'seq' in args.method_name:
                    action = policy.select_action(np.array(obs_vec))
                else:
                    action = policy.select_action(np.array(obs))
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise, size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)

            # Perform action
            new_obs, reward, done, _ = env.step(action)
            episode_reward += reward

            if 'human_angle' in args.method_name:
                foot_contact_vec = utils.fifo_data(foot_contact_vec, new_obs[-2])
                if 0 == np.std(foot_contact_vec):
                    foot_contact = np.mean(foot_contact_vec)
                if 1 == (foot_contact - pre_foot_contact):
                    if gait_state_mat.shape[0] > int(100 / env.timeStep):
                        gait_num += 1
                        if gait_num >= 2:
                            # The ATD3 seems to prefer the negative similarity reward.
                            joint_angle_sampled = signal.resample(
                                gait_state_mat[:-delay_num, 0:6],
                                num=human_joint_angle.shape[0])
                            coefficient = utils.calc_cos_similarity(
                                human_joint_angle, joint_angle_sampled) - 0.5
                            print('gait_num:', gait_num,
                                  'time steps in a gait: ', gait_state_mat.shape[0],
                                  'coefficient: ', np.round(coefficient, 2),
                                  'speed: ', np.round(np.linalg.norm(new_obs[3:6]), 2),
                                  'is cross gait: ',
                                  utils.check_cross_gait(gait_state_mat[:-delay_num, :-1]))
                            replay_buffer.add_final_reward(
                                coefficient, gait_state_mat.shape[0] - delay_num,
                                delay=delay_num)
                            reward_steps = min(int(2000 / env.timeStep), len(reward_angle))
                            replay_buffer.add_specific_reward(
                                reward_angle[-reward_steps:], idx_angle[-reward_steps:])
                            idx_angle = np.r_[idx_angle, gait_state_mat[:-delay_num, -1]]
                            reward_angle = np.r_[
                                reward_angle,
                                0.2 * np.ones(gait_state_mat[:-delay_num, -1].shape[0])]
                        gait_state_mat = gait_state_mat[-delay_num:]
                    pre_foot_contact = foot_contact
                joint_angle_obs = np.zeros((1, 9))
                joint_angle_obs[0, 0:6] = obs[8:20:2]
                joint_angle_obs[0, 6:-1] = obs[-2:]
                joint_angle_obs[0, -1] = total_timesteps
                gait_state_mat = np.r_[gait_state_mat, joint_angle_obs]
                reward -= 0.5

            if 'still_steps' in args.method_name:
                # Penalize standing still: both feet in contact for too long ends the episode.
                if np.array_equal(new_obs[-2:], np.asarray([1., 1.])):
                    still_steps += 1
                else:
                    still_steps = 0
                if still_steps > int(400 / env.timeStep):
                    replay_buffer.add_final_reward(-2.0, still_steps - 1)
                    reward -= 2.0
                    done = True
            done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)

            # Store data in replay buffer
            if 'seq' in args.method_name:
                new_obs_vec = utils.fifo_data(np.copy(obs_vec), new_obs)
                replay_buffer.add((np.copy(obs_vec), new_obs_vec, action, reward, done_bool))
                obs_vec = utils.fifo_data(obs_vec, new_obs)
            else:
                replay_buffer.add((obs, new_obs, action, reward, done_bool))

            obs = new_obs
            episode_timesteps += 1
            total_timesteps += 1
            timesteps_since_eval += 1

        # Final evaluation
        evaluations.append(evaluate_policy(env, policy, args))
        np.save(log_dir + "/test_accuracy", evaluations)
        utils.write_table(log_dir + "/test_accuracy", np.asarray(evaluations))
        env.reset()
    else:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model_path = result_path + '/{}/{}_{}'.format(args.log_path, args.method_name, args.seed + 1)
        print(model_path)
        policy.load("%s" % (file_name), directory=model_path)
        for _ in range(1):
            if args.save_video:
                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                video_name = video_dir + '/{}_{}_{}.mp4'.format(
                    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
                    file_name, args.state_noise)
                out_video = cv2.VideoWriter(video_name, fourcc, 60.0, (640, 480))
            obs = env.reset()
            if 'seq' in args.method_name:
                obs_vec = np.dot(np.ones((args.seq_len, 1)), obs.reshape((1, -1)))
            obs_mat = np.asarray(obs)
            done = False
            while not done:
                if 'seq' in args.method_name:
                    action = policy.select_action(np.array(obs_vec))
                else:
                    action = policy.select_action(np.array(obs))
                obs, reward, done, _ = env.step(action)
                if 'seq' in args.method_name:
                    obs_vec = utils.fifo_data(obs_vec, obs)
                obs[8:20] += np.random.normal(0, args.state_noise,
                                              size=obs[8:20].shape[0]).clip(-1, 1)
                obs_mat = np.c_[obs_mat, np.asarray(obs)]
            if args.save_video:
                utils.write_table(video_name + '_state', np.transpose(obs_mat))
                # reward_Q1_Q2_mat is expected to be collected during the rollout; it is not built in this excerpt.
                utils.write_table(video_name + '_reward_Q', reward_Q1_Q2_mat)
                out_video.release()
        env.reset()
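# Hypothetical entry point (not part of the original code): the Webots environment
# construction is not shown in this excerpt, so the factory name below is only a placeholder.
# The method_name flags 'human_angle', 'still_steps', and 'seq' are the ones checked in main().
# if __name__ == '__main__':
#     env = make_webots_atlas_env()  # placeholder for the real environment setup
#     main(env, method_name='human_angle_still_steps_seq', policy_name='ATD3_RNN')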
def train(self):
    # Evaluate untrained policy
    self.evaluations = [evaluate_policy(self.env, self.policy, self.args)]
    self.log_dir = '{}/{}/{}_{}_{}_seed_{}'.format(
        self.result_path, self.args.log_path, self.args.policy_name,
        self.args.env_name, self.args.reward_name, self.args.seed)
    print("---------------------------------------")
    print("Settings: %s" % self.log_dir)
    print("---------------------------------------")
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)

    # TensorboardX
    if self.args.evaluate_Q_value:
        self.writer_train = SummaryWriter(logdir=self.log_dir + '_train')
    self.writer_test = SummaryWriter(logdir=self.log_dir)

    self.pbar = tqdm(total=self.args.max_timesteps, initial=self.total_timesteps,
                     position=0, leave=True)
    done = True
    while self.total_timesteps < self.args.max_timesteps:
        self.train_once()
        if done:
            self.eval_once()
            self.reset()
            done = False

        # Select action randomly or according to policy
        if self.total_timesteps < self.args.start_timesteps:
            action = self.env.action_space.sample()
        else:
            if 'RNN' in self.args.policy_name:
                action = self.policy.select_action(np.array(self.obs_vec))
            elif 'SAC' in self.args.policy_name:
                action = self.policy.select_action(np.array(self.obs), eval=False)
            else:
                action = self.policy.select_action(np.array(self.obs))
            if self.args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, self.args.expl_noise, size=self.env.action_space.shape[0])).clip(
                        self.env.action_space.low, self.env.action_space.high)
        if 'IM' in self.args.policy_name:
            action_im = np.copy(action)
            action = utils.calc_torque_from_impedance(
                action_im, np.asarray(self.obs)[8:-2]).clip(
                    self.env.action_space.low, self.env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = self.env.step(action)
        self.episode_reward += reward
        self.episode_progress += new_obs[3]

        if 'r_n' in self.args.reward_name:
            reward = self.update_gait_reward(new_obs, reward)
        if 'r_s' in self.args.reward_name:
            self.reward_str_list.append('r_s')
            # foot_dim = state_dim - 8 - 2 * action_dim
            foot_num = len(self.env.observation_space.low) - 8 - 2 * len(self.env.action_space.low)
            if np.sum(new_obs[-foot_num:]) > 1:
                self.still_steps += 1
            else:
                self.still_steps = 0
            if self.still_steps > int(400 / self.env_timeStep):
                # Standing still for too long: penalize the stored transitions and end the episode.
                self.replay_buffer.add_final_reward(-2.0, self.still_steps - 1)
                reward -= 2.0
                done = True

        done_bool = 0 if self.episode_timesteps + 1 == self.env._max_episode_steps else float(done)
        if 'IM' in self.args.policy_name:
            # Store the impedance parameters rather than the converted torques.
            action = action_im
        if 'RNN' in self.args.policy_name:
            # Store data in replay buffer
            new_obs_vec = utils.fifo_data(np.copy(self.obs_vec), new_obs)
            self.replay_buffer.add((np.copy(self.obs_vec), new_obs_vec, action, reward, done_bool))
            self.obs_vec = utils.fifo_data(self.obs_vec, new_obs)
        else:
            self.replay_buffer.add((self.obs, new_obs, action, reward, done_bool))

        self.obs = new_obs
        self.episode_timesteps += 1
        self.total_timesteps += 1
        self.timesteps_since_eval += 1
        self.timesteps_calc_Q_vale += 1

    # Final evaluation
    avg_reward = evaluate_policy(self.env, self.policy, self.args)
    self.evaluations.append(avg_reward)
    if self.best_reward < avg_reward:
        self.best_reward = avg_reward
        print("Best reward! Total T: %d Episode T: %d Reward: %f"
              % (self.total_timesteps, self.episode_timesteps, avg_reward))
        self.policy.save(self.file_name, directory=self.log_dir)
    if self.args.save_all_policy:
        self.policy.save(self.file_name + str(int(self.args.max_timesteps)), directory=self.log_dir)
    np.save(self.log_dir + "/test_accuracy", self.evaluations)
    utils.write_table(self.log_dir + "/test_accuracy", np.asarray(self.evaluations))
    if self.args.evaluate_Q_value:
        true_Q_value = cal_true_value(env=self.env, policy=self.policy,
                                      replay_buffer=self.replay_buffer, args=self.args)
        self.writer_test.add_scalar('Q_value', true_Q_value, self.total_timesteps)
        self.true_Q_vals.append(true_Q_value)
        utils.write_table(self.log_dir + "/estimate_Q_vals", np.asarray(self.estimate_Q_vals))
        utils.write_table(self.log_dir + "/true_Q_vals", np.asarray(self.true_Q_vals))
    self.env.reset()