def main(args):
    """ Runs evaluation episodes with a trained policy.

    Args:
        args: parsed command-line arguments
    """
    size = 84
    use_render = True
    print("use render {} ".format(use_render))
    env = suite.make(
        args.env_name,
        has_renderer=use_render,
        use_camera_obs=True,
        ignore_done=True,
        has_offscreen_renderer=True,
        camera_height=size,
        camera_width=size,
        render_collision_mesh=use_render,
        render_visual_mesh=True,
        camera_name='agentview',
        use_object_obs=False,
        camera_depth=False,
        reward_shaping=True,
    )
    state = env.reset()
    state_dim = 200
    action_dim = env.dof
    max_action = float(1)
    min_action = float(-1)
    policy = TD31v1(state_dim, action_dim, max_action, args)
    directory = "24_07_lr_2_lift/pytorch_models/"
    filename = "SawyerLift-701reward_75.21-agentTD3_ad"
    filename = directory + filename
    print("Load ", filename)
    policy.load(filename)
    avg_reward = 0.
    seeds = list(range(10))
    episode = 1
    for s in seeds:
        torch.manual_seed(s)
        np.random.seed(s)
        print("iteration ", s)
        obs = env.reset()
        obs, state_buffer = stacked_frames(obs, size, args, policy)
        done = False
        for x in range(200):
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            obs, state_buffer = create_next_obs(obs, size, args, state_buffer, policy)
            avg_reward += reward * 10
            if use_render:
                time.sleep(0.02)
                env.render()
        print("average reward so far {}".format(avg_reward / episode))
        episode += 1
    avg_reward /= len(seeds)
    print("---------------------------------------")
    print("Average Reward over the Evaluation Step: %f" % (avg_reward))
    print("---------------------------------------")
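# stacked_frames and create_next_obs are used above but defined elsewhere in the project.
# The sketch below is a minimal, assumed implementation: it stacks the three most recent
# grayscale camera frames into a (3, 84, 84) array, matching the obs_shape used for the
# replay buffer in train_agent. The real helpers may instead feed the policy's image
# encoder (state_dim = 200 suggests a learned feature vector), so treat the shapes and
# the unused `args`/`policy` parameters here as assumptions. Requires numpy, cv2 and
# collections.deque to be imported in this module.

def stacked_frames(obs, size, args, policy):
    """Build the initial frame stack from the first observation of an episode."""
    frame = cv2.cvtColor(obs["image"], cv2.COLOR_RGB2GRAY)   # (size, size) uint8
    state_buffer = deque([frame] * 3, maxlen=3)               # repeat the first frame
    stacked = np.stack(list(state_buffer), axis=0)            # (3, size, size)
    return stacked, state_buffer


def create_next_obs(obs, size, args, state_buffer, policy):
    """Append the newest camera frame and return the updated stack."""
    frame = cv2.cvtColor(obs["image"], cv2.COLOR_RGB2GRAY)
    state_buffer.append(frame)
    stacked = np.stack(list(state_buffer), axis=0)
    return stacked, state_buffer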
def train_agent(args, param):
    """ Trains the agent for one run with the given seed.

    Args:
        args: parsed command-line arguments
        param: random seed for this training run
    """
    use_gym = False
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.locexp) + "/" + str(args.env_name) + '-agent-' + str(args.policy)
    pathname += "_batch_size_" + str(args.batch_size)
    pathname += "_update_freq_" + str(args.target_update_freq)
    pathname += "_num_q_target_" + str(args.num_q_target) + "_seed_" + str(args.seed)
    pathname += "_actor_300_200"
    text = "Start training target_update_freq: {} num_q_target: {} use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)
    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )
        state_dim = 200
        print("State dim, ", state_dim)
        action_dim = env.dof
        print("action_dim ", action_dim)
        max_action = 1
        args.max_episode_steps = 200
    if args.policy == "TD3_ad":
        policy = TD31v1(state_dim, action_dim, max_action, args)
    elif args.policy == "DDPG":
        policy = DDPG(state_dim, action_dim, max_action, args)
    else:
        raise ValueError("Unknown policy {}".format(args.policy))
    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (3, 84, 84)
    action_shape = (action_dim, )
    print("obs", obs_shape)
    print("act", action_shape)
    replay_buffer = ReplayBuffer(obs_shape, action_shape, int(args.buffer_size), args.device)
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean', average_mean, total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we log the episode and train the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f} Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window), time_format(time.time() - t0))
                print(text)
                write_into_file(pathname, text)
                if total_timesteps > args.start_timesteps:
                    policy.train(replay_buffer, writer, 200)
            # We evaluate the policy and we save it periodically
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the training step is done, we reset the state of the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)
            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before args.start_timesteps timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:
            # After args.start_timesteps timesteps, we switch to the model
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the explore_noise parameter is not 0, we add noise to the action and we clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise,
                        size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) + np.random.normal(
                    0, max_action * args.expl_noise,
                    size=action_dim)).clip(-max_action, max_action)
        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        done = float(done)
        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args, state_buffer, policy)
        # We check if the episode is done
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True
        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer obs ", obs.shape)
            print("add to buffer next_obs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 0)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
    # We add the last policy evaluation to our list of evaluations
    evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
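# write_into_file and time_format are called by train_agent but defined elsewhere in the
# project. Below is a minimal sketch of the assumed behaviour: append one log line to a
# text file named after the run, and pretty-print elapsed seconds. The ".txt" suffix and
# the exact time format are assumptions, not the project's confirmed implementation.

def write_into_file(pathname, text):
    """Append one line of log text to '<pathname>.txt'."""
    with open(pathname + ".txt", "a") as f:
        f.write(text + "\n")


def time_format(seconds):
    """Format elapsed seconds as 'Hh MMm SSs'."""
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    return "{:d}h {:02d}m {:02d}s".format(h, m, s)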
def main(args):
    """ Runs evaluation episodes with a trained policy and records them to a video.

    Args:
        args: parsed command-line arguments
    """
    size = 84
    use_render = False
    # use_render = True
    print("use render {} ".format(use_render))
    env = suite.make(
        args.env_name,
        has_renderer=use_render,
        use_camera_obs=True,
        ignore_done=True,
        has_offscreen_renderer=True,
        camera_height=size,
        camera_width=size,
        render_collision_mesh=use_render,
        render_visual_mesh=True,
        camera_name='agentview',
        use_object_obs=False,
        camera_depth=False,
        reward_shaping=True,
    )
    state = env.reset()
    state_dim = 200
    action_dim = env.dof
    max_action = float(1)
    min_action = float(-1)
    # Video writer for the offscreen camera frames
    width = size
    height = size
    fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
    fps = 30
    video_filename = 'output.avi'
    video = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))
    policy = TD31v1(state_dim, action_dim, max_action, args)
    directory = "24_07_lr_2_lift/pytorch_models/"
    filename = "SawyerLift-701reward_75.21-agentTD3_ad"
    filename = directory + filename
    print("Load ", filename)
    policy.load(filename)
    avg_reward = 0.
    seeds = list(range(args.repeat))
    episode = 1
    for s in seeds:
        torch.manual_seed(s)
        np.random.seed(s)
        print("iteration ", s)
        obs = env.reset()
        obs, state_buffer = stacked_frames(obs, size, args, policy)
        done = False
        for x in range(args.timesteps):
            action = policy.select_action(np.array(obs))
            obs, reward, done, _ = env.step(action)
            img = obs["image"]
            # Convert the RGB camera frame to grayscale and stretch its contrast,
            # then write it as a 3-channel frame so the video writer accepts it
            gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            gray = cv2.normalize(gray, None, 255, 0, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)
            name = "images/state-{}.jpg".format(x + 1000)  # assumes the images/ directory exists
            im = Image.fromarray(img)
            im.save(name)
            frame = cv2.merge([gray, gray, gray])
            video.write(frame)
            obs, state_buffer = create_next_obs(obs, size, args, state_buffer, policy)
            avg_reward += reward * 10
            if use_render:
                time.sleep(0.02)
                env.render()
        print("average reward so far {}".format(avg_reward / episode))
        episode += 1
    avg_reward /= len(seeds)
    cv2.destroyAllWindows()
    video.release()
    print("---------------------------------------")
    print("Average Reward over the Evaluation Step: %f" % (avg_reward))
    print("---------------------------------------")
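# A possible command-line entry point. The scripts above only show how `args` is used,
# so the flag names below simply mirror the attributes accessed in main() and
# train_agent(); the default values are placeholders, not the project's settings.
# Call main(args) instead of train_agent(args, args.seed) to run the evaluation scripts.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', default='SawyerLift', type=str)
    parser.add_argument('--policy', default='TD3_ad', type=str)        # TD3_ad or DDPG
    parser.add_argument('--locexp', default='results', type=str)       # experiment folder
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--target_update_freq', default=1000, type=int)
    parser.add_argument('--num_q_target', default=4, type=int)
    parser.add_argument('--buffer_size', default=100000, type=int)
    parser.add_argument('--max_timesteps', default=1000000, type=int)
    parser.add_argument('--start_timesteps', default=10000, type=int)
    parser.add_argument('--eval_freq', default=5000, type=int)
    parser.add_argument('--expl_noise', default=0.1, type=float)
    parser.add_argument('--reward_scalling', default=1.0, type=float)  # spelling follows the code above
    parser.add_argument('--tensorboard_freq', default=5, type=int)
    parser.add_argument('--repeat', default=10, type=int)              # evaluation seeds in main()
    parser.add_argument('--timesteps', default=200, type=int)          # evaluation episode length in main()
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--device', default='cuda' if torch.cuda.is_available() else 'cpu')
    parser.add_argument('--seed', default=0, type=int)
    args = parser.parse_args()
    train_agent(args, args.seed)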