import time
from datetime import datetime
from collections import deque

import numpy as np
import torch
import gym
import robosuite as suite
from torch.utils.tensorboard import SummaryWriter

# Project-local helpers assumed to be importable from the surrounding package:
# DDPG, ReplayBuffer, write_into_file, time_format, evaluate_policy,
# stacked_frames, create_next_obs


def train_agent(args, param):
    """Train the agent on the given environment.

    Args:
        args: parsed hyperparameters and experiment settings.
        param: random seed for this run (overwrites args.seed).
    """
    # The CNN encoder converts image observations of shape [1, 3, 84, 84]
    # into a [1, 200] feature vector.
    use_gym = False
    # Seed the experiment
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    # args.repeat_opt = repeat_opt
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.locexp) + "/" + str(args.env_name) + '-agent-' + str(args.policy)
    pathname += "_batch_size_" + str(args.batch_size)
    pathname += '_update_freq: ' + str(args.target_update_freq) + "num_q_target_" + str(args.num_q_target) + "_seed_" + str(args.seed)
    pathname += "_actor_300_200"
    text = "Start training target_update_freq: {} num_q_target: {} use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)
    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )
        state_dim = 200
        print("State dim, ", state_dim)
        action_dim = env.dof
        print("action_dim ", action_dim)
        max_action = 1
        args.max_episode_steps = 200
    policy = DDPG(state_dim, action_dim, max_action, args)
    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (3, 84, 84)
    action_shape = (action_dim,)
    print("obs", obs_shape)
    print("act", action_shape)
    replay_buffer = ReplayBuffer(obs_shape, action_shape, int(args.buffer_size),
                                 args.image_pad, args.device)
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            # env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean, total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f} Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window), time_format(time.time() - t0))
                print(text)
                write_into_file(pathname, text)
                # policy.train(replay_buffer, writer, episode_timesteps)
                if total_timesteps > args.start_timesteps:
                    policy.train(replay_buffer, writer, 200)
                # We evaluate the episode and we save the policy
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval %= args.eval_freq
                    evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
                    torch.manual_seed(args.seed)
                    np.random.seed(args.seed)
                    save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                        episode_num, evaluations[-1], args.policy)
                    policy.save(save_model)
            # When the episode is done, we reset the state of the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)
            # Set done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before args.start_timesteps timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:
            # After args.start_timesteps timesteps, we switch to the model
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the expl_noise parameter is not 0, we add noise to the action and we clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise,
                        size=env.action_space.shape[0])).clip(env.action_space.low,
                                                              env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) + np.random.normal(
                    0, max_action * args.expl_noise,
                    size=action_dim)).clip(-max_action, max_action)
        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()
        # The agent performs the action in the environment, then reaches
        # the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        done = float(done)
        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args, state_buffer, policy)
        # We check if the episode is done
        # done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True
        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the experience replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer obs ", obs.shape)
            print("add to buffer next_obs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)
        # We update the state, the episode timestep, the total timesteps,
        # and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 0)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
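
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file): builds the argparse
# namespace that train_agent() reads from. The attribute names mirror the
# fields accessed above (args.locexp, args.batch_size, ...); the flag names,
# defaults, and the helper name parse_train_args are illustrative assumptions,
# not the repository's tuned settings.
# ---------------------------------------------------------------------------
def parse_train_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', default='SawyerLift', type=str)   # robosuite task name (assumed)
    parser.add_argument('--policy', default='DDPG', type=str)
    parser.add_argument('--locexp', default='experiments', type=str)    # output directory
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--device', default='cuda', type=str)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--buffer_size', default=100000, type=int)
    parser.add_argument('--image_pad', default=4, type=int)
    parser.add_argument('--max_timesteps', default=1000000, type=int)
    parser.add_argument('--start_timesteps', default=10000, type=int)   # random-action warm-up
    parser.add_argument('--eval_freq', default=5000, type=int)
    parser.add_argument('--tensorboard_freq', default=10, type=int)
    parser.add_argument('--target_update_freq', default=1000, type=int)
    parser.add_argument('--num_q_target', default=4, type=int)
    parser.add_argument('--expl_noise', default=0.1, type=float)
    parser.add_argument('--reward_scalling', default=1.0, type=float)
    parser.add_argument('--debug', action='store_true')
    return parser.parse_args()


# Example entry point (sketch): one training run per seed.
# if __name__ == '__main__':
#     args = parse_train_args()
#     for seed in [0, 1, 2]:
#         train_agent(args, seed)
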
# ---------------------------------------------------------------------------
# Separate example script: tail of a train(cfg, env, agent) loop for DDPG on
# Pendulum-v0. The episode/step loop header and the DDPGConfig, NormalizedActions,
# save_results, and plot_rewards definitions live elsewhere in the repository.
# ---------------------------------------------------------------------------
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
        print('Episode:{}/{}, Reward:{}'.format(i_episode + 1, cfg.train_eps, ep_reward))
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if ma_rewards:
            # exponential moving average of the episode reward
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == '__main__':
    cfg = DDPGConfig()
    env = NormalizedActions(gym.make('Pendulum-v0'))
    env.seed(1)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = DDPG(state_dim, action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    agent.save(path=SAVED_MODEL_PATH)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards, ma_rewards, tag="train", algo=cfg.algo, path=RESULT_PATH)
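
# ---------------------------------------------------------------------------
# Sketch of what the NormalizedActions wrapper used above typically does
# (assumption: the repository's own implementation may differ, so this class
# uses a distinct, hypothetical name). DDPG actors commonly emit actions in
# [-1, 1], and the wrapper rescales them into the environment's real bounds.
# ---------------------------------------------------------------------------
import gym
import numpy as np


class NormalizedActionsSketch(gym.ActionWrapper):
    """Map actions in [-1, 1] to [action_space.low, action_space.high]."""

    def action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # linear rescale from [-1, 1] to [low, high]
        action = low + (action + 1.0) * 0.5 * (high - low)
        return np.clip(action, low, high)

    def reverse_action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # inverse map from [low, high] back to [-1, 1]
        action = 2.0 * (action - low) / (high - low) - 1.0
        return np.clip(action, -1.0, 1.0)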