def train(self):
    from algos.ppo4multienvs import PPO, ReplayBuffer
    from nets.network import ActorCritic_Norm as ActorCritic

    cl_args = self.cl_args
    log_save_name = cl_args.algo_id + '_' + cl_args.env_id + \
        '_buffer_{}_batch_{}_hidden_{}_lr_{}_maxsteps_{}'.format(
            cl_args.buffer_size, cl_args.batch_size, cl_args.hidden_size,
            cl_args.learning_rate, cl_args.max_steps_per_episodes)
    log_save_path = os.path.join("./runs", log_save_name)
    if os.path.exists(log_save_path):
        shutil.rmtree(log_save_path)
    utli.writer = SummaryWriter(log_save_path)
    model_dir = utli.Save_model_dir(cl_args.algo_id, cl_args.env_id)

    # Create the vectorized training environments and a separate
    # evaluation environment.
    num_envs = 8

    def make_env():
        def _thunk():
            env = gym.make(cl_args.env_id)
            return env
        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    env = gym.make(cl_args.env_id)
    env.seed(0)

    buffer_size = cl_args.buffer_size
    batch_size = cl_args.batch_size

    # Train for 1 million timesteps.
    num_steps = cl_args.num_steps

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Configure the learning-rate singleton; the local name is then
    # reused for the command-line learning rate.
    lr = LearningRate.get_instance()
    lr.lr = 10**(-3)
    lr.decay_factor = 0.5
    lr = cl_args.learning_rate

    evaluate_every = cl_args.evaluate_every

    # The rollout buffer
    replay_buffer = ReplayBuffer(num_total_sizes=buffer_size,
                                 obs_dims=state_dim,
                                 act_dims=action_dim,
                                 batch_size=batch_size)

    # network
    model = ActorCritic(state_dim=state_dim,
                        action_dim=action_dim,
                        hidden_size=128).to(device)

    # policy
    policy = PPO(model=model, replay_buf=replay_buffer, lr=lr, device=device)

    time_step = 0

    # Evaluate the initial network
    evaluations = []

    # begin optimize
    cur_state = envs.reset()
    reward_window = deque(maxlen=50)
    while time_step < num_steps:
        replay_buffer.clear()
        train_r = 0
        for _ in range(buffer_size // batch_size):
            state = torch.FloatTensor(cur_state).unsqueeze(0)
            dist, value = model(state.to(device))
            action = dist.sample()
            log_prob = dist.log_prob(action)

            action = action.cpu().detach().numpy()[0]
            log_prob = log_prob.cpu().detach().numpy()[0]
            value = value.cpu().detach().numpy()[0]

            next_state, reward, done, _ = envs.step(action)
            train_r += reward.sum()

            reward = np.expand_dims(reward, axis=1)
            done = np.expand_dims(done, axis=1)

            replay_buffer.add(cur_obs=cur_state,
                              cur_action=action,
                              reward=reward,
                              done=done,
                              old_log_prob=log_prob,
                              value=value)
            cur_state = next_state
            time_step += 1

            if time_step % evaluate_every == 0:
                evaluation, mean_reward, mean_step = self.evaluate_policy(
                    env=env,
                    model=model,
                    time_step=time_step,
                    evaluation_trajectories=6)
                evaluations.append(evaluation)
                reward_window.append(mean_reward)
                print(np.mean(reward_window))
                utli.recordEvaluateResults(
                    results=(mean_reward, mean_step, np.mean(reward_window)),
                    time_step=time_step)

        # compute returns
        returns = policy.compute_gae(next_state=next_state)
        returns = replay_buffer.cat(returns)

        # training PPO policy
        value_losses, ppo_losses, entropys, losses = policy.train(
            returns=returns)

        utli.recordTrainResults(results=(train_r,
                                         np.mean(np.array(value_losses)),
                                         np.mean(np.array(ppo_losses)),
                                         np.mean(np.array(entropys)),
                                         np.mean(np.array(losses))),
                                time_step=time_step)

    # last evaluation
    last_evaluation, mean_reward, mean_step = self.evaluate_policy(
        env=env,
        model=model,
        time_step=time_step,
        evaluation_trajectories=6)
    evaluations.append(last_evaluation)
    reward_window.append(mean_reward)
    print(np.mean(reward_window))
    utli.recordEvaluateResults(results=(mean_reward, mean_step,
                                        np.mean(reward_window)),
                               time_step=time_step)

    # store results
    utli.store_results(evaluations, (time_step + 1), cl_args)
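
# ---------------------------------------------------------------------------
# Reference sketch (not part of the training loop above): a minimal GAE
# computation of the kind `policy.compute_gae` is expected to perform on the
# stored rewards, values, and done flags. The names `gamma`, `lam`, and the
# (T, num_envs) array layout are assumptions made for illustration; the
# actual implementation in algos.ppo4multienvs may differ.
# ---------------------------------------------------------------------------
def _gae_returns_sketch(rewards, values, dones, next_value,
                        gamma=0.99, lam=0.95):
    """Compute GAE(lambda) returns for arrays shaped (T, num_envs)."""
    import numpy as np

    T = rewards.shape[0]
    returns = np.zeros_like(rewards)
    gae = np.zeros_like(next_value)
    # Walk the rollout backwards, bootstrapping from the value of the
    # state that follows the last stored transition.
    for t in reversed(range(T)):
        value_next = next_value if t == T - 1 else values[t + 1]
        not_done = 1.0 - dones[t]
        delta = rewards[t] + gamma * value_next * not_done - values[t]
        gae = delta + gamma * lam * not_done * gae
        returns[t] = gae + values[t]
    return returns
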
def train(self):
    from algos.ppo4Categorical import PPO, ReplayBuffer
    from nets.network import ActorCritic_Cate_zm01 as ActorCritic

    cl_args = self.cl_args
    log_save_name = cl_args.algo_id + '_' + cl_args.env_id + \
        '_buffer_{}_batch_{}_hidden_{}_lr_{}_maxsteps_{}'.format(
            cl_args.buffer_size, cl_args.batch_size, cl_args.hidden_size,
            cl_args.learning_rate, cl_args.max_steps_per_episodes)
    log_save_path = os.path.join("./runs", log_save_name)
    if os.path.exists(log_save_path):
        shutil.rmtree(log_save_path)
    utli.writer = SummaryWriter(log_save_path)
    model_dir = utli.Save_model_dir(cl_args.algo_id, cl_args.env_id)

    # Create the training and evaluation environments.
    env = gym.make(cl_args.env_id)
    env_evaluate = gym.make(cl_args.env_id)
    env = env.unwrapped
    env_evaluate = env_evaluate.unwrapped
    # env.seed(0)

    buffer_size = cl_args.buffer_size
    batch_size = cl_args.batch_size

    # Train for 1 million timesteps.
    num_steps = cl_args.num_steps

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    # max_action = float(env.action_space.high[0])

    # Configure the learning-rate singleton; the local name is then
    # reused for the command-line learning rate.
    lr = LearningRate.get_instance()
    lr.lr = 10**(-3)
    lr.decay_factor = 0.5
    lr = cl_args.learning_rate

    ppo_epoch = cl_args.ppo_epoch
    evaluate_every = cl_args.evaluate_every
    max_steps_per_episodes = cl_args.max_steps_per_episodes
    stop_condition = cl_args.stop_condition
    use_device = cl_args.use_device

    # The rollout buffer (a discrete action is stored as a single index)
    replay_buffer = ReplayBuffer(num_total_sizes=buffer_size,
                                 obs_dims=state_dim,
                                 act_dims=1,
                                 batch_size=batch_size)

    # network
    if use_device:
        model = ActorCritic(state_dim=state_dim,
                            action_dim=action_dim,
                            hidden_size=cl_args.hidden_size).to(device)
    else:
        model = ActorCritic(state_dim=state_dim,
                            action_dim=action_dim,
                            hidden_size=cl_args.hidden_size)

    # policy
    policy = PPO(model=model,
                 replay_buf=replay_buffer,
                 lr=lr,
                 device=device,
                 use_device=use_device,
                 ppo_epoch=ppo_epoch,
                 weight_epsilon=0.0)

    time_step = 0

    # Evaluate the initial network
    evaluations = []

    # begin optimize
    # cur_state = env.reset()
    reward_window4Train = deque(maxlen=100)
    reward_window4Evaluate = deque(maxlen=100)
    episode_t = 0
    count = 0
    S_time = time.time()
    while time_step < num_steps:
        episode_t += 1
        cur_state = env.reset()
        path_length, path_rewards = 0, 0.
        while True:
            path_length += 1
            time_step += 1
            state = torch.FloatTensor(cur_state).unsqueeze(0)
            # state = torch.FloatTensor(cur_state[None])
            if use_device:
                with torch.no_grad():
                    action, old_log_prob, value = model.select_action(
                        state.to(device))
            else:
                with torch.no_grad():
                    action, old_log_prob, value = model.select_action(state)
            next_state, reward, done, _ = env.step(action)
            # reward = np.expand_dims(reward, axis=1)
            # done = np.expand_dims(done, axis=1)
            replay_buffer.add(cur_obs=cur_state,
                              cur_action=action,
                              reward=reward,
                              done=done,
                              old_log_prob=old_log_prob.cpu(),
                              value=value)
            cur_state = next_state

            if replay_buffer.enough_data:
                next_state = torch.FloatTensor(next_state).unsqueeze(0)
                if use_device:
                    with torch.no_grad():
                        _, _, next_value = model.select_action(
                            next_state.to(device))
                else:
                    with torch.no_grad():
                        _, _, next_value = model.select_action(next_state)

                # compute returns
                # returns = policy.compute_gae(next_state=next_state)
                returns = replay_buffer.compute_gae(next_value=next_value)

                # training PPO policy
                value_losses, ppo_losses, entropys, losses = policy.train(
                    returns=returns)

                utli.recordLossResults(
                    results=(np.mean(np.array(value_losses)),
                             np.mean(np.array(ppo_losses)),
                             np.mean(np.array(entropys)),
                             np.mean(np.array(losses))),
                    time_step=time_step)
                replay_buffer.clear()

            path_rewards += reward

            if done or max_steps_per_episodes == path_length:
                break

            if time_step % evaluate_every == 0:
                evaluation, mean_reward, mean_step = self.evaluate_policy(
                    env=env_evaluate,
                    model=model,
                    time_step=time_step,
                    use_device=use_device,
                    max_step=max_steps_per_episodes,
                    evaluation_trajectories=6)
                evaluations.append(evaluation)
                reward_window4Evaluate.append(mean_reward)
                utli.recordEvaluateResults(
                    results=(mean_reward, mean_step,
                             np.mean(reward_window4Evaluate)),
                    time_step=time_step)

        reward_window4Train.append(path_rewards)
        utli.recordTrainResults(results=(path_rewards,
                                         path_length,
                                         np.mean(reward_window4Train)),
                                time_step=time_step)
        print("Episode: %d, Time steps: %d, Path length: %d, Reward: %f"
              % (episode_t, time_step, path_length, path_rewards))

        count = utli.Save_trained_model(
            count=count,
            num=cl_args.num_model,
            model=model,
            model_dir=model_dir,
            stop_condition=stop_condition,
            reward_window4Train=reward_window4Train,
            reward_window4Evaluate=reward_window4Evaluate)

    # last evaluation
    evaluation, mean_reward, mean_step = self.evaluate_policy(
        env=env_evaluate,
        model=model,
        time_step=time_step,
        use_device=use_device,
        max_step=max_steps_per_episodes,
        evaluation_trajectories=6)
    evaluations.append(evaluation)
    reward_window4Evaluate.append(mean_reward)
    utli.recordEvaluateResults(results=(mean_reward, mean_step,
                                        np.mean(reward_window4Evaluate)),
                               time_step=time_step)

    E_time = time.time()

    # store results
    utli.store_results(evaluations, (time_step + 1), cl_args,
                       S_time=S_time, E_time=E_time)
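
# ---------------------------------------------------------------------------
# Reference sketch (not part of the training loop above): the clipped PPO
# surrogate loss that `policy.train` is expected to minimise for one
# minibatch. `clip_eps`, the loss coefficients, and the tensor arguments are
# illustrative assumptions; the actual update lives in
# algos.ppo4Categorical.PPO.
# ---------------------------------------------------------------------------
def _ppo_loss_sketch(new_log_prob, old_log_prob, advantage, value,
                     target_return, entropy,
                     clip_eps=0.2, value_coef=0.5, entropy_coef=0.01):
    """Clipped-surrogate PPO loss for a single minibatch of tensors."""
    import torch

    # Probability ratio between the current policy and the policy that
    # collected the data (stored as old_log_prob in the buffer).
    ratio = torch.exp(new_log_prob - old_log_prob)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    ppo_loss = -torch.min(surr1, surr2).mean()
    # Squared error between the critic's value and the GAE return target.
    value_loss = (target_return - value).pow(2).mean()
    # Entropy bonus encourages exploration; it is subtracted from the loss.
    return ppo_loss + value_coef * value_loss - entropy_coef * entropy.mean()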