def train(memory):
    """Run one PPO update over a batch of collected transitions."""
    batch = memory.sample()
    batch_states = DOUBLE(batch.state).to(device)
    batch_actions = DOUBLE(batch.action).to(device)
    batch_log_probs = DOUBLE(batch.log_prob).to(device)
    batch_masks = DOUBLE(batch.mask).to(device)
    batch_rewards = DOUBLE(batch.reward).to(device)
    batch_size = batch_states.shape[0]

    # evaluate state values without tracking gradients; they are only
    # inputs to the advantage estimator, not part of this update's graph
    with torch.no_grad():
        batch_values = critic(batch_states)

    batch_advantages, batch_returns = estimate_advantages(
        batch_rewards, batch_masks, batch_values, gamma, tau)

    # mini-batch ppo update
    mini_batch_num = int(math.ceil(batch_size / mini_batch_size))
    for _ in range(ppo_epochs):
        # reshuffle once per epoch so mini-batches differ between passes
        idx = torch.randperm(batch_size)
        for i in range(mini_batch_num):
            mini_batch_idx = idx[i * mini_batch_size:
                                 min((i + 1) * mini_batch_size, batch_size)]
            mini_batch_states, mini_batch_actions, mini_batch_log_probs, \
                mini_batch_returns, mini_batch_advantages = \
                batch_states[mini_batch_idx], batch_actions[mini_batch_idx], \
                batch_log_probs[mini_batch_idx], batch_returns[mini_batch_idx], \
                batch_advantages[mini_batch_idx]
            ppo_step(actor, critic, opt_p, opt_v, 1, mini_batch_states,
                     mini_batch_actions, mini_batch_returns,
                     mini_batch_advantages, mini_batch_log_probs,
                     epsilon, 1e-3)

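# All three training routines in this section call estimate_advantages()
# to turn rewards, masks, and value predictions into advantages and
# returns. The helper is defined elsewhere in the repo; the sketch below
# is a minimal reconstruction assuming the standard GAE(gamma, tau)
# recursion, with mask = 0 at terminal steps and 1 elsewhere, and with
# all inputs as 1-D tensors of equal length on the same device.
import torch


def estimate_advantages(rewards, masks, values, gamma, tau):
    """Sketch: GAE advantages and discounted returns over a flat batch."""
    deltas = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    # iterate backwards; a mask of 0 cuts the bootstrap at episode ends
    for t in reversed(range(rewards.size(0))):
        deltas[t] = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = deltas[t] + gamma * tau * prev_advantage * masks[t]
        prev_value = values[t]
        prev_advantage = advantages[t]
    returns = values + advantages
    # normalizing advantages is a common stabilizer in PPO implementations
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns
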
def learn(self, writer, i_iter):
    """Collect samples, log reward statistics, and run mini-batch PPO updates."""
    memory, log = self.collector.collect_samples(self.min_batch_size)
    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )
    # record reward information
    writer.add_scalars(
        "PPO_mini_batch", {
            "total reward": log['total_reward'],
            "average reward": log['avg_reward'],
            "min reward": log['min_episode_reward'],
            "max reward": log['max_episode_reward'],
            "num steps": log['num_steps']
        }, i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = FLOAT(batch.state).to(device)
    batch_action = FLOAT(batch.action).to(device)
    batch_log_prob = FLOAT(batch.log_prob).to(device)
    batch_reward = FLOAT(batch.reward).to(device)
    batch_mask = FLOAT(batch.mask).to(device)
    batch_size = batch_state.shape[0]

    with torch.no_grad():
        batch_values = self.value_net(batch_state)

    batch_advantages, batch_returns = estimate_advantages(
        batch_reward, batch_mask, batch_values, self.gamma, self.tau)

    v_loss, p_loss = torch.empty(1), torch.empty(1)
    mini_batch_num = int(math.ceil(batch_size / self.ppo_mini_batch_size))

    # update with mini-batch
    for _ in range(self.ppo_epochs):
        index = torch.randperm(batch_size)
        for i in range(mini_batch_num):
            ind = index[i * self.ppo_mini_batch_size:
                        min(batch_size, (i + 1) * self.ppo_mini_batch_size)]
            state, action, returns, advantages, old_log_pis = \
                batch_state[ind], batch_action[ind], batch_returns[ind], \
                batch_advantages[ind], batch_log_prob[ind]
            v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                      self.optimizer_p, self.optimizer_v, 1,
                                      state, action, returns, advantages,
                                      old_log_pis, self.clip_epsilon, 1e-3)
    # losses from the final mini-batch are returned for logging
    return v_loss, p_loss

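# ppo_step() is the shared update routine used at every call site above
# and below. Its definition is not shown in this section; the sketch below
# is a minimal reconstruction of the clipped-surrogate PPO update,
# assuming the argument order implied by the calls: (policy_net, value_net,
# optimizer_policy, optimizer_value, optim_value_iternum, states, actions,
# returns, advantages, old_log_probs, clip_epsilon, l2_reg). The
# policy_net.get_log_prob(states, actions) method is an assumption about
# the policy network's interface.
import torch
import torch.nn.functional as F


def ppo_step(policy_net, value_net, optimizer_policy, optimizer_value,
             optim_value_iternum, states, actions, returns, advantages,
             old_log_probs, clip_epsilon, l2_reg):
    # value update: regress V(s) toward empirical returns, with an L2
    # penalty on the value-net weights (flatten guards shape mismatches)
    for _ in range(optim_value_iternum):
        values_pred = value_net(states)
        value_loss = F.mse_loss(values_pred.flatten(), returns.flatten())
        for param in value_net.parameters():
            value_loss = value_loss + param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # policy update: clipped importance-weighted surrogate objective
    log_probs = policy_net.get_log_prob(states, actions)  # assumed method
    ratio = torch.exp(log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    # returned in (value, policy) order to match v_loss, p_loss = ppo_step(...)
    return value_loss, policy_loss
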
def learn(self, writer, i_iter):
    """Collect samples, log reward statistics, and run full-batch PPO updates."""
    memory, log = self.collector.collect_samples(self.min_batch_size)
    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )
    # record reward information
    writer.add_scalars(
        "ppo", {
            "total reward": log['total_reward'],
            "average reward": log['avg_reward'],
            "min reward": log['min_episode_reward'],
            "max reward": log['max_episode_reward'],
            "num steps": log['num_steps']
        }, i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)
    batch_log_prob = DOUBLE(batch.log_prob).to(device)

    with torch.no_grad():
        batch_value = self.value_net(batch_state)

    batch_advantage, batch_return = estimate_advantages(
        batch_reward, batch_mask, batch_value, self.gamma, self.tau)

    v_loss, p_loss = torch.empty(1), torch.empty(1)
    # full-batch variant: each PPO epoch optimizes over the whole batch
    for _ in range(self.ppo_epochs):
        v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                  self.optimizer_p, self.optimizer_v, 1,
                                  batch_state, batch_action, batch_return,
                                  batch_advantage, batch_log_prob,
                                  self.clip_epsilon, 1e-3)

    # keep a frozen copy of the updated policy for the next iteration
    self.policy_net_old.load_state_dict(self.policy_net.state_dict())
    return v_loss, p_loss

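# FLOAT, DOUBLE, and device are small utilities assumed to be defined in a
# shared module of the repo. A plausible minimal definition consistent
# with how they are used above (converting batches of numpy arrays to
# tensors of a fixed dtype, then moving them with .to(device)) would be:
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def FLOAT(x):
    # build a float32 tensor from a numpy array or nested sequence
    return torch.tensor(np.array(x), dtype=torch.float32)


def DOUBLE(x):
    # build a float64 tensor; double precision matches numpy's default
    # dtype and avoids mismatches when mixing numpy and torch math
    return torch.tensor(np.array(x), dtype=torch.float64)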