def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    writer.add_scalar("total reward", log['total_reward'], i_iter)
    writer.add_scalar("average reward", log['avg_reward'], i_iter)
    writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
    writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
    writer.add_scalar("num steps", log['num_steps'], i_iter)

    batch = memory.sample()  # sample all items in memory

    batch_state = FLOAT(batch.state).to(device)
    batch_action = FLOAT(batch.action).to(device)
    batch_reward = FLOAT(batch.reward).to(device)
    batch_mask = FLOAT(batch.mask).to(device)

    with torch.no_grad():
        batch_value = self.ac_net.get_value(batch_state)

    batch_advantage, batch_return = estimate_advantages(
        batch_reward, batch_mask, batch_value, self.gamma, self.tau)

    alg_step_stats = a2c_step(self.ac_net, self.optimizer_ac, batch_state,
                              batch_action, batch_return, batch_advantage,
                              self.value_net_coeff, self.entropy_coeff)

    return alg_step_stats

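# The helpers FLOAT/DOUBLE, device and estimate_advantages are used throughout this
# file but not defined in these snippets. A minimal sketch of what they might look
# like follows, assuming FLOAT/DOUBLE are thin tensor constructors and that
# estimate_advantages computes GAE over a rollout where masks are 0 at episode ends.
# The actual implementations in the repository may differ.
import torch

FLOAT = torch.FloatTensor
DOUBLE = torch.DoubleTensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def estimate_advantages(rewards, masks, values, gamma, tau):
    """GAE sketch: rewards, masks, values are [T, 1] tensors; tau is the GAE lambda."""
    deltas = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)

    prev_value = 0.0
    prev_advantage = 0.0
    # iterate backwards through the (possibly multi-episode) rollout
    for t in reversed(range(rewards.size(0))):
        deltas[t] = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = deltas[t] + gamma * tau * prev_advantage * masks[t]
        prev_value = values[t]
        prev_advantage = advantages[t]

    returns = values + advantages
    # normalize advantages for a more stable policy update
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns
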
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    writer.add_scalar("total reward", log['total_reward'], i_iter)
    writer.add_scalar("average reward", log['avg_reward'], i_iter)
    writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
    writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
    writer.add_scalar("num steps", log['num_steps'], i_iter)

    batch = memory.sample()  # sample all items in memory

    batch_state = FLOAT(batch.state).to(device)
    batch_action = FLOAT(batch.action).to(device)
    batch_reward = FLOAT(batch.reward).to(device)
    batch_mask = FLOAT(batch.mask).to(device)
    batch_log_prob = FLOAT(batch.log_prob).to(device)

    with torch.no_grad():
        batch_value = self.value_net(batch_state)

    batch_advantage, batch_return = estimate_advantages(
        batch_reward, batch_mask, batch_value, self.gamma, self.tau)

    # update by TRPO
    trpo_step(self.policy_net, self.value_net, batch_state, batch_action,
              batch_return, batch_advantage, batch_log_prob, self.max_kl,
              self.damping, 1e-3, None)

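# trpo_step is not shown in these snippets. As a hedged sketch, the two quantities a
# TRPO update is typically built on are given below: the importance-sampled surrogate
# loss and the mean KL divergence that self.max_kl constrains (the conjugate-gradient
# direction and backtracking line search are omitted). The policy_net.get_log_prob
# interface and the diagonal-Gaussian policy parameterization are assumptions, not
# the repository's actual API.
import torch


def surrogate_loss(policy_net, states, actions, advantages, old_log_probs):
    """L(theta) = E[ exp(log pi - log pi_old) * A ], to be maximized."""
    log_probs = policy_net.get_log_prob(states, actions)
    ratio = torch.exp(log_probs - old_log_probs)
    return (ratio * advantages).mean()


def mean_gaussian_kl(p_old, p_new):
    """Average KL(p_old || p_new) between diagonal Gaussians given as (mean, log_std)."""
    mean_old, log_std_old = p_old
    mean_new, log_std_new = p_new
    var_old, var_new = torch.exp(2 * log_std_old), torch.exp(2 * log_std_new)
    kl = log_std_new - log_std_old \
        + (var_old + (mean_old - mean_new) ** 2) / (2 * var_new) - 0.5
    return kl.sum(dim=-1).mean()
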
def train(memory):
    batch = memory.sample()
    batch_states = DOUBLE(batch.state).to(device)
    batch_actions = DOUBLE(batch.action).to(device)
    batch_log_probs = DOUBLE(batch.log_prob).to(device)
    batch_masks = DOUBLE(batch.mask).to(device)
    batch_rewards = DOUBLE(batch.reward).to(device)

    batch_size = batch_states.shape[0]

    with torch.no_grad():
        batch_values = critic(batch_states)

    batch_advantages, batch_returns = estimate_advantages(
        batch_rewards, batch_masks, batch_values, gamma, tau)

    # mini-batch ppo update
    mini_batch_num = int(math.ceil(batch_size / mini_batch_size))
    for _ in range(ppo_epochs):
        idx = torch.randperm(batch_size)

        for i in range(mini_batch_num):
            mini_batch_idx = idx[i * mini_batch_size:
                                 min((i + 1) * mini_batch_size, batch_size)]

            mini_batch_states, mini_batch_actions, mini_batch_log_probs, \
                mini_batch_returns, mini_batch_advantages = \
                batch_states[mini_batch_idx], batch_actions[mini_batch_idx], \
                batch_log_probs[mini_batch_idx], batch_returns[mini_batch_idx], \
                batch_advantages[mini_batch_idx]

            ppo_step(actor, critic, opt_p, opt_v, 1, mini_batch_states,
                     mini_batch_actions, mini_batch_returns,
                     mini_batch_advantages, mini_batch_log_probs, epsilon, 1e-3)

def train(self, memory):
    batch = memory.sample()
    batch_states = FLOAT(batch.state).to(device)
    batch_actions = FLOAT(batch.action).to(device)
    batch_log_probs = FLOAT(batch.log_prob).to(device)
    batch_masks = FLOAT(batch.mask).to(device)
    batch_rewards = FLOAT(batch.reward).to(device)

    batch_size = batch_states.shape[0]

    with torch.no_grad():
        batch_values = self.value(batch_states)

    batch_advantages, batch_returns = estimate_advantages(
        batch_rewards, batch_masks, batch_values, self.gamma, self.tau)

    # mini-batch ppo update
    mini_batch_num = int(math.ceil(batch_size / self.mini_batch_size))
    for _ in range(self.ppo_epochs):
        idx = torch.randperm(batch_size)

        for i in range(mini_batch_num):
            mini_batch_idx = idx[i * self.mini_batch_size:
                                 min((i + 1) * self.mini_batch_size, batch_size)]

            mini_batch_states, mini_batch_actions, mini_batch_log_probs, \
                mini_batch_returns, mini_batch_advantages = \
                batch_states[mini_batch_idx], batch_actions[mini_batch_idx], \
                batch_log_probs[mini_batch_idx], batch_returns[mini_batch_idx], \
                batch_advantages[mini_batch_idx]

            self.ppo_step(mini_batch_states, mini_batch_actions,
                          mini_batch_returns, mini_batch_advantages,
                          mini_batch_log_probs, self.epsilon, 1e-3)

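# ppo_step is called throughout this file but not defined here. A minimal sketch
# consistent with the call sites is given below: an MSE value fit with an L2 penalty,
# followed by a clipped importance-sampling surrogate for the policy. The exact
# signature, the policy_net.get_log_prob interface, and the return values (which
# differ between the snippets above) are assumptions.
import torch


def ppo_step(policy_net, value_net, optimizer_policy, optimizer_value,
             optim_value_iternum, states, actions, returns, advantages,
             old_log_probs, clip_epsilon, l2_reg):
    # value update: fit V(s) to the empirical returns
    for _ in range(optim_value_iternum):
        values_pred = value_net(states)
        value_loss = (values_pred - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # policy update: clipped surrogate objective
    log_probs = policy_net.get_log_prob(states, actions)
    ratio = torch.exp(log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    return value_loss.item(), policy_loss.item()
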
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    writer.add_scalars(
        "PPO_mini_batch", {
            "total reward": log['total_reward'],
            "average reward": log['avg_reward'],
            "min reward": log['min_episode_reward'],
            "max reward": log['max_episode_reward'],
            "num steps": log['num_steps']
        }, i_iter)

    batch = memory.sample()  # sample all items in memory

    batch_state = FLOAT(batch.state).to(device)
    batch_action = FLOAT(batch.action).to(device)
    batch_log_prob = FLOAT(batch.log_prob).to(device)
    batch_reward = FLOAT(batch.reward).to(device)
    batch_mask = FLOAT(batch.mask).to(device)
    batch_size = batch_state.shape[0]

    with torch.no_grad():
        batch_values = self.value_net(batch_state)

    batch_advantages, batch_returns = estimate_advantages(
        batch_reward, batch_mask, batch_values, self.gamma, self.tau)

    v_loss, p_loss = torch.empty(1), torch.empty(1)
    mini_batch_num = int(math.ceil(batch_size / self.ppo_mini_batch_size))

    # update with mini-batch
    for _ in range(self.ppo_epochs):
        index = torch.randperm(batch_size)

        for i in range(mini_batch_num):
            ind = index[slice(i * self.ppo_mini_batch_size,
                              min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
            state, action, returns, advantages, old_log_pis = \
                batch_state[ind], batch_action[ind], batch_returns[ind], \
                batch_advantages[ind], batch_log_prob[ind]

            v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                      self.optimizer_p, self.optimizer_v, 1,
                                      state, action, returns, advantages,
                                      old_log_pis, self.clip_epsilon, 1e-3)

    return v_loss, p_loss

def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    writer.add_scalars(
        "ppo", {
            "total reward": log['total_reward'],
            "average reward": log['avg_reward'],
            "min reward": log['min_episode_reward'],
            "max reward": log['max_episode_reward'],
            "num steps": log['num_steps']
        }, i_iter)

    batch = memory.sample()  # sample all items in memory

    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)
    batch_log_prob = DOUBLE(batch.log_prob).to(device)

    with torch.no_grad():
        batch_value = self.value_net(batch_state)

    batch_advantage, batch_return = estimate_advantages(
        batch_reward, batch_mask, batch_value, self.gamma, self.tau)

    v_loss, p_loss = torch.empty(1), torch.empty(1)
    for _ in range(self.ppo_epochs):
        v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                  self.optimizer_p, self.optimizer_v, 1,
                                  batch_state, batch_action, batch_return,
                                  batch_advantage, batch_log_prob,
                                  self.clip_epsilon, 1e-3)

    self.policy_net_old.load_state_dict(self.policy_net.state_dict())

    return v_loss, p_loss

def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    writer.add_scalars(
        "vpg", {
            "total reward": log['total_reward'],
            "average reward": log['avg_reward'],
            "min reward": log['min_episode_reward'],
            "max reward": log['max_episode_reward'],
            "num steps": log['num_steps']
        }, i_iter)

    batch = memory.sample()  # sample all items in memory

    batch_state = FLOAT(batch.state).to(device)
    batch_action = FLOAT(batch.action).to(device)
    batch_reward = FLOAT(batch.reward).to(device)
    batch_mask = FLOAT(batch.mask).to(device)

    with torch.no_grad():
        batch_value = self.value_net(batch_state)

    batch_advantage, batch_return = estimate_advantages(
        batch_reward, batch_mask, batch_value, self.gamma, self.tau)

    v_loss, p_loss = vpg_step(self.policy_net, self.value_net,
                              self.optimizer_p, self.optimizer_v,
                              self.vpg_epochs, batch_state, batch_action,
                              batch_return, batch_advantage, 1e-3)

    return v_loss, p_loss

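# vpg_step is not defined in these snippets. A minimal sketch consistent with the
# call above is given below: a vanilla policy gradient weighted by the GAE advantage,
# with a learned value baseline regressed onto the returns. The argument order
# matches the call site; the policy_net.get_log_prob interface, the per-epoch loop,
# and the use of the final l2_reg argument as a value-net L2 penalty are assumptions.
import torch


def vpg_step(policy_net, value_net, optimizer_policy, optimizer_value, epochs,
             states, actions, returns, advantages, l2_reg):
    for _ in range(epochs):
        # value update: regress V(s) onto the empirical returns
        values_pred = value_net(states)
        value_loss = (values_pred - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

        # policy update: REINFORCE-style gradient weighted by the advantage
        log_probs = policy_net.get_log_prob(states, actions)
        policy_loss = -(log_probs * advantages).mean()
        optimizer_policy.zero_grad()
        policy_loss.backward()
        optimizer_policy.step()

    return value_loss.item(), policy_loss.item()
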
def learn(self, writer, i_iter):
    memory, log = self.collector.collect_samples(
        self.config["train"]["generator"]["sample_batch_size"])

    self.policy.train()
    self.value.train()
    self.discriminator.train()

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    writer.add_scalar("gail/average reward", log['avg_reward'], i_iter)
    writer.add_scalar("gail/num steps", log['num_steps'], i_iter)

    # collect generated batch
    # gen_batch = self.collect_samples(self.config["ppo"]["sample_batch_size"])
    gen_batch = memory.sample()
    gen_batch_state = FLOAT(gen_batch.state).to(device)  # [batch size, state size]
    gen_batch_action = FLOAT(gen_batch.action).to(device)  # [batch size, action size]
    gen_batch_old_log_prob = FLOAT(gen_batch.log_prob).to(device)  # [batch size, 1]
    gen_batch_mask = FLOAT(gen_batch.mask).to(device)  # [batch, 1]

    ####################################################
    # update discriminator
    ####################################################
    d_optim_i_iters = self.config["train"]["discriminator"]["optim_step"]
    if i_iter % d_optim_i_iters == 0:
        for step, (expert_batch_state, expert_batch_action) in enumerate(
                self.expert_dataset.train_loader):
            if step >= d_optim_i_iters:
                break

            # calculate probs and logits
            gen_prob, gen_logits = self.discriminator(gen_batch_state,
                                                      gen_batch_action)
            expert_prob, expert_logits = self.discriminator(
                expert_batch_state.to(device), expert_batch_action.to(device))

            # calculate accuracy
            gen_acc = torch.mean((gen_prob < 0.5).float())
            expert_acc = torch.mean((expert_prob > 0.5).float())

            # calculate regression loss
            expert_labels = torch.ones_like(expert_prob)
            gen_labels = torch.zeros_like(gen_prob)
            e_loss = self.discriminator_func(expert_prob, target=expert_labels)
            g_loss = self.discriminator_func(gen_prob, target=gen_labels)
            d_loss = e_loss + g_loss

            # calculate entropy loss
            logits = torch.cat([gen_logits, expert_logits], 0)
            entropy = ((1. - torch.sigmoid(logits)) * logits -
                       torch.nn.functional.logsigmoid(logits)).mean()
            entropy_loss = -self.config["train"]["discriminator"]["ent_coeff"] * entropy

            total_loss = d_loss + entropy_loss

            self.optimizer_discriminator.zero_grad()
            total_loss.backward()
            self.optimizer_discriminator.step()

        writer.add_scalar('discriminator/d_loss', d_loss.item(), i_iter)
        writer.add_scalar("discriminator/e_loss", e_loss.item(), i_iter)
        writer.add_scalar("discriminator/g_loss", g_loss.item(), i_iter)
        writer.add_scalar("discriminator/ent", entropy.item(), i_iter)
        writer.add_scalar('discriminator/expert_acc', expert_acc.item(), i_iter)
        writer.add_scalar('discriminator/gen_acc', gen_acc.item(), i_iter)

    ####################################################
    # update policy by ppo [mini_batch]
    ####################################################
    with torch.no_grad():
        gen_batch_value = self.value(gen_batch_state)
        d_out, _ = self.discriminator(gen_batch_state, gen_batch_action)
        gen_batch_reward = -torch.log(1 - d_out + 1e-6)

    gen_batch_advantage, gen_batch_return = estimate_advantages(
        gen_batch_reward, gen_batch_mask, gen_batch_value,
        self.config["train"]["generator"]["gamma"],
        self.config["train"]["generator"]["tau"])

    ppo_optim_i_iters = self.config["train"]["generator"]["optim_step"]
    ppo_mini_batch_size = self.config["train"]["generator"]["mini_batch_size"]

    for _ in range(ppo_optim_i_iters):
        if ppo_mini_batch_size > 0:
            gen_batch_size = gen_batch_state.shape[0]
            optim_iter_num = int(math.ceil(gen_batch_size / ppo_mini_batch_size))
            perm = torch.randperm(gen_batch_size)

            for i in range(optim_iter_num):
                ind = perm[slice(i * ppo_mini_batch_size,
                                 min((i + 1) * ppo_mini_batch_size, gen_batch_size))]
                mini_batch_state, mini_batch_action, mini_batch_advantage, \
                    mini_batch_return, mini_batch_old_log_prob = \
                    gen_batch_state[ind], gen_batch_action[ind], \
                    gen_batch_advantage[ind], gen_batch_return[ind], \
                    gen_batch_old_log_prob[ind]

                v_loss, p_loss, ent_loss = ppo_step(
                    policy_net=self.policy,
                    value_net=self.value,
                    optimizer_policy=self.optimizer_policy,
                    optimizer_value=self.optimizer_value,
                    optim_value_iternum=self.config["value"]["optim_value_iter"],
                    states=mini_batch_state,
                    actions=mini_batch_action,
                    returns=mini_batch_return,
                    old_log_probs=mini_batch_old_log_prob,
                    advantages=mini_batch_advantage,
                    clip_epsilon=self.config["train"]["generator"]["clip_ratio"],
                    l2_reg=self.config["value"]["l2_reg"])
        else:
            v_loss, p_loss, ent_loss = ppo_step(
                policy_net=self.policy,
                value_net=self.value,
                optimizer_policy=self.optimizer_policy,
                optimizer_value=self.optimizer_value,
                optim_value_iternum=self.config["value"]["optim_value_iter"],
                states=gen_batch_state,
                actions=gen_batch_action,
                returns=gen_batch_return,
                old_log_probs=gen_batch_old_log_prob,
                advantages=gen_batch_advantage,
                clip_epsilon=self.config["train"]["generator"]["clip_ratio"],
                l2_reg=self.config["value"]["l2_reg"])

    writer.add_scalar('generator/p_loss', p_loss, i_iter)
    writer.add_scalar('generator/v_loss', v_loss, i_iter)
    writer.add_scalar('generator/ent_loss', ent_loss, i_iter)

    print(f" Training episode:{i_iter} ".center(80, "#"))
    print('d_gen_prob:', gen_prob.mean().item())
    print('d_expert_prob:', expert_prob.mean().item())
    print('d_loss:', d_loss.item())
    print('e_loss:', e_loss.item())
    print("d/bernoulli_entropy:", entropy.item())

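# The GAIL update above expects self.discriminator to return (prob, logits) for a
# (state, action) batch, so that prob = sigmoid(logits) and the generator reward is
# -log(1 - prob). A minimal sketch of such a module follows; the hidden sizes and
# activations are arbitrary illustrative choices, not the repository's architecture.
import torch
import torch.nn as nn


class Discriminator(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, states, actions):
        # logits for "this (state, action) pair came from the expert"
        logits = self.net(torch.cat([states, actions], dim=-1))
        return torch.sigmoid(logits), logits
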
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)

    print(
        f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
        f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
        f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
    )

    # record reward information
    writer.add_scalar("total reward", log['total_reward'], i_iter)
    writer.add_scalar("average reward", log['avg_reward'], i_iter)
    writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
    writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
    writer.add_scalar("num steps", log['num_steps'], i_iter)

    batch = memory.sample()  # sample all items in memory
    # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
    batch_state = FLOAT(batch.state).to(device)
    batch_action = FLOAT(batch.action).to(device)
    batch_reward = FLOAT(batch.reward).to(device)
    batch_mask = FLOAT(batch.mask).to(device)
    batch_log_prob = FLOAT(batch.log_prob).to(device)

    with torch.no_grad():
        batch_value = self.value_net(batch_state)

    batch_advantage, batch_return = estimate_advantages(
        batch_reward, batch_mask, batch_value, self.gamma, self.tau)

    alg_step_stats = {}
    if self.ppo_mini_batch_size:
        batch_size = batch_state.shape[0]
        mini_batch_num = int(math.ceil(batch_size / self.ppo_mini_batch_size))

        # update with mini-batch
        for _ in range(self.ppo_epochs):
            index = torch.randperm(batch_size)

            for i in range(mini_batch_num):
                ind = index[slice(i * self.ppo_mini_batch_size,
                                  min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                state, action, returns, advantages, old_log_pis = \
                    batch_state[ind], batch_action[ind], batch_return[ind], \
                    batch_advantage[ind], batch_log_prob[ind]

                alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                          self.optimizer_p, self.optimizer_v, 1,
                                          state, action, returns, advantages,
                                          old_log_pis, self.clip_epsilon, 1e-3)
    else:
        for _ in range(self.ppo_epochs):
            alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                      self.optimizer_p, self.optimizer_v, 1,
                                      batch_state, batch_action, batch_return,
                                      batch_advantage, batch_log_prob,
                                      self.clip_epsilon, 1e-3)

    return alg_step_stats

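# Every snippet above calls memory.sample() and reads field-wise batches such as
# batch.state and batch.log_prob; the comment in the last function lists the fields
# ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob'). A hedged sketch of
# a replay memory with that interface follows: transitions are stored as a namedtuple
# and sample() transposes them into field-wise tuples ready for tensor construction.
# The repository's actual Memory class may differ.
import random
from collections import namedtuple

Transition = namedtuple(
    'Transition',
    ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob'))


class Memory:
    def __init__(self):
        self.memory = []

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size=None):
        if batch_size is None:
            # return all stored transitions, transposed into field-wise batches
            return Transition(*zip(*self.memory))
        random_batch = random.sample(self.memory, batch_size)
        return Transition(*zip(*random_batch))

    def __len__(self):
        return len(self.memory)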