def train(self, x_train, y_train, x_valid=None, y_valid=None,
          epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
    """Trains the model for a fixed number of epochs (iterations on a dataset).

    Args:
        x_train: list of training data.
        y_train: list of training target (label) data.
        x_valid: list of validation data.
        y_valid: list of validation target (label) data.
        batch_size: Integer. Number of samples per gradient update.
            If unspecified, `batch_size` will default to 32.
        epochs: Integer. Number of epochs to train the model.
        verbose: Integer. 0, 1, or 2. Verbosity mode.
            0 = silent, 1 = progress bar, 2 = one line per epoch.
        callbacks: List of `keras.callbacks.Callback` instances.
            List of callbacks to apply during training.
        shuffle: Boolean (whether to shuffle the training data
            before each epoch). `shuffle` will default to True.
    """
    batcher = Batcher(x_train, y_train, batch_size, self._preprocessor.transform)

    if x_valid and y_valid:
        valid_seq = Batcher(x_valid, y_valid, batch_size, self._preprocessor.transform)
        f1 = F1score(valid_seq, preprocessor=self._preprocessor)
        callbacks = [f1] + callbacks if callbacks else [f1]

    self._model.fit_generator(generator=batcher,
                              epochs=epochs,
                              callbacks=callbacks,
                              verbose=verbose,
                              shuffle=shuffle)

    if x_valid and y_valid:
        self.best_model = f1.get_best_model()
        self.best_model_report = f1.get_best_model_report()
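# Usage sketch for train() above. The model object, sentences, and tags are
# hypothetical placeholders, not defined anywhere in these snippets:
#
#   x_train = [['EU', 'rejects', 'German', 'call'], ...]
#   y_train = [['B-ORG', 'O', 'B-MISC', 'O'], ...]
#   model.train(x_train, y_train, x_valid, y_valid,
#               epochs=5, batch_size=16, verbose=1)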
def get_batcher(self, data_loader):
    batcher = Batcher(data_loader,
                      height=self.args.height,
                      width=self.args.width,
                      device=torch.device(self.args.device),
                      binarize=self.args.binarize,
                      num_classes=10,
                      onehot=True)
    return batcher
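# The PPO routines below (ppo_update, step, train_on_data) drive a much
# simpler index Batcher through a shuffle()/end()/next_batch() loop. The
# class below is a minimal sketch of that assumed interface (first argument:
# indices per batch; second: a list of arrays sliced in parallel). It is an
# assumption for illustration, not the implementation these projects ship.
import numpy as np

class Batcher:
    """Minimal sketch: serves fixed-size slices of the given arrays."""

    def __init__(self, batch_size, data):
        self.batch_size = batch_size
        self.data = data  # list of arrays sliced in parallel
        self.num_entries = len(data[0])
        self.reset()

    def reset(self):
        self.batch_start = 0
        self.batch_end = self.batch_start + self.batch_size

    def end(self):
        return self.batch_start >= self.num_entries

    def next_batch(self):
        batch = [d[self.batch_start:self.batch_end] for d in self.data]
        self.batch_start = self.batch_end
        self.batch_end = min(self.batch_start + self.batch_size, self.num_entries)
        return batch

    def shuffle(self):
        indices = np.arange(self.num_entries)
        np.random.shuffle(indices)
        self.data = [d[indices] for d in self.data]
        self.reset()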
def ppo_update(args, policy, optimizer, processed_rollout):
    # Create batch
    states, actions, log_probs_old, returns, advantages = list(
        map(lambda x: torch.cat(x, dim=0), zip(*processed_rollout)))

    # Normalize advantages
    advantages = (advantages - advantages.mean()) / advantages.std()

    memory_size = int(states.shape[0])
    batcher = Batcher(memory_size // args.batch_size,
                      [np.arange(memory_size)])

    for _ in range(args.ppo_epochs):
        batcher.shuffle()
        while not batcher.end():
            b = batcher.next_batch()[0]
            b = torch.Tensor(b).long()

            _, log_probs, entropy_loss, values = policy(states[b], actions[b])

            ratio = (log_probs - log_probs_old[b]).exp()  # pnew / pold
            surr1 = ratio * advantages[b]
            # Clip the probability ratio (not the surrogate itself) before
            # weighting by the advantages, per the PPO clipped objective.
            surr2 = torch.clamp(ratio, 1.0 - args.ppo_clip,
                                1.0 + args.ppo_clip) * advantages[b]
            policy_surr = -torch.min(surr1, surr2).mean() - (
                entropy_loss.to(args.device) * args.entropy_coefficent).mean()

            value_loss = 0.5 * (returns[b] - values).pow(2.).mean()

            optimizer.zero_grad()
            (policy_surr + value_loss).backward()
            nn.utils.clip_grad_norm_(policy.parameters(), 5)
            optimizer.step()

    return optimizer, policy
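# Standalone sanity check of the clipped surrogate used in ppo_update above.
# The ratio/advantage values and the 0.2 clip are arbitrary illustrative
# numbers, not taken from any of these projects:
import torch

ratio = torch.tensor([0.5, 1.5])
advantages = torch.tensor([2.0, -1.0])
surr1 = ratio * advantages                          # [1.0, -1.5]
surr2 = torch.clamp(ratio, 0.8, 1.2) * advantages   # [1.6, -1.2]
loss = -torch.min(surr1, surr2).mean()
print(loss)  # tensor(0.2500): taking the element-wise min keeps the objective pessimistic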
def step(self):
    config = self.config
    rollout = []
    states = self.states

    for _ in range(config.rollout_length):
        actions, log_probs, _, values = self.actor_critic(states)
        env_info = self.env.step(
            actions.cpu().detach().numpy())[self.brain_name]
        next_states = env_info.vector_observations  # get the next state
        rewards = np.array(env_info.rewards)  # get the reward
        terminals = np.array(env_info.local_done)  # see if episode has finished
        self.online_rewards += rewards
        rewards = config.reward_normalizer(rewards)
        for i, terminal in enumerate(terminals):
            if terminals[i]:
                self.episode_rewards.append(self.online_rewards[i])
                self.online_rewards[i] = 0
        next_states = config.state_normalizer(next_states)
        rollout.append([
            states, values.detach(), actions.detach(), log_probs.detach(),
            rewards, 1 - terminals
        ])
        states = next_states

    self.states = states
    pending_value = self.actor_critic(states)[-1]
    rollout.append([states, pending_value, None, None, None, None])

    processed_rollout = [None] * (len(rollout) - 1)
    advantages = tensor(np.zeros((config.num_workers, 1)))
    returns = pending_value.detach()
    for i in reversed(range(len(rollout) - 1)):
        states, value, actions, log_probs, rewards, terminals = rollout[i]
        terminals = tensor(terminals).unsqueeze(1)
        rewards = tensor(rewards).unsqueeze(1)
        actions = tensor(actions)
        states = tensor(states)
        next_value = rollout[i + 1][1]
        returns = rewards + config.discount * terminals * returns
        if not config.use_gae:
            advantages = returns - value.detach()
        else:
            td_error = rewards + config.discount * terminals * next_value.detach() - value.detach()
            advantages = advantages * config.gae_tau * config.discount * terminals + td_error
        processed_rollout[i] = [states, actions, log_probs, returns, advantages]

    states, actions, log_probs_old, returns, advantages = map(
        lambda x: torch.cat(x, dim=0), zip(*processed_rollout))

    # Normalize advantages
    advantages = (advantages - advantages.mean()) / advantages.std()

    batcher = Batcher(
        states.size(0) // config.num_mini_batches,
        [np.arange(states.size(0))])
    for _ in range(config.optimization_epochs):
        batcher.shuffle()
        while not batcher.end():
            batch_indices = batcher.next_batch()[0]
            batch_indices = tensor(batch_indices).long()
            sampled_states = states[batch_indices]
            sampled_actions = actions[batch_indices]
            sampled_log_probs_old = log_probs_old[batch_indices]
            sampled_returns = returns[batch_indices]
            sampled_advantages = advantages[batch_indices]

            _, new_log_probs, entropy_loss, values = self.actor_critic(
                sampled_states, sampled_actions)

            # critic training
            value_loss = 0.5 * F.mse_loss(sampled_returns, values)
            self.opt_crt.zero_grad()
            value_loss.backward()
            self.opt_crt.step()

            # actor training
            ratio = (new_log_probs - sampled_log_probs_old).exp()
            obj = ratio * sampled_advantages
            obj_clipped = ratio.clamp(
                1.0 - self.config.ppo_ratio_clip,
                1.0 + self.config.ppo_ratio_clip) * sampled_advantages
            policy_loss = (-torch.min(obj, obj_clipped).mean(0)
                           - config.entropy_weight * entropy_loss.mean())
            self.opt_act.zero_grad()
            policy_loss.backward()
            self.opt_act.step()

            # self.opt.zero_grad()
            # (policy_loss + value_loss).backward()
            # nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
            #                          config.gradient_clip)
            # self.opt.step()

    steps = config.rollout_length * config.num_workers
    self.total_steps += steps
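# Standalone sketch of the GAE recursion that step() above implements in-line:
# A_t = delta_t + gamma * lam * (1 - done_t) * A_{t+1},
# delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t).
# The function name and the gamma/lam defaults are illustrative assumptions.
import numpy as np

def compute_gae(rewards, values, last_value, dones, gamma=0.99, lam=0.95):
    """Backward pass over one rollout, returning one advantage per step."""
    advantages = np.zeros_like(rewards, dtype=np.float64)
    adv = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        mask = 1.0 - dones[t]
        delta = rewards[t] + gamma * mask * next_value - values[t]
        adv = delta + gamma * lam * mask * adv
        advantages[t] = adv
        next_value = values[t]
    return advantages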
def train_on_data(self, data_batch: DataBatch,
                  step: int = 0,
                  writer: Optional[SummaryWriter] = None) -> Dict[str, float]:
    """
    Performs a single update step with PPO on the given batch of data.

    Args:
        data_batch: DataBatch, dictionary of collected experience tensors
        step: int, current training step, passed to the tensorboard writer
        writer: Optional SummaryWriter the metrics are logged to

    Returns:
        Dictionary of training metrics
    """
    metrics = {}
    timer = Timer()

    entropy_coeff = self.config["entropy_coeff"]

    agent = self.agent
    optimizer = self.optimizer

    agent_batch = data_batch

    ####################################### Unpack and prepare the data #######################################
    if self.config["use_gpu"]:
        agent_batch = batch_to_gpu(agent_batch)
        agent.cuda()

    # Initialize metrics
    kl_divergence = 0.
    ppo_step = -1
    value_loss = torch.tensor(0)
    policy_loss = torch.tensor(0)
    loss = torch.tensor(0)

    batcher = Batcher(agent_batch['dones'].size(0) // self.config["minibatches"],
                      [np.arange(agent_batch['dones'].size(0))])

    # Start a timer
    timer.checkpoint()

    for ppo_step in range(self.config["ppo_steps"]):
        batcher.shuffle()

        # for indices, agent_minibatch in minibatches(agent_batch, self.config["batch_size"], shuffle=True):
        while not batcher.end():
            batch_indices = batcher.next_batch()[0]
            batch_indices = torch.tensor(batch_indices).long()

            agent_minibatch = index_data(agent_batch, batch_indices)

            # Evaluate again after the PPO step, for new values and gradients
            logprob_batch, value_batch, entropy_batch = agent.evaluate_actions(agent_minibatch)

            advantages_batch = agent_minibatch['advantages']
            old_logprobs_minibatch = agent_minibatch['logprobs']  # logprobs of taken actions
            discounted_batch = agent_minibatch['rewards_to_go']

            ######################################### Compute the loss #############################################
            # Surrogate loss
            prob_ratio = torch.exp(logprob_batch - old_logprobs_minibatch)
            surr1 = prob_ratio * advantages_batch
            surr2 = prob_ratio.clamp(1. - self.eps, 1 + self.eps) * advantages_batch
            # surr2 = torch.where(advantages_batch > 0,
            #                     (1. + self.eps) * advantages_batch,
            #                     (1. - self.eps) * advantages_batch)

            policy_loss = -torch.min(surr1, surr2)
            value_loss = 0.5 * (value_batch - discounted_batch) ** 2

            loss = (torch.mean(policy_loss)
                    + (self.config["value_loss_coeff"] * torch.mean(value_loss))
                    - (entropy_coeff * torch.mean(entropy_batch)))

            ############################################# Update step ##############################################
            optimizer.zero_grad()
            loss.backward()
            if self.config["max_grad_norm"] is not None:
                nn.utils.clip_grad_norm_(agent.model.parameters(), self.config["max_grad_norm"])
            optimizer.step()

            # logprob_batch, value_batch, entropy_batch = agent.evaluate_actions(agent_batch)
            #
            # kl_divergence = torch.mean(old_logprobs_batch - logprob_batch).item()
            # if abs(kl_divergence) > self.config["target_kl"]:
            #     break

    agent.cpu()

    # Training-related metrics
    metrics[f"agent/time_update"] = timer.checkpoint()
    metrics[f"agent/kl_divergence"] = kl_divergence
    metrics[f"agent/ppo_steps_made"] = ppo_step + 1
    metrics[f"agent/policy_loss"] = torch.mean(policy_loss).cpu().item()
    metrics[f"agent/value_loss"] = torch.mean(value_loss).cpu().item()
    metrics[f"agent/total_loss"] = loss.detach().cpu().item()
    metrics[f"agent/rewards"] = agent_batch['rewards'].cpu().sum().item()
    metrics[f"agent/mean_std"] = agent.model.std.mean().item()

    # Other metrics
    # metrics[f"agent/mean_entropy"] = torch.mean(entropy_batch).item()

    # Write the metrics to tensorboard
    write_dict(metrics, step, writer)

    return metrics
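# index_data() is called in train_on_data above but not defined in these
# snippets. Below is a plausible sketch for a dict-of-tensors batch; it is an
# assumption about its behaviour, not the project's actual helper:
from typing import Dict
import torch

def index_data(batch: Dict[str, torch.Tensor], indices: torch.Tensor) -> Dict[str, torch.Tensor]:
    """Select the same rows from every tensor in the batch."""
    return {key: value[indices] for key, value in batch.items()}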