def ppo_update(args, policy, optimizer, processed_rollout):
    # Create batch
    states, actions, log_probs_old, returns, advantages = list(
        map(lambda x: torch.cat(x, dim=0), zip(*processed_rollout)))
    # Normalize advantages
    advantages = (advantages - advantages.mean()) / advantages.std()

    memory_size = int(states.shape[0])
    batcher = Batcher(memory_size // args.batch_size, [np.arange(memory_size)])
    for _ in range(args.ppo_epochs):
        batcher.shuffle()
        while not batcher.end():
            b = batcher.next_batch()[0]
            b = torch.Tensor(b).long()

            _, log_probs, entropy_loss, values = policy(states[b], actions[b])
            ratio = (log_probs - log_probs_old[b]).exp()  # pnew / pold
            surr1 = ratio * advantages[b]
            surr2 = torch.clamp(ratio, 1.0 - args.ppo_clip,
                                1.0 + args.ppo_clip) * advantages[b]  # clipped surrogate
            policy_surr = -torch.min(surr1, surr2).mean() - (
                entropy_loss.to(args.device) * args.entropy_coefficent).mean()

            value_loss = 0.5 * (returns[b] - values).pow(2.).mean()
            optimizer.zero_grad()
            (policy_surr + value_loss).backward()
            nn.utils.clip_grad_norm_(policy.parameters(), 5)
            optimizer.step()

    return optimizer, policy
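
None of these snippets define the Batcher they iterate with. Judging only from the call pattern they share (a minibatch size plus a list of index arrays, then shuffle() / end() / next_batch()), a minimal stand-in might look like the sketch below; it is reconstructed from usage, not the original class.

import numpy as np

class Batcher:
    """Minimal stand-in consistent with how the snippets use it:
    Batcher(batch_size, [index_array]) -> shuffle() / end() / next_batch()."""

    def __init__(self, batch_size, data):
        self.batch_size = max(1, batch_size)
        self.data = data              # list of parallel arrays (here: one index array)
        self.num_entries = len(data[0])
        self.reset()

    def reset(self):
        self.batch_start = 0

    def shuffle(self):
        # Permute every array with the same permutation, then restart iteration.
        perm = np.random.permutation(self.num_entries)
        self.data = [d[perm] for d in self.data]
        self.reset()

    def end(self):
        return self.batch_start >= self.num_entries

    def next_batch(self):
        # Return the next slice of each array; the callers above take element [0].
        end = min(self.batch_start + self.batch_size, self.num_entries)
        batch = [d[self.batch_start:end] for d in self.data]
        self.batch_start = end
        return batch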
Example 2
    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              epochs=1,
              batch_size=32,
              verbose=1,
              callbacks=None,
              shuffle=True):
        """Trains the model for a fixed number of epochs (iterations on a dataset).
        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            epochs: Integer. Number of epochs to train the model.
            batch_size: Integer.
                Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """

        batcher = Batcher(x_train, y_train, batch_size,
                          self._preprocessor.transform)

        if x_valid and y_valid:
            valid_seq = Batcher(x_valid, y_valid, batch_size,
                                self._preprocessor.transform)
            f1 = F1score(valid_seq, preprocessor=self._preprocessor)
            callbacks = [f1] + callbacks if callbacks else [f1]

        self._model.fit_generator(generator=batcher,
                                  epochs=epochs,
                                  callbacks=callbacks,
                                  verbose=verbose,
                                  shuffle=shuffle)

        if x_valid and y_valid:
            self.best_model = f1.get_best_model()
            self.best_model_report = f1.get_best_model_report()
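
A hedged usage sketch for the train() API above. The Trainer constructor and the tiny token/tag lists are placeholders added for illustration; only the keyword arguments come from the signature and docstring.

# Placeholder data and trainer instance; only the train(...) call mirrors the API above.
x_train = [["John", "lives", "in", "Berlin"], ["Acme", "Corp", "hired", "Ann"]]
y_train = [["B-PER", "O", "O", "B-LOC"], ["B-ORG", "I-ORG", "O", "B-PER"]]
x_valid = [["Ann", "visited", "Paris"]]
y_valid = [["B-PER", "O", "B-LOC"]]

trainer = Trainer(model, preprocessor)  # hypothetical constructor for the class shown above
trainer.train(x_train, y_train,
              x_valid=x_valid, y_valid=y_valid,
              epochs=5, batch_size=32, verbose=1, shuffle=True)

# best_model / best_model_report are populated only when validation data is supplied.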
Example 3
    def get_batcher(self, data_loader):
        batcher = Batcher(data_loader,
                          height=self.args.height,
                          width=self.args.width,
                          device=torch.device(self.args.device),
                          binarize=self.args.binarize,
                          num_classes=10,
                          onehot=True)
        return batcher
Example 4
    def step(self):
        config = self.config
        rollout = []
        states = self.states
        for _ in range(config.rollout_length):
            actions, log_probs, _, values = self.actor_critic(states)

            env_info = self.env.step(
                actions.cpu().detach().numpy())[self.brain_name]
            next_states = env_info.vector_observations  # get the next state
            rewards = np.array(env_info.rewards)  # get the reward
            terminals = np.array(
                env_info.local_done)  # see if episode has finished

            self.online_rewards += rewards
            rewards = config.reward_normalizer(rewards)
            for i, terminal in enumerate(terminals):
                if terminal:
                    self.episode_rewards.append(self.online_rewards[i])
                    self.online_rewards[i] = 0

            next_states = config.state_normalizer(next_states)
            rollout.append([
                states,
                values.detach(),
                actions.detach(),
                log_probs.detach(), rewards, 1 - terminals
            ])
            states = next_states

        self.states = states
        pending_value = self.actor_critic(states)[-1]
        rollout.append([states, pending_value, None, None, None, None])

        processed_rollout = [None] * (len(rollout) - 1)
        advantages = tensor(np.zeros((config.num_workers, 1)))
        returns = pending_value.detach()

        for i in reversed(range(len(rollout) - 1)):
            states, value, actions, log_probs, rewards, terminals = rollout[i]
            terminals = tensor(terminals).unsqueeze(1)
            rewards = tensor(rewards).unsqueeze(1)
            actions = tensor(actions)
            states = tensor(states)
            next_value = rollout[i + 1][1]
            returns = rewards + config.discount * terminals * returns
            if not config.use_gae:
                advantages = returns - value.detach()
            else:
                td_error = (rewards + config.discount * terminals * next_value.detach()
                            - value.detach())
                advantages = advantages * config.gae_tau * config.discount * terminals + td_error
            processed_rollout[i] = [
                states, actions, log_probs, returns, advantages
            ]

        states, actions, log_probs_old, returns, advantages = map(
            lambda x: torch.cat(x, dim=0), zip(*processed_rollout))

        # Normalize advantages
        advantages = (advantages - advantages.mean()) / advantages.std()

        batcher = Batcher(
            states.size(0) // config.num_mini_batches,
            [np.arange(states.size(0))])
        for _ in range(config.optimization_epochs):
            batcher.shuffle()
            while not batcher.end():
                batch_indices = batcher.next_batch()[0]
                batch_indices = tensor(batch_indices).long()
                sampled_states = states[batch_indices]
                sampled_actions = actions[batch_indices]
                sampled_log_probs_old = log_probs_old[batch_indices]
                sampled_returns = returns[batch_indices]
                sampled_advantages = advantages[batch_indices]

                _, new_log_probs, entropy_loss, values = self.actor_critic(
                    sampled_states, sampled_actions)

                # critic training
                value_loss = 0.5 * F.mse_loss(sampled_returns, values)

                self.opt_crt.zero_grad()
                value_loss.backward()
                self.opt_crt.step()

                # actor training
                ratio = (new_log_probs - sampled_log_probs_old).exp()
                obj = ratio * sampled_advantages
                obj_clipped = ratio.clamp(
                    1.0 - self.config.ppo_ratio_clip,
                    1.0 + self.config.ppo_ratio_clip) * sampled_advantages

                policy_loss = (-torch.min(obj, obj_clipped).mean(0)
                               - config.entropy_weight * entropy_loss.mean())

                self.opt_act.zero_grad()
                policy_loss.backward()
                self.opt_act.step()
                '''
                self.opt.zero_grad()
                (policy_loss + value_loss).backward()
                nn.utils.clip_grad_norm_(self.actor_critic.parameters(), config.gradient_clip)
                self.opt.step()
                '''

        steps = config.rollout_length * config.num_workers
        self.total_steps += steps
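
The backward loop above is generalized advantage estimation (GAE). As a self-contained restatement of that recursion (a sketch with illustrative discount/gae_tau values and made-up inputs, not code from the repository):

import numpy as np

def gae(rewards, values, next_value, masks, discount=0.99, gae_tau=0.95):
    """Backward recursion matching the loop in step():
    delta_t = r_t + discount * mask_t * V(s_{t+1}) - V(s_t)
    adv_t   = delta_t + discount * gae_tau * mask_t * adv_{t+1}
    ret_t   = r_t + discount * mask_t * ret_{t+1}, bootstrapped from next_value."""
    T = len(rewards)
    advantages, returns = np.zeros(T), np.zeros(T)
    adv, ret = 0.0, next_value
    for t in reversed(range(T)):
        next_v = values[t + 1] if t + 1 < T else next_value
        delta = rewards[t] + discount * masks[t] * next_v - values[t]
        adv = delta + discount * gae_tau * masks[t] * adv
        ret = rewards[t] + discount * masks[t] * ret
        advantages[t], returns[t] = adv, ret
    return advantages, returns

# Tiny example with made-up numbers; mask = 1 - terminal, so the third step ends an episode.
adv, ret = gae(rewards=[1.0, 0.0, 1.0],
               values=[0.5, 0.4, 0.3],
               next_value=0.2,
               masks=[1, 1, 0])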
    def train_on_data(self, data_batch: DataBatch,
                      step: int = 0,
                      writer: Optional[SummaryWriter] = None) -> Dict[str, float]:
        """
        Performs a single update step with PPO on the given batch of data.

        Args:
            data_batch: DataBatch, dictionary
            step:
            writer:

        Returns:

        """
        metrics = {}
        timer = Timer()

        entropy_coeff = self.config["entropy_coeff"]

        agent = self.agent
        optimizer = self.optimizer

        agent_batch = data_batch

        ####################################### Unpack and prepare the data #######################################

        if self.config["use_gpu"]:
            agent_batch = batch_to_gpu(agent_batch)
            agent.cuda()

        # Initialize metrics
        kl_divergence = 0.
        ppo_step = -1
        value_loss = torch.tensor(0)
        policy_loss = torch.tensor(0)
        loss = torch.tensor(0)

        batcher = Batcher(agent_batch['dones'].size(0) // self.config["minibatches"],
                          [np.arange(agent_batch['dones'].size(0))])

        # Start a timer
        timer.checkpoint()

        for ppo_step in range(self.config["ppo_steps"]):
            batcher.shuffle()

            # for indices, agent_minibatch in minibatches(agent_batch, self.config["batch_size"], shuffle=True):
            while not batcher.end():
                batch_indices = batcher.next_batch()[0]
                batch_indices = torch.tensor(batch_indices).long()

                agent_minibatch = index_data(agent_batch, batch_indices)
                # Evaluate again after the PPO step, for new values and gradients
                logprob_batch, value_batch, entropy_batch = agent.evaluate_actions(agent_minibatch)

                advantages_batch = agent_minibatch['advantages']
                old_logprobs_minibatch = agent_minibatch['logprobs']  # logprobs of taken actions
                discounted_batch = agent_minibatch['rewards_to_go']

                ######################################### Compute the loss #############################################
                # Surrogate loss
                prob_ratio = torch.exp(logprob_batch - old_logprobs_minibatch)
                surr1 = prob_ratio * advantages_batch
                surr2 = prob_ratio.clamp(1. - self.eps, 1 + self.eps) * advantages_batch
                # surr2 = torch.where(advantages_batch > 0,
                #                     (1. + self.eps) * advantages_batch,
                #                     (1. - self.eps) * advantages_batch)

                policy_loss = -torch.min(surr1, surr2)
                value_loss = 0.5 * (value_batch - discounted_batch) ** 2
                loss = (torch.mean(policy_loss)
                        + (self.config["value_loss_coeff"] * torch.mean(value_loss))
                        - (entropy_coeff * torch.mean(entropy_batch)))

                ############################################# Update step ##############################################
                optimizer.zero_grad()
                loss.backward()
                if self.config["max_grad_norm"] is not None:
                    nn.utils.clip_grad_norm_(agent.model.parameters(), self.config["max_grad_norm"])
                optimizer.step()

            # logprob_batch, value_batch, entropy_batch = agent.evaluate_actions(agent_batch)
            #
            # kl_divergence = torch.mean(old_logprobs_batch - logprob_batch).item()
            # if abs(kl_divergence) > self.config["target_kl"]:
            #     break

        agent.cpu()

        # Training-related metrics
        metrics[f"agent/time_update"] = timer.checkpoint()
        metrics[f"agent/kl_divergence"] = kl_divergence
        metrics[f"agent/ppo_steps_made"] = ppo_step + 1
        metrics[f"agent/policy_loss"] = torch.mean(policy_loss).cpu().item()
        metrics[f"agent/value_loss"] = torch.mean(value_loss).cpu().item()
        metrics[f"agent/total_loss"] = loss.detach().cpu().item()
        metrics[f"agent/rewards"] = agent_batch['rewards'].cpu().sum().item()
        metrics[f"agent/mean_std"] = agent.model.std.mean().item()

        # Other metrics
        # metrics[f"agent/mean_entropy"] = torch.mean(entropy_batch).item()

        # Write the metrics to tensorboard
        write_dict(metrics, step, writer)

        return metrics
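
For orientation, these are the configuration keys train_on_data reads, gathered in one place; the values shown are illustrative defaults, not settings from the original project.

# Keys consumed by train_on_data above; the values are placeholders.
config = {
    "entropy_coeff": 0.01,      # weight of the entropy bonus in the loss
    "use_gpu": False,           # move the batch and agent to CUDA before the update
    "minibatches": 4,           # how many minibatches the Batcher splits the data into
    "ppo_steps": 10,            # optimization passes over the collected batch
    "value_loss_coeff": 0.5,    # weight of the value (critic) loss
    "max_grad_norm": 0.5,       # gradient clipping threshold, or None to disable
    # "target_kl": 0.01,        # only needed if the commented-out KL early stopping is re-enabled
}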