    def online_policy_update(self, board, legal_moves, logprob):
        # Re-evaluate the critic on the position reached after the move. The
        # difference between the previously stored state value and the new
        # prediction is used as the reward for this one-step policy update.
        new_value = self.model(config.make_variable([board]),
                               config.make_variable([legal_moves]))[1].data[0, 0]
        reward = self.state_values[-1].data[0, 0] - new_value
        loss = -logprob * reward

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        self.optimizer.step()
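
    # config.make_variable is defined in the project's config module and is not
    # shown in this listing. It is assumed to wrap a (nested) Python list into
    # a float Variable, optionally moved to the GPU -- roughly:
    #
    #     def make_variable(data):
    #         tensor = torch.FloatTensor(data)
    #         if CUDA:  # hypothetical config flag
    #             tensor = tensor.cuda()
    #         return Variable(tensor)
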
    def evaluate(self, board_sample, legal_moves_map):
        input = config.make_variable([board_sample])
        legal_moves_map = config.make_variable(legal_moves_map)
        probs, _ = self.model(input, legal_moves_map)

        # Sample a move from the policy distribution produced by the model.
        distribution = Categorical(probs)
        action = distribution.sample()

        # Convert the flat action index into (row, column) board coordinates.
        move = (action // config.BOARD_SIZE, action % config.BOARD_SIZE)
        log_prob = distribution.log_prob(action)
        if self.train:
            self.log_probs.append(log_prob)
        return move
    def update(self):
        # ---------------------- Sanity checks ----------------------- #
        if not self.train:
            return None

        if not (len(self.log_probs) == len(self.rewards) ==
                len(self.state_values)):
            raise PlayerException(
                "log_probs, rewards and state_values must all have the same "
                "length. Got %s - %s - %s" % (len(self.log_probs),
                                              len(self.rewards),
                                              len(self.state_values)))
        # ----------------------------------------------------------- #

        # Bootstrapping: derive per-move rewards from successive critic value estimates
        rewards = self.bootstrap_rewards()
        rewards = config.make_variable(rewards)
        # rewards = self.normalize_rewards(rewards)  # For now nothing to normalize, standard deviation = 0

        if self.online:
            # In online mode the policy has already been updated after every
            # move (see online_policy_update); the remaining loss comes from
            # calculate_online_loss.
            loss = calculate_online_loss(self.state_values, rewards)
        else:
            loss = calculate_loss(self.log_probs, self.state_values, rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        del self.rewards[:]
        del self.log_probs[:]
        del self.state_values[:]
        del self.board_samples[:]
        del self.legal_moves[:]

        return abs(loss.data)
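
    # calculate_online_loss and calculate_loss are module-level helpers that are
    # not part of this listing. A rough sketch of what they are assumed to
    # compute (in online mode the per-move policy updates already happen in
    # online_policy_update, so the online variant is assumed to only train the
    # critic):
    #
    #     def calculate_online_loss(state_values, rewards):
    #         # regress each stored critic value towards its bootstrapped target
    #         value_losses = [(value - reward) ** 2
    #                         for value, reward in zip(state_values, rewards)]
    #         return torch.stack(value_losses).sum() / len(value_losses)
    #
    #     def calculate_loss(log_probs, state_values, rewards):
    #         # actor-critic: advantage-weighted policy term plus critic term
    #         policy_losses = [-log_prob * (reward - value.detach())
    #                          for log_prob, value, reward
    #                          in zip(log_probs, state_values, rewards)]
    #         value_losses = [(value - reward) ** 2
    #                         for value, reward in zip(state_values, rewards)]
    #         return (torch.stack(policy_losses).sum()
    #                 + torch.stack(value_losses).sum()) / len(log_probs)
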
    def update(self):
        if not self.train:
            return None

        if len(self.log_probs) != len(self.rewards):
            raise abstract.PlayerException(
                "log_probs length must be equal to rewards length. Got %s - %s"
                % (len(self.log_probs), len(self.rewards)))

        rewards = self.discount_rewards(self.rewards, self.gamma)
        rewards = config.make_variable(rewards)
        # rewards = self.normalize_rewards(rewards)  # For now nothing to normalize, standard deviation = 0

        # REINFORCE: weight each move's log-probability by its discounted return.
        policy_losses = [-log_prob * reward
                         for log_prob, reward in zip(self.log_probs, rewards)]

        self.optimizer.zero_grad()
        policy_loss = torch.cat(policy_losses).sum() / len(policy_losses)
        policy_loss.backward()
        self.optimizer.step()

        del self.rewards[:]
        del self.log_probs[:]

        return abs(policy_loss.data)
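
    # NOTE: illustrative sketch only -- the project's actual discount_rewards
    # helper is defined elsewhere. It is assumed to compute the standard
    # discounted return G_t = r_t + gamma * G_{t+1} for every move of one
    # finished episode:
    def discount_rewards(self, rewards, gamma):
        discounted = []
        running_return = 0.0
        for r in reversed(rewards):
            running_return = r + gamma * running_return
            discounted.insert(0, running_return)
        return discounted
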
    def update(self):
        # ---------------------- Sanity checks ----------------------- #
        if not self.train:
            return None

        if not (len(self.log_probs) == len(self.rewards) ==
                len(self.state_values)):
            raise PlayerException(
                "log_probs, rewards and state_values must all have the same "
                "length. Got %s - %s - %s" % (len(self.log_probs),
                                              len(self.rewards),
                                              len(self.state_values)))

        # ----------------------------------------------------------- #

        rewards = self.discount_rewards(self.rewards, self.gamma)  # discounted returns
        rewards = self.rewards_baseline(rewards)  # subtract a baseline to reduce variance
        rewards = config.make_variable(rewards)

        loss = self.calculate_loss(self.log_probs, self.state_values, rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        del self.rewards[:]
        del self.log_probs[:]
        del self.state_values[:]
        del self.board_samples[:]
        del self.legal_moves[:]

        return abs(loss.data)
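
    # NOTE: illustrative sketch only -- the project's actual rewards_baseline
    # helper is defined elsewhere. It is assumed to subtract a baseline (here
    # simply the mean return of the episode) to reduce the variance of the
    # policy-gradient estimate:
    def rewards_baseline(self, rewards):
        baseline = sum(rewards) / len(rewards)
        return [reward - baseline for reward in rewards]
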
    def bootstrap_rewards(self):
        # TODO: Catch illegal use of this method
        pred_values = [
            self.model(config.make_variable([self.board_samples[i]]),
                       config.make_variable([self.legal_moves[i]]))[1].data[0, 0]
            for i in range(len(self.board_samples))
        ]
        # The final prediction is replaced by the true terminal reward.
        pred_values[-1] = self.rewards[-1]

        # Each move's reward is the change in predicted value between
        # consecutive states; the final move receives the terminal reward itself.
        rewards = [
            pred_values[i + 1] - pred_values[i]
            for i in range(len(pred_values) - 1)
        ]
        rewards.append(self.rewards[-1])

        return rewards
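
    # Worked example with hypothetical numbers: for critic predictions
    # [0.1, 0.4, 0.7] and a terminal reward of 1.0, pred_values becomes
    # [0.1, 0.4, 1.0], so the bootstrapped rewards are
    # [0.4 - 0.1, 1.0 - 0.4, 1.0] = [0.3, 0.6, 1.0].
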
    def evaluate(self, board_sample, legal_moves_map):
        input = config.make_variable([board_sample])

        probs, state_value = self.model(input,
                                        config.make_variable(legal_moves_map))
        distribution = Categorical(probs)
        action = distribution.sample()
        log_prob = distribution.log_prob(action)

        move = (action // config.BOARD_SIZE, action % config.BOARD_SIZE)
        if self.train:
            # In online mode, run a policy update against the previously stored
            # state value before recording the current one.
            if self.online and self.state_values:
                self.online_policy_update(board_sample, legal_moves_map,
                                          log_prob)

            self.log_probs.append(log_prob)
            self.state_values.append(state_value[0])
            self.board_samples.append(board_sample)
            self.legal_moves.append(legal_moves_map)
        return move
    def evaluate(self, board_sample, legal_moves_map):
        input = config.make_variable([board_sample])

        probs, state_value = self.model(input,
                                        config.make_variable(legal_moves_map))

        try:
            distribution = Categorical(probs)
            action = distribution.sample()
            move = (action // config.BOARD_SIZE, action % config.BOARD_SIZE)
        except RuntimeError:
            # Categorical raises a RuntimeError when the probability vector is
            # invalid (e.g. all zeros). Log the offending inputs and re-raise
            # instead of continuing with undefined variables.
            print("Probs: \n%s \nBoard: \n%s \nLegal moves: \n%s" %
                  (probs, board_sample, legal_moves_map))
            raise

        if self.train:
            self.log_probs.append(distribution.log_prob(action))
            self.state_values.append(state_value[0][0])
            self.board_samples.append(board_sample)
            self.legal_moves.append(legal_moves_map)
        return move
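
    # Typical call pattern (hypothetical driver code; the environment API shown
    # here is assumed, not part of the project):
    #
    #     for board, legal_moves_map in episode:
    #         move = player.evaluate(board, legal_moves_map)
    #         reward = environment.apply(move)   # usually 0 until the game ends
    #         player.rewards.append(reward)
    #     loss = player.update()                 # one training step per episode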