    def _input_grad(self) -> ndarray:
        '''
        Gradient of the softmax cross entropy loss with respect to its input:
        simply `softmax_preds - target`, averaged over the batch in the
        multi-class case.
        '''
        # if "single_class", "un-normalize" probabilities before returning gradient:
        if self.single_class:
            return unnormalize(self.softmax_preds - self.target)
        else:
            return (self.softmax_preds - self.target) / self.prediction.shape[0]
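
    # NOTE: `unnormalize` (and its counterpart `normalize`) is assumed to be
    # defined elsewhere in this module; its implementation is not shown here.
    # A minimal sketch of the assumed behavior, inferred from how it is used
    # (hypothetical, not the confirmed implementation): `normalize` expands a
    # single probability column p into the two columns [p, 1 - p] so that
    # softmax cross entropy can be applied, and `unnormalize` reverses this by
    # keeping only the first column, so the returned gradient matches the
    # shape of the original single-column input:
    #
    #     def normalize(a: ndarray) -> ndarray:
    #         return np.concatenate([a, 1 - a], axis=1)
    #
    #     def unnormalize(a: ndarray) -> ndarray:
    #         return a[:, 0:1]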

    def _input_grad(self) -> ndarray:
        '''
        Alternative implementation of the same gradient: for each observation,
        explicitly build the Jacobian of the loss components with respect to
        each input, then sum over the loss components.
        '''
        prob_grads = []
        batch_size = self.softmax_preds.shape[0]
        num_features = self.softmax_preds.shape[1]
        for n in range(batch_size):
            # pairwise ratios exp(x_i) / exp(x_j) of this observation's
            # inputs, shifted by their max for numerical stability
            exp_ratio = exp_ratios(self.prediction[n] - np.max(self.prediction[n]))
            jacobian = np.zeros((num_features, num_features))
            for f1 in range(num_features):  # index of the input component
                for f2 in range(num_features):  # index of the loss component
                    if f1 == f2:
                        jacobian[f1][f2] = (
                            self.softmax_preds[n][f1] - self.target[n][f1])
                    else:
                        jacobian[f1][f2] = (
                            -(self.target[n][f2] - 1) * exp_ratio[f1][f2]
                            + self.target[n][f2]
                            + self.softmax_preds[n][f1] - 1)
            # the gradient for each input is the sum over all loss components
            prob_grads.append(jacobian.sum(axis=1))

        # as in the implementation above, "un-normalize" in the "single_class" case:
        if self.single_class:
            return unnormalize(np.stack(prob_grads))
        else:
            return np.stack(prob_grads)
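

# --- Illustrative sanity check (not part of the class above) ---
# A minimal, self-contained sketch verifying that the simple multi-class
# gradient, (softmax(x) - y) / batch_size, matches a numerical
# finite-difference gradient of the softmax cross entropy loss. The helpers
# `_softmax`, `_sce_loss`, and `_exp_ratios_sketch` below are hypothetical,
# introduced only for this check; in particular, `exp_ratios` as used above
# is assumed (not confirmed) to compute pairwise ratios exp(x_i) / exp(x_j).
import numpy as np


def _softmax(x: np.ndarray) -> np.ndarray:
    # shift by the row max for numerical stability
    e = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)


def _sce_loss(x: np.ndarray, y: np.ndarray, eps: float = 1e-9) -> float:
    # mean softmax cross entropy over the batch
    p = np.clip(_softmax(x), eps, 1 - eps)
    return float(-(y * np.log(p)).sum() / x.shape[0])


def _exp_ratios_sketch(x: np.ndarray) -> np.ndarray:
    # assumed behavior of the `exp_ratios` helper: the matrix whose
    # (i, j) entry is exp(x_i) / exp(x_j)
    e = np.exp(x)
    return e[:, None] / e[None, :]


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x = rng.normal(size=(4, 3))
    y = np.eye(3)[rng.integers(0, 3, size=4)]  # one-hot targets

    analytic = (_softmax(x) - y) / x.shape[0]

    # central finite differences, one input component at a time
    numeric = np.zeros_like(x)
    h = 1e-5
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            x_plus, x_minus = x.copy(), x.copy()
            x_plus[i, j] += h
            x_minus[i, j] -= h
            numeric[i, j] = (_sce_loss(x_plus, y) - _sce_loss(x_minus, y)) / (2 * h)

    assert np.allclose(analytic, numeric, atol=1e-6)
    print("analytic and numerical gradients agree")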