def _compute_log_posterior_odds_ratio(self, k, n):
    """
    Compute the log-posterior odds ratio of assigning a new example, described by k and n, to class 1 versus class 0:

    .. math::

        F(k, n) = log \\, p(c=1|k,n) - log \\, p(c=0|k,n) = log (N_1 + 1) - log(N_0 + 1)
        + log \\, B(\\alpha_0, \\beta_0) - log \\, B(\\alpha_1, \\beta_1)
        + log \\, B(k + \\alpha_1, n - k + \\beta_1) - log \\, B(k + \\alpha_0, n - k + \\beta_0)

    The class sizes (N_0, N_1) and the beta distribution parameters of both classes are read from the instance.

    Arguments:

        k: number of disease-associated sequences

        n: total number of sequences

    Returns:

        log-posterior odds ratio for class assignment

    """
    # The terms are accumulated in the same order as they appear in the formula above.
    # Contribution of the class sizes.
    log_odds = np.log(self.N_1 + 1) - np.log(self.N_0 + 1)

    # Normalizing beta functions built from the class parameters alone.
    log_odds = log_odds + beta_func_ln(self.alpha_0, self.beta_0)
    log_odds = log_odds - beta_func_ln(self.alpha_1, self.beta_1)

    # Beta functions of the parameters shifted by the observed counts of the example.
    log_odds = log_odds + beta_func_ln(k + self.alpha_1, n - k + self.beta_1)
    log_odds = log_odds - beta_func_ln(k + self.alpha_0, n - k + self.beta_0)

    return log_odds
def _find_beta_distribution_parameters(self, X, N_l: int) -> Tuple[float, float]:
    """
    Function implementing gradient ascent to find parameters of the beta distribution for the given class.
    It maximizes the following log-likelihood:

    .. math::

        l_l (\\alpha, \\beta) = - N_l \\, log \\, B (\\alpha, \\beta)
        + \\sum_{i: c_i = l} log \\, B(k_i + \\alpha, n_i - k_i + \\beta), l = 0, 1

    In every iteration the log-likelihood is evaluated first; the search stops early as soon as it exceeds
    self.likelihood_threshold, and otherwise runs for at most self.max_iterations iterations.

    Arguments:

        X: design matrix of shape [number of examples x number of features], where number of features is 2
            (the first feature is the number of disease-associated sequences and the second is the total number
            of sequences per example)

        N_l: number of examples in the given class

    Returns:

        estimated values of alpha and beta for the given class

    Raises:

        RuntimeError: if the log-likelihood becomes nan during the optimization

    """
    k_is, n_is = X[:, 0], X[:, 1]

    # The starting point is derived from the raw counts; smoothing is applied only afterwards.
    alpha, beta = self._initialize_beta_distribution_parameters(k_is, n_is)
    k_is, n_is = self._perform_laplace_smoothing(k_is, n_is)

    smallest_allowed = ProbabilisticBinaryClassifier.SMALL_POSITIVE_NUMBER

    for iteration in range(self.max_iterations):

        # Both terms use the log of the beta function, as in the documented objective
        # (the first term previously used B(alpha, beta) itself instead of log B(alpha, beta)).
        log_likelihood = -N_l * beta_func_ln(alpha, beta) + np.sum(
            beta_func_ln(k_is + alpha, n_is - k_is + beta))

        if np.isnan(log_likelihood):
            raise RuntimeError(
                f"ProbabilisticBinaryClassifier: while estimating beta distribution parameters, "
                f"log_likelihood became nan in iteration {iteration}. \nalpha: {alpha}, beta: {beta}"
            )

        if log_likelihood > self.likelihood_threshold:
            break

        grad_alpha, grad_beta = self._compute_alpha_beta_gradients(N_l, alpha, beta, k_is, n_is)

        # Ascent step; the lower bound keeps both parameters strictly positive.
        alpha = max(alpha + self.update_rate * grad_alpha, smallest_allowed)
        beta = max(beta + self.update_rate * grad_beta, smallest_allowed)

    return alpha, beta