def test_ll():
    """Tests that the log-likelihood for generalized linear models is
    correctly calculated."""
    # identity
    y_true = np.array([1, 2, 3])
    y_pred = np.array([np.e + 1, np.e + 2, np.e + 3])
    ll = log_likelihood_glm('normal', y_true, y_pred)
    assert_almost_equal(ll, -4.5)

    # poisson
    y_true = np.array([1 / np.log(2.), 1 / np.log(3.), 1 / np.log(4.)])
    y_pred = np.array([2., 3., 4.])
    ll = log_likelihood_glm('poisson', y_true, y_pred)
    assert_almost_equal(ll, -2)

    # poisson with all zeros
    y_true = np.zeros(3)
    y_pred = np.zeros(3)
    ll = log_likelihood_glm('poisson', y_true, y_pred)
    assert_equal(ll, 0.)

    # poisson where the true values are not all zero but the predictions are
    y_pred = np.zeros(3)
    y_true = np.array([0., 0., 1.])
    ll = log_likelihood_glm('poisson', y_true, y_pred)
    assert_equal(ll, -np.inf)
def empirical_bayes(X, y, y_pred, ssq_hat, beta):
    n, p = X.shape
    beta = beta.ravel()
    y = y.ravel()
    support = (beta != 0)

    # The paper provides a closed-form expression using the conditional
    # marginal likelihood criterion.
    q = np.count_nonzero(beta)
    ll = log_likelihood_glm('normal', y, y_pred)

    if q > 0:
        Tgamma = beta[support].T @ X[:, support].T @ X[:, support] \
            @ beta[support] / ssq_hat
        R = -2 * (xlogy(p - q, p - q) + xlogy(q, q))
        if np.divide(Tgamma, q) > 1:
            B = q + q * np.log(Tgamma) - xlogy(q, q)
            CCML = -2 * ll + B + R
        else:
            B = Tgamma
            CCML = -2 * ll + Tgamma + R
        return CCML, B, R
    else:
        # Do not give the opportunity to select a support with 0 coefficients
        return np.inf, 0, 0
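# Illustrative usage sketch (not part of the original module): evaluates the
# conditional marginal likelihood criterion for a candidate coefficient
# vector on made-up data. ssq_hat is treated here as a plug-in estimate of
# the residual variance; numpy, scipy.special.xlogy, and log_likelihood_glm
# are assumed to be imported at module level, as in the function above.
def _example_empirical_bayes():
    import numpy as np
    rng = np.random.RandomState(0)
    X = rng.normal(size=(50, 8))
    beta = np.zeros(8)
    beta[:2] = [1.5, -2.0]
    y = X @ beta + rng.normal(scale=0.5, size=50)
    y_pred = X @ beta
    ssq_hat = np.var(y - y_pred)
    CCML, B, R = empirical_bayes(X, y, y_pred, ssq_hat, beta)
    return CCML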
def GIC(y, y_pred, model_size, penalty):
    y = y.ravel()
    y_pred = y_pred.ravel()
    ll = log_likelihood_glm('normal', y, y_pred)
    return -2 * ll + penalty * model_size
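# Illustrative usage sketch (not part of the original module): evaluates the
# generalized information criterion on made-up data. Setting
# penalty = np.log(n_samples) recovers the usual BIC-style penalty on model
# size. Assumes numpy and log_likelihood_glm are available at module level,
# as in the functions above.
def _example_gic():
    import numpy as np
    rng = np.random.RandomState(0)
    y = rng.normal(size=50)
    y_pred = y + rng.normal(scale=0.1, size=50)
    return GIC(y, y_pred, model_size=5, penalty=np.log(y.size))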
def score_predictions(self, metric, fitter, X, y, support):
    """Score, according to some metric, predictions provided by a model.

    The resulting score will be negated if an information criterion is
    specified.

    Parameters
    ----------
    metric : string
        The type of score to run on the prediction. Valid options include
        'log' (log-likelihood), 'BIC' (Bayesian information criterion),
        'AIC' (Akaike information criterion), and 'AICc' (corrected AIC).
    fitter : Poisson object
        The Poisson object that has been fit to the data with the
        respective hyperparameters.
    X : nd-array
        The design matrix.
    y : nd-array
        The response vector.
    support : array-like
        The indices of the non-zero features.

    Returns
    -------
    score : float
        The score.
    """
    # for Poisson, use predict_mean to calculate the "predicted" values
    y_pred = fitter.predict_mean(X[:, support])
    # calculate the log-likelihood
    ll = utils.log_likelihood_glm(model='poisson', y_true=y, y_pred=y_pred)
    if metric == 'log':
        score = ll
    # information criteria
    else:
        n_features = np.count_nonzero(support)
        if fitter.intercept_ != 0:
            n_features += 1
        n_samples = y.size
        if metric == 'BIC':
            score = utils.BIC(ll, n_features, n_samples)
        elif metric == 'AIC':
            score = utils.AIC(ll, n_features)
        elif metric == 'AICc':
            score = utils.AICc(ll, n_features, n_samples)
        else:
            raise ValueError(metric + ' is not a valid metric.')
        # negate the score since a lower information criterion is preferable
        score = -score
    return score
def test_LinearRegressor_scoring_defaults():
    """Tests that the correct default train/test data are being used for
    scoring estimates in UoIAbstractLinearRegressor. Further tests that the
    scoring itself is being done correctly."""
    seed = 5
    X, y = make_regression(n_samples=100, n_features=10, n_informative=10,
                           random_state=seed)
    train_idxs, test_idxs = train_test_split(np.arange(X.shape[0]),
                                             test_size=0.1,
                                             random_state=seed)
    X_train = X[train_idxs]
    y_train = y[train_idxs]
    X_test = X[test_idxs]
    y_test = y[test_idxs]

    fitter = LinearRegression().fit(X_train, y_train)
    support = np.ones(X.shape[1]).astype(bool)

    # r2 - must use test data
    uoi = UoI_Lasso(estimation_score='r2')
    assert (uoi._estimation_target == 1)
    score = uoi._score_predictions('r2', fitter, X, y, support,
                                   (train_idxs, test_idxs))
    assert_equal(r2_score(y_test, fitter.predict(X_test)), score)

    ll = log_likelihood_glm('normal', y_train,
                            fitter.predict(X_train[:, support]))

    # BIC - must use train data
    uoi = UoI_Lasso(estimation_score='BIC')
    assert (uoi._estimation_target == 0)
    score = -1 * uoi._score_predictions('BIC', fitter, X, y, support,
                                        (train_idxs, test_idxs))
    assert_equal(BIC(ll, *X_train.T.shape), score)

    # AIC - must use train data
    uoi = UoI_Lasso(estimation_score='AIC')
    assert (uoi._estimation_target == 0)
    score = -1 * uoi._score_predictions('AIC', fitter, X, y, support,
                                        (train_idxs, test_idxs))
    assert_equal(AIC(ll, X_train.shape[1]), score)

    # AICc - must use train data
    uoi = UoI_Lasso(estimation_score='AICc')
    assert (uoi._estimation_target == 0)
    score = -1 * uoi._score_predictions('AICc', fitter, X, y, support,
                                        (train_idxs, test_idxs))
    assert_equal(AICc(ll, *X_train.T.shape), score)
def score_predictions(metric, fitter, X, y, support):
    """Score, according to some metric, predictions provided by a model.

    The resulting score will be negated if an information criterion is
    specified.

    Parameters
    ----------
    metric : string
        The type of score to run on the prediction. Valid options include
        'r2' (explained variance), 'BIC' (Bayesian information criterion),
        'AIC' (Akaike information criterion), and 'AICc' (corrected AIC).
    fitter : object
        The fitted model used to generate the predictions. Must contain a
        .predict method.
    X : array-like
        The design matrix.
    y : array-like
        The true response variables.
    support : array-like
        The value of the support (non-zero features) for the model used to
        generate the predictions.

    Returns
    -------
    score : float
        The score.
    """
    y_pred = fitter.predict(X[:, support])
    if metric == 'r2':
        score = r2_score(y, y_pred)
    else:
        ll = utils.log_likelihood_glm(model='normal', y_true=y,
                                      y_pred=y_pred)
        n_features = np.count_nonzero(support)
        n_samples = y.size
        if metric == 'BIC':
            score = utils.BIC(ll, n_features, n_samples)
        elif metric == 'AIC':
            score = utils.AIC(ll, n_features)
        elif metric == 'AICc':
            score = utils.AICc(ll, n_features, n_samples)
        else:
            raise ValueError(metric + ' is not a valid option.')
        # negate the score since a lower information criterion is preferable
        score = -score
    return score
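# Illustrative usage sketch (not part of the original module): scores a
# scikit-learn LinearRegression fit with the module-level score_predictions
# above. The data are synthetic; numpy, utils, and r2_score are assumed to
# be imported at module level, as in the function above. Note that the
# information-criterion scores come back negated, so larger is better for
# every metric.
def _example_score_predictions():
    import numpy as np
    from sklearn.linear_model import LinearRegression
    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 10))
    beta = np.zeros(10)
    beta[:3] = 1.0
    y = X @ beta + rng.normal(scale=0.1, size=100)
    support = beta != 0
    fitter = LinearRegression().fit(X[:, support], y)
    r2 = score_predictions('r2', fitter, X, y, support)
    bic = score_predictions('BIC', fitter, X, y, support)
    return r2, bic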
def full_bayes_factor(y, y_pred, n_features, model_size, sparsity_prior,
                      penalty):
    y = y.ravel()
    y_pred = y_pred.ravel()
    n_samples = y.size

    # Log likelihood
    ll = log_likelihood_glm('normal', y, y_pred)

    # Regularization penalty (prior)
    p1 = 2 * penalty * model_size

    # Normal BIC penalty
    BIC = model_size * np.log(n_samples)

    # Second order Bayes factor approximation
    RSS = np.sum((y - y_pred)**2)
    BIC2 = n_samples**3 / (2 * RSS * 3)

    # Term arising from normalization
    BIC3 = model_size * np.log(2 * np.pi)

    # If provided with a list of sparsity estimates, we are specifying a
    # beta hyperprior, and need to integrate over it correspondingly
    if not np.isscalar(sparsity_prior):
        M_k = beta_binomial_model(sparsity_prior, n_features, model_size)
    else:
        if sparsity_prior == 1:
            sparsity_prior = 0.999
        # Model probability prior
        M_k = scipy.special.binom(n_features, model_size) * \
            sparsity_prior**model_size * \
            (1 - sparsity_prior)**(n_features - model_size)

    # If the model probability evaluates to 0, set it to a very small but
    # finite value to avoid blowups in the log
    if M_k == 0:
        M_k = 1e-9

    P_M = 2 * np.log(M_k)

    # bayes_factor = 2 * ll - BIC - BIC2 + BIC3 - p1 + P_M
    return ll, p1, BIC, BIC2, BIC3, M_k, P_M
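# Illustrative usage sketch (not part of the original module): evaluates the
# terms returned by full_bayes_factor on synthetic data with a scalar
# sparsity prior, and combines them as in the commented-out expression above.
# numpy, scipy, and log_likelihood_glm are assumed to be imported at module
# level, as in the function above.
def _example_full_bayes_factor():
    import numpy as np
    rng = np.random.RandomState(0)
    y = rng.normal(size=100)
    y_pred = y + rng.normal(scale=0.5, size=100)
    ll, p1, BIC, BIC2, BIC3, M_k, P_M = full_bayes_factor(
        y, y_pred, n_features=20, model_size=5,
        sparsity_prior=0.25, penalty=0.0)
    bayes_factor = 2 * ll - BIC - BIC2 + BIC3 - p1 + P_M
    return bayes_factor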
def _score_predictions(self, metric, fitter, X, y, support, boot_idxs):
    """Score, according to some metric, predictions provided by a model.

    The resulting score will be negated if an information criterion is
    specified.

    Parameters
    ----------
    metric : string
        The type of score to run on the prediction. Valid options include
        'r2' (explained variance), 'BIC' (Bayesian information criterion),
        'AIC' (Akaike information criterion), and 'AICc' (corrected AIC).
    fitter : object
        Must contain .predict and .predict_proba methods.
    X : array-like
        The design matrix.
    y : array-like
        The response vector.
    support : array-like
        The value of the support for the model.
    boot_idxs : 2-tuple of array-like objects
        Tuple of (train_idxs, test_idxs) generated from a bootstrap sample.
        If this is specified, then the appropriate set of data will be used
        for evaluating scores: test data for r^2, and training data for
        information criteria.

    Returns
    -------
    score : float
        The score.
    """
    # Select the data relevant for the estimation_score
    X = X[boot_idxs[self._estimation_target]]
    y = y[boot_idxs[self._estimation_target]]

    if y.ndim == 2:
        if y.shape[1] > 1:
            raise ValueError('y should either have shape ' +
                             '(n_samples, ) or (n_samples, 1).')
        y = np.squeeze(y)
    elif y.ndim > 2:
        raise ValueError('y should either have shape ' +
                         '(n_samples, ) or (n_samples, 1).')

    y_pred = fitter.predict(X[:, support])
    if y.shape != y_pred.shape:
        raise ValueError('Targets and predictions are not the same shape.')

    if metric == 'r2':
        score = r2_score(y, y_pred)
    else:
        ll = utils.log_likelihood_glm(model='normal', y_true=y,
                                      y_pred=y_pred)
        n_features = np.count_nonzero(support)
        n_samples = X.shape[0]
        if metric == 'BIC':
            score = utils.BIC(ll, n_features, n_samples)
        elif metric == 'AIC':
            score = utils.AIC(ll, n_features)
        elif metric == 'AICc':
            score = utils.AICc(ll, n_features, n_samples)
        else:
            raise ValueError(metric + ' is not a valid option.')
        # negate the score since a lower information criterion is preferable
        score = -score
    return score
def _score_predictions(self, metric, fitter, X, y, support, boot_idxs=None):
    """Score, according to some metric, predictions provided by a model.

    The resulting score will be negated if an information criterion is
    specified.

    Parameters
    ----------
    metric : string
        The type of score to run on the prediction. Valid options include
        'log' (log-likelihood), 'BIC' (Bayesian information criterion),
        'AIC' (Akaike information criterion), and 'AICc' (corrected AIC).
    fitter : Poisson object
        The Poisson object that has been fit to the data with the
        respective hyperparameters.
    X : ndarray, shape (n_samples, n_features)
        The design matrix.
    y : ndarray, shape (n_samples,)
        The response vector.
    support : ndarray
        The indices of the non-zero features.
    boot_idxs : 2-tuple of array-like objects
        Tuple of (train_idxs, test_idxs) generated from a bootstrap sample.
        If this is specified, then the appropriate set of data will be used
        for evaluating scores: test data for the log-likelihood, and
        training data for information criteria.

    Returns
    -------
    score : float
        The score.
    """
    # Select the data relevant for the estimation score
    if boot_idxs is not None:
        X = X[boot_idxs[self._estimation_target]]
        y = y[boot_idxs[self._estimation_target]]

    # for Poisson, use predict_mean to calculate the "predicted" values
    y_pred = fitter.predict_mean(X[:, support])
    # calculate the log-likelihood
    ll = utils.log_likelihood_glm(model='poisson', y_true=y, y_pred=y_pred)
    if metric == 'log':
        score = ll
    # information criteria
    else:
        n_features = np.count_nonzero(support)
        if fitter.intercept_ != 0:
            n_features += 1
        n_samples = X.shape[0]
        if metric == 'BIC':
            score = utils.BIC(n_samples * ll, n_features, n_samples)
        elif metric == 'AIC':
            score = utils.AIC(n_samples * ll, n_features)
        elif metric == 'AICc':
            score = utils.AICc(n_samples * ll, n_features, n_samples)
        else:
            raise ValueError(metric + ' is not a valid metric.')
        # negate the score since a lower information criterion is preferable
        score = -score
    return score