def worker(
    boot_inds,
    X,
    y,
    X_noise=0.01,
    alpha=0.9,
    lambda_path=np.geomspace(1e-3, 1e-06, num=100),
):
    """Fit a LogitNet path on one bootstrap resample and report selected features.

    Parameters
    ----------
    boot_inds
        Row indices used to resample ``X`` and ``y``.
    X, y
        Feature matrix and labels; indexed as ``X[boot_inds, :]`` and
        ``y[boot_inds]``.
    X_noise
        Standard deviation of the Gaussian noise added to the features.
    alpha
        Forwarded to ``LogitNet`` as ``alpha``.
    lambda_path
        Forwarded to ``LogitNet`` as ``lambda_path``.

    Returns
    -------
    dict
        ``"beta"`` is a boolean array marking non-zero entries of the squeezed
        coefficient path, ``"lambda_path"`` is the fitted ``lambda_path_``.
    """
    resampled_X = X[boot_inds, :]
    resampled_y = y[boot_inds]

    # First pass: add a very small jitter (X_noise * 1e-6) and rescale.
    tiny_jitter = np.random.normal(scale=X_noise * 1e-6, size=resampled_X.shape)
    rescaled = scale(resampled_X + tiny_jitter)

    # Second pass: add the full-size noise and rescale again.
    full_jitter = np.random.normal(scale=X_noise, size=resampled_X.shape)
    design = scale(rescaled + full_jitter)

    model = LogitNet(
        alpha=alpha,
        lambda_path=lambda_path,
        fit_intercept=False,
    )
    model.fit(design, resampled_y)

    coefficient_path = model.coef_path_.squeeze()
    return {
        "beta": coefficient_path != 0,
        "lambda_path": model.lambda_path_,
    }
def test_max_features(self):
    """No row of ``coef_`` may have more non-zero entries than ``max_features``."""
    feature_cap = 5
    features, labels = self.multinomial[3]
    model = LogitNet(random_state=1, max_features=feature_cap)
    model = model.fit(features, labels)

    nonzero_per_row = np.count_nonzero(model.coef_, axis=1)
    within_cap = np.all(nonzero_per_row <= feature_cap)
    self.assertTrue(within_cap)
def likelihood(self, y_obs, y_sim):
    """Approximate the likelihood of ``y_obs`` with a penalised logistic regression.

    Parameters
    ----------
    y_obs: list
        Observed data set.
    y_sim: list
        Simulated data set.

    Returns
    -------
    The exponential of the negated, summed linear predictor evaluated at the
    observed summary statistics.

    Raises
    ------
    TypeError
        If ``y_obs`` or ``y_sim`` is not a list.
    """
    checks = (
        (y_obs, 'Observed data is not of allowed types'),
        (y_sim, 'simulated data is not of allowed types'),
    )
    for data, message in checks:
        if not isinstance(data, list):
            raise TypeError(message)

    # Observed statistics are cached; recompute only when nothing is cached
    # yet or a different observed data set is supplied.
    cache_is_stale = self.stat_obs is None or self.data_set != y_obs
    if cache_is_stale:
        self.stat_obs = self.statistics_calc.statistics(y_obs)
        self.data_set = y_obs

    simulated_stats = self.statistics_calc.statistics(y_sim)

    # Label 0 for the simulated statistics, 1 for the reference statistics.
    labels = np.append(np.zeros(self.n_simulate), np.ones(self.n_simulate))
    features = np.array(
        np.concatenate((simulated_stats, self.ref_data_stat), axis=0))

    classifier = LogitNet(alpha=1, n_splits=self.n_folds,
                          max_iter=self.max_iter, random_state=self.seed)
    classifier = classifier.fit(features, labels)

    weighted_stats = np.multiply(classifier.coef_, self.stat_obs)
    linear_predictor = classifier.intercept_ + np.sum(weighted_stats, axis=1)
    return np.exp(-np.sum(linear_predictor, axis=0))
def __init__(self, config=None, parallel_cv=True, class_min=0.005):
    """Set up the logistic regression classifier.

    Parameters
    ----------
    config
        Passed to ``self._resolve_config`` together with ``parallel_cv``; the
        resolved mapping is expanded into the ``LogitNet`` constructor.
    parallel_cv
        Passed to ``self._resolve_config``.
    class_min
        Stored on the instance as ``class_min``.
    """
    resolved_config = self._resolve_config(config, parallel_cv)
    self.config = resolved_config
    self.model = LogitNet(**resolved_config)
    self.class_min = class_min
    self.parameter_names = ['lambda', 'intercept', 'coef']
    self.store = {}
def distance(self, d1, d2):
    """Compute a classification-based distance between two data sets.

    Parameters
    ----------
    d1, d2: list
        A list, containing a list describing the data set

    Returns
    -------
    ``2 * (score - 0.5)`` where ``score`` is the mean cross-validation score
    at the position of ``lambda_max_`` on the fitted lambda path.

    Raises
    ------
    TypeError
        If either argument is not a list.
    """
    for dataset in (d1, d2):
        if not isinstance(dataset, list):
            raise TypeError('Data is not of allowed types')

    # Statistics of the first data set are cached between calls.
    if self.s1 is None or self.data_set != d1:
        self.s1 = self.statistics_calc.statistics(d1)
        self.data_set = d1
    s2 = self.statistics_calc.statistics(d2)

    # Build a two-class problem: label 0 for d1 statistics, 1 for d2.
    features = np.concatenate((self.s1, s2), axis=0)
    first_labels = np.zeros(shape=(len(self.s1), 1))
    second_labels = np.ones(shape=(len(s2), 1))
    labels = np.concatenate((first_labels, second_labels), axis=0).ravel()

    classifier = LogitNet(alpha=1, n_splits=10)
    classifier = classifier.fit(features, labels)

    best_position = np.where(
        classifier.lambda_path_ == classifier.lambda_max_)[0][0]
    return 2.0 * (classifier.cv_mean_score_[best_position] - 0.5)
def test_one_row_predict_proba(self):
    """``predict_proba`` on a single row returns a ``(1, n_classes)`` array."""
    model = LogitNet(random_state=42)
    datasets = itertools.chain(self.binomial, self.multinomial)
    for features, labels in datasets:
        model.fit(features, labels)
        single_row = features[0].reshape((1, -1))
        probabilities = model.predict_proba(single_row)
        expected_shape = (1, len(np.unique(labels)))
        assert probabilities.shape == expected_shape
def test_one_row_predict(self):
    """Predicting a single row must return exactly one prediction."""
    model = LogitNet(random_state=42)
    datasets = itertools.chain(self.binomial, self.multinomial)
    for features, labels in datasets:
        model.fit(features, labels)
        first_row = features[0].reshape((1, -1))
        prediction = model.predict(first_row)
        assert prediction.shape == (1,)
def test_one_row_predict(self):
    """A one-row input to ``predict`` yields an output of shape ``(1,)``."""
    classifier = LogitNet(random_state=42)
    for data, target in itertools.chain(self.binomial, self.multinomial):
        classifier.fit(data, target)
        one_sample = data[0].reshape((1, -1))
        output = classifier.predict(one_sample)
        assert output.shape == (1, )
def train_glmnet(train, test, save_path_pred, save_path_model, save_path_json, n_cores=5):
    """Train a cross-validated LogitNet model and persist predictions, metrics and model.

    Parameters
    ----------
    train, test
        Indexable pairs: element ``[0]`` holds the features and element
        ``[1]`` the labels.
    save_path_pred
        Destination of the CSV with ``y_true`` / ``y_pred`` columns.
    save_path_model
        Destination of the pickled fitted model.
    save_path_json
        Destination of the JSON file with the ``auprc`` and ``auc`` metrics.
    n_cores
        Forwarded to ``LogitNet`` as ``n_jobs``.
    """
    ln = LogitNet(alpha=0.5, n_splits=10, n_jobs=n_cores)

    # Only the feature matrices are converted to sparse form; the labels are
    # passed to fit/metrics in their original dense form, so the previously
    # built (and unused) sparse label matrices are no longer created.
    train_features = csc_matrix(train[0])
    test_features = csc_matrix(test[0])

    print("train the model")
    ln.fit(train_features, train[1])

    print("get predictions")
    # Column 1 of predict_proba is used as the score for the metrics below.
    y_pred = ln.predict_proba(test_features)[:, 1]
    auprc = cem.auprc(test[1], y_pred)
    auc = cem.auc(test[1], y_pred)

    # csv
    print("save csv")
    dt = pd.DataFrame({"y_true": test[1], "y_pred": y_pred})
    dt.to_csv(save_path_pred)

    # json
    print("save json")
    write_json({"auprc": auprc, "auc": auc}, save_path_json)

    # model
    print("save model")
    # Use a context manager so the file handle is flushed and closed even if
    # pickling fails part-way through.
    with open(save_path_model, "wb") as model_file:
        pickle.dump(ln, model_file)
def test_random_state_cv(self):
    """The cross-validation splitter must receive the model's ``random_state``."""
    random_state = 133
    m = LogitNet(random_state=random_state)
    x, y = self.binomial[0]
    m.fit(x, y)
    # The leftover debug print of dir(m._cv) was removed: it only polluted
    # the test output and asserted nothing.
    assert m._cv.random_state == random_state
def update(self, batch, batch_index):
    """Updates the inference state with a new batch and performs LFIRE.

    Parameters
    ----------
    batch: dict
        Indexed by the names in ``self.summary_names`` and
        ``self.parameter_names``.
    batch_index: int
        Position in the state arrays (and in ``self.params_grid``) that this
        batch fills.
    """
    super(LFIRE, self).update(batch, batch_index)

    # Parse likelihood values
    likelihood = np.column_stack(
        [batch[summary_name] for summary_name in self.summary_names])

    # Create training data: label +1 for the batch rows, -1 for the marginal rows
    X = np.vstack((likelihood, self.marginal))
    y = np.concatenate((np.ones(likelihood.shape[0]),
                        -1 * np.ones(self.marginal.shape[0])))

    # Logistic regression
    m = LogitNet(**self.logreg_config)
    m.fit(X, y)

    # Likelihood value
    log_likelihood_value = m.intercept_ + np.sum(
        np.multiply(m.coef_, self.observed))
    likelihood_value = np.exp(log_likelihood_value)

    # Joint prior value
    parameter_values = [
        batch[parameter_name] for parameter_name in self.parameter_names
    ]
    joint_prior_value = self.joint_prior.pdf(parameter_values)

    # Posterior value
    posterior_value = joint_prior_value * likelihood_value

    # Check if posterior value is infinite
    # NOTE(review): np.isinf does not catch NaN; confirm whether NaN
    # posteriors should be zeroed as well.
    if np.isinf(posterior_value):
        params = self.params_grid[batch_index]
        # Adjacent f-strings are used instead of a backslash continuation so
        # the message no longer contains the source indentation.
        warnings.warn(
            f'Posterior value is not finite for parameters '
            f'{self.parameter_names} = {params} and thus will be replaced with zero!',
            RuntimeWarning)
        posterior_value = 0
        for i, parameter_name in enumerate(self.parameter_names):
            self.state['infinity'][parameter_name] += [params[i]]

    # Update state dictionary
    self.state['posterior'][batch_index] = posterior_value
    self.state['lambda'][batch_index] = m.lambda_best_
    self.state['coef'][batch_index, :] = m.coef_
    self.state['intercept'][batch_index] = m.intercept_
    for parameter_name in self.parameter_names:
        self.state[parameter_name][batch_index] = batch[parameter_name]
def test_coef_limits(self):
    """Coefficients fitted with limits ``[-1, 0]`` must stay inside that box."""
    features, labels = self.binomial[0]
    n_features = features.shape[1]
    lower = np.repeat(-1, n_features)
    upper = 0

    model = LogitNet(lower_limits=lower, upper_limits=upper,
                     random_state=69265, alpha=0)
    model = model.fit(features, labels)

    assert np.all(model.coef_ >= -1)
    assert np.all(model.coef_ <= 0)
def test_with_pandas_df(self):
    """Fitting must accept a pandas DataFrame / Series as input."""
    features, labels = make_classification(random_state=1105)
    frame = pd.DataFrame(features)
    frame['y'] = labels

    model = LogitNet(n_folds=3, random_state=123)
    predictors = frame.drop(['y'], axis=1)
    model = model.fit(predictors, frame.y)
    sanity_check_logistic(model, features)
def test_with_pandas_df(self):
    """A model fitted from a DataFrame and a Series passes the sanity check."""
    data, target = make_classification(random_state=1105)
    table = pd.DataFrame(data)
    table['y'] = target

    classifier = LogitNet(n_splits=3, random_state=123)
    feature_columns = table.drop(['y'], axis=1)
    classifier = classifier.fit(feature_columns, table.y)
    sanity_check_logistic(classifier, data)
def test_predict_without_cv(self):
    """With ``n_folds=0``, ``predict`` without an explicit lambda must raise."""
    features, labels = self.binomial[0]
    model = LogitNet(n_folds=0, random_state=399001)
    model = model.fit(features, labels)

    # No lambda was selected by cross-validation, so predicting without
    # passing one is expected to fail.
    with self.assertRaises(ValueError):
        model.predict(features)
def test_one_row_predict_proba_with_lambda(self):
    """One row plus several lambdas gives a ``(1, n_classes, n_lambdas)`` array."""
    model = LogitNet(random_state=42)
    lambdas = [0.01, 0.02, 0.04, 0.1]
    datasets = itertools.chain(self.binomial, self.multinomial)
    for features, labels in datasets:
        model.fit(features, labels)
        single_row = features[0].reshape((1, -1))
        probabilities = model.predict_proba(single_row, lamb=lambdas)
        expected_shape = (1, len(np.unique(labels)), len(lambdas))
        assert probabilities.shape == expected_shape
def test_predict_without_cv(self):
    """With ``n_splits=0``, ``predict`` without an explicit lambda must raise."""
    data, target = self.binomial[0]
    classifier = LogitNet(n_splits=0, random_state=399001)
    classifier = classifier.fit(data, target)

    # Prediction requires a lambda value when cross-validation was skipped.
    with self.assertRaises(ValueError):
        classifier.predict(data)
def test_coef_limits(self):
    """Coefficients fitted with limits ``[0, 1]`` must stay inside that box."""
    x, y = self.binomial[0]
    lower_limits = 0
    upper_limits = np.repeat(1, x.shape[1])
    m = LogitNet(lower_limits=lower_limits, upper_limits=upper_limits,
                 random_state=69265)
    m = m.fit(x, y)

    # The comparison must happen element-wise *inside* np.all. The previous
    # form, np.all(m.coef_) >= 0, compared a single boolean with a number and
    # was therefore always true.
    assert np.all(m.coef_ >= 0)
    assert np.all(m.coef_ <= 1)
def test_single_class_exception(self):
    """Fitting on labels with a single class raises a descriptive ValueError."""
    features, labels = self.binomial[0]
    single_class_labels = np.ones_like(labels)
    model = LogitNet()

    with self.assertRaises(ValueError) as caught:
        model.fit(features, single_class_labels)

    message = str(caught.exception)
    self.assertEqual("Training data need to contain at least 2 classes.", message)
def test_n_splits(self):
    """``n_splits`` of 1 or 2 is rejected; other values fit and pass the sanity check."""
    x, y = self.binomial[0]
    for n in self.n_splits:
        m = LogitNet(n_splits=n, random_state=46657)
        if 0 < n < 3:
            # assertRaisesRegex replaces the deprecated assertRaisesRegexp
            # alias, which was removed in Python 3.12.
            with self.assertRaisesRegex(ValueError,
                                        "n_splits must be at least 3"):
                m = m.fit(x, y)
        else:
            m = m.fit(x, y)
            sanity_check_logistic(m, x)
def test_n_folds(self):
    """``n_folds`` of 1 or 2 is rejected; other values fit and pass the sanity check."""
    x, y = self.binomial[0]
    for n in self.n_folds:
        m = LogitNet(n_folds=n, random_state=46657)
        if 0 < n < 3:
            # assertRaisesRegex replaces the deprecated assertRaisesRegexp
            # alias, which was removed in Python 3.12.
            with self.assertRaisesRegex(ValueError,
                                        "n_folds must be at least 3"):
                m = m.fit(x, y)
        else:
            m = m.fit(x, y)
            sanity_check_logistic(m, x)
def test_cv_scoring_multinomial(self):
    """Supported scorers fit multinomial data; unsupported ones raise ValueError."""
    features, labels = self.multinomial[0]
    for method in self.scoring:
        model = LogitNet(scoring=method, random_state=488881)

        if method not in self.multinomial_scoring:
            # Scorers outside the multinomial list must be rejected at fit time.
            with self.assertRaises(ValueError):
                model.fit(features, labels)
            continue

        model = model.fit(features, labels)
        predictions = model.predict(features)
        check_accuracy(labels, predictions, 0.65, scoring=method)
def test_coef_limits(self):
    """With limits ``[-1, 0]`` every fitted coefficient lies in that range."""
    data, target = self.binomial[0]
    lower_bound = np.repeat(-1, data.shape[1])
    upper_bound = 0

    classifier = LogitNet(lower_limits=lower_bound,
                          upper_limits=upper_bound,
                          random_state=69265,
                          alpha=0)
    classifier = classifier.fit(data, target)

    coefficients = classifier.coef_
    assert np.all(coefficients >= -1)
    assert np.all(coefficients <= 0)
def test_with_defaults(self):
    """A default-configured model fits every data set and predicts over the full path."""
    model = LogitNet(random_state=29341)
    datasets = itertools.chain(self.binomial, self.multinomial)
    for features, labels in datasets:
        model = model.fit(features, labels)
        sanity_check_logistic(model, features)

        # check selection of lambda_best
        assert model.lambda_best_inx_ <= model.lambda_max_inx_

        # check full path predict
        path_predictions = model.predict(features, lamb=model.lambda_path_)
        assert path_predictions.shape[-1] == model.lambda_path_.size
def test_with_defaults(self):
    """Default settings fit all data sets; the last prediction axis spans the lambda path."""
    classifier = LogitNet(random_state=29341)
    for data, target in itertools.chain(self.binomial, self.multinomial):
        classifier = classifier.fit(data, target)
        sanity_check_logistic(classifier, data)

        # check selection of lambda_best
        best_not_after_max = classifier.lambda_best_inx_ <= classifier.lambda_max_inx_
        ok_(best_not_after_max)

        # check full path predict
        predictions = classifier.predict(data, lamb=classifier.lambda_path_)
        eq_(predictions.shape[-1], classifier.lambda_path_.size)
def test_use_sample_weights(self):
    """Fitting with sample weights must not score worse on the weighted F1 metric."""
    x, y = self.multinomial[1]

    # Drop all but the last three samples of class 0.
    class_0_rows = np.where(y == 0)[0]
    rows_to_drop = class_0_rows[:-3]
    keep_mask = np.ones(len(y), dtype=bool)
    keep_mask[rows_to_drop] = False
    y = y[keep_mask]
    x = x[keep_mask, :]

    weights = class_weight.compute_sample_weight('balanced', y)
    weights[0] = 0.

    plain_model = LogitNet(random_state=2, scoring='f1_micro')
    plain_model = plain_model.fit(x, y)
    plain_score = f1_score(y, plain_model.predict(x),
                           sample_weight=weights, average='micro')

    weighted_model = LogitNet(random_state=2, scoring='f1_micro')
    weighted_model = weighted_model.fit(x, y, sample_weight=weights)
    weighted_score = f1_score(y, weighted_model.predict(x),
                              sample_weight=weights, average='micro')

    self.assertTrue(weighted_score >= plain_score)
def test_coef_interpolation(self):
    """Predictions at a lambda between two path values differ from both neighbours."""
    features, labels = self.binomial[0]
    model = LogitNet(n_folds=0, random_state=561)
    model = model.fit(features, labels)

    # Two adjacent lambdas on the computed path and their midpoint, which is
    # not itself a value on the path.
    lower_lambda = model.lambda_path_[1]
    upper_lambda = model.lambda_path_[2]
    middle_lambda = (lower_lambda + upper_lambda) / 2.0

    at_lower = model.predict_proba(features, lamb=lower_lambda)
    at_upper = model.predict_proba(features, lamb=upper_lambda)
    at_middle = model.predict_proba(features, lamb=middle_lambda)

    self.assertFalse(np.allclose(at_lower, at_middle))
    self.assertFalse(np.allclose(at_upper, at_middle))
def distance(self, d1, d2):
    """Compute a classification-based distance between two data sets.

    Summary statistics of both data sets are labelled 0 and 1 and a
    ``LogitNet`` classifier is fitted on them.

    Returns
    -------
    ``2 * (score - 0.5)`` where ``score`` is the mean cross-validation score
    at the position of ``lambda_max_`` on the fitted lambda path.
    """
    # Extract summary statistics from the datasets
    first_stats = self.statistics_calc.statistics(d1)
    second_stats = self.statistics_calc.statistics(d2)

    # Assemble the two-class training set
    features = np.concatenate((first_stats, second_stats), axis=0)
    first_labels = np.zeros(shape=(len(first_stats), 1))
    second_labels = np.ones(shape=(len(second_stats), 1))
    labels = np.concatenate((first_labels, second_labels), axis=0).ravel()

    classifier = LogitNet(alpha=1, n_splits=10)
    classifier = classifier.fit(features, labels)

    best_position = np.where(
        classifier.lambda_path_ == classifier.lambda_max_)[0][0]
    return 2.0 * (classifier.cv_mean_score_[best_position] - 0.5)
def test_coef_interpolation(self):
    """Probabilities at an off-path lambda differ from those at the adjacent path values."""
    data, target = self.binomial[0]
    classifier = LogitNet(n_splits=0, random_state=561)
    classifier = classifier.fit(data, target)

    # predict for a value of lambda between two values on the computed path
    lamb_below = classifier.lambda_path_[1]
    lamb_above = classifier.lambda_path_[2]
    # the midpoint is not equal to any value on the computed path
    lamb_between = (lamb_below + lamb_above) / 2.0

    proba_below = classifier.predict_proba(data, lamb=lamb_below)
    proba_above = classifier.predict_proba(data, lamb=lamb_above)
    proba_between = classifier.predict_proba(data, lamb=lamb_between)

    self.assertFalse(np.allclose(proba_below, proba_between))
    self.assertFalse(np.allclose(proba_above, proba_between))
def distance(self, d1, d2):
    """Calculates the distance between two datasets.

    Parameters
    ----------
    d1: Python list
        Contains n1 data points.
    d2: Python list
        Contains n2 data points.

    Returns
    -------
    numpy.float
        The distance between the two input data sets.

    Raises
    ------
    RuntimeError
        If the two data sets do not yield the same number of summary
        statistics rows.
    """
    s1, s2 = self._calculate_summary_stat(d1, d2)
    self.n_simulate = s1.shape[0]

    if not s2.shape[0] == self.n_simulate:
        raise RuntimeError(
            "The number of simulations in the two data sets should be the same in order for "
            "the classification accuracy implemented in PenLogReg to be a proper distance. Please "
            "check that `n_samples` in the `sample()` method for the sampler is equal to "
            "the number of datasets in the observations.")

    # compute distance between the statistics
    training_set_features = np.concatenate((s1, s2), axis=0)
    label_s1 = np.zeros(shape=(len(s1), 1))
    label_s2 = np.ones(shape=(len(s2), 1))
    training_set_labels = np.concatenate((label_s1, label_s2), axis=0).ravel()

    # Assign each sample to a fold id. The built-in int replaces np.int, an
    # alias that was deprecated in NumPy 1.20 and removed in 1.24.
    samples_per_fold = int(np.ceil(self.n_simulate / self.n_folds))
    groups = np.repeat(np.arange(self.n_folds), samples_per_fold)
    groups = groups[:self.n_simulate].tolist()
    groups += groups  # duplicate it as groups need to be defined for both datasets

    m = LogitNet(
        alpha=1,
        n_splits=self.n_folds)  # note we are not using random seed here!
    m = m.fit(training_set_features, training_set_labels, groups=groups)

    distance = 2.0 * (m.cv_mean_score_[np.where(
        m.lambda_path_ == m.lambda_max_)[0][0]] - 0.5)
    return distance
def loglikelihood(self, y_obs, y_sim):
    """Approximate the log-likelihood of ``y_obs`` with a penalised logistic regression.

    Parameters
    ----------
    y_obs: list
        Observed data set.
    y_sim: list
        Simulated data set.

    Returns
    -------
    The negated sum of the linear predictor evaluated at the observed
    summary statistics.

    Raises
    ------
    TypeError
        If ``y_obs`` or ``y_sim`` is not a list.
    RuntimeError
        If the simulated statistics do not have ``self.n_simulate`` rows.
    """
    if not isinstance(y_obs, list):
        raise TypeError('Observed data is not of allowed types')

    if not isinstance(y_sim, list):
        raise TypeError('simulated data is not of allowed types')

    # Check whether y_obs is same as the stored dataset.
    if self.data_set is not None:
        # check that the observations have the same length; if not, they can't be the same:
        if len(y_obs) != len(self.data_set):
            self.dataSame = False
        elif len(np.array(y_obs[0]).reshape(-1, )) == 1:
            self.dataSame = self.data_set == y_obs
        else:  # otherwise it fails when y_obs[0] is array
            self.dataSame = all(
                (np.array(stored) == np.array(observed)).all()
                for stored, observed in zip(self.data_set, y_obs))

    if self.stat_obs is None or self.dataSame is False:
        self.stat_obs = self.statistics_calc.statistics(y_obs)
        self.data_set = y_obs

    # Extract summary statistics from the simulated data
    stat_sim = self.statistics_calc.statistics(y_sim)
    if not stat_sim.shape[0] == self.n_simulate:
        # A space was added after "method" so the concatenated message no
        # longer reads "methodfor".
        raise RuntimeError("The number of samples in the reference data set is not the same as the number of "
                           "samples in the generated data. Please check that `n_samples` in the `sample()` method "
                           "for the sampler is equal to `n_simulate` in PenLogReg.")

    # Compute the approximate likelihood for the y_obs given theta
    y = np.append(np.zeros(self.n_simulate), np.ones(self.n_simulate))
    X = np.array(np.concatenate((stat_sim, self.ref_data_stat), axis=0))

    # define here groups for cross-validation. The built-in int replaces
    # np.int, an alias deprecated in NumPy 1.20 and removed in 1.24.
    samples_per_fold = int(np.ceil(self.n_simulate / self.n_folds))
    groups = np.repeat(np.arange(self.n_folds), samples_per_fold)
    groups = groups[:self.n_simulate].tolist()
    groups += groups  # duplicate it as groups need to be defined for both datasets

    m = LogitNet(alpha=1, n_splits=self.n_folds, max_iter=self.max_iter,
                 random_state=self.seed, scoring="log_loss")
    m = m.fit(X, y, groups=groups)

    result = -np.sum((m.intercept_ + np.sum(np.multiply(m.coef_, self.stat_obs), axis=1)), axis=0)
    return result
def test_lambda_clip_warning(self):
    """Predicting with a lambda one unit beyond either end of the path warns."""
    features, labels = self.binomial[0]
    model = LogitNet(n_folds=0, random_state=1729)
    model = model.fit(features, labels)

    # (path index, offset): first path value plus one, last path value minus one.
    for path_index, offset in ((0, 1), (-1, -1)):
        with self.assertWarns(RuntimeWarning):
            model.predict(features, lamb=model.lambda_path_[path_index] + offset)
def create_estimator(self):
    """
    Create an estimator.

    Creates an estimator depending on the family of regression:
    ``'gaussian'`` gives an ``ElasticNet`` and ``'binomial'`` a ``LogitNet``,
    both built with ``standardize=False`` and ``cut_point=0``.

    :return: The estimator matching ``self.family``.
    :raises ValueError: If ``self.family`` is neither ``'gaussian'`` nor
        ``'binomial'``. Previously this case fell through to the ``return``
        and surfaced as an ``UnboundLocalError``.
    """
    if self.family == 'gaussian':
        return ElasticNet(standardize=False, cut_point=0)
    if self.family == 'binomial':
        return LogitNet(standardize=False, cut_point=0)
    raise ValueError(
        f"Unsupported family {self.family!r}; expected 'gaussian' or 'binomial'.")
def _parallel_permute_count_nonzero_penalised_coefs(xp, yp, lam_path, penalties, norm_num, is_regression):
    """Shuffle the response, refit along the lambda path and count penalised non-zero coefficients.

    ``yp`` is shuffled in place. An ``ElasticNet`` (regression) or
    ``LogitNet`` (classification) is fitted with ``alpha=norm_num`` and the
    given ``lam_path``, using ``penalties`` as relative penalties.

    Returns the sum over axis 0 of the sign of the absolute coefficient path
    multiplied by ``vec_to_array(penalties)``.
    NOTE(review): presumably one count per lambda value - confirm the layout
    of ``coef_path_`` and the shape returned by ``vec_to_array``.
    """
    from glmnet import ElasticNet, LogitNet

    # Permute the response in place before fitting.
    np.random.shuffle(yp)

    estimator_cls = ElasticNet if is_regression else LogitNet
    permuted_model = estimator_cls(alpha=norm_num, lambda_path=lam_path)
    permuted_model.fit(xp, yp, relative_penalties=penalties)

    # Multiplying by the penalties zeroes out coefficients with penalty 0.
    coefficient_magnitudes = np.abs(np.squeeze(permuted_model.coef_path_))
    penalised_magnitudes = coefficient_magnitudes * vec_to_array(penalties)
    return np.sign(penalised_magnitudes).sum(axis=0)
def __init__(self, penalty_free_indices=None, min_lambda_ratio=1e-3, n_lambdas=250, cv=10, is_regression=True,
             norm_num=1):
    """Set up the underlying glmnet model and the result placeholders.

    Parameters
    ----------
    penalty_free_indices : list or np.ndarray, optional
        Stored as ``self.ols_idx``. Defaults to a fresh empty list per
        instance (the former ``list()`` default was a single object shared by
        every instance).
    min_lambda_ratio, n_lambdas
        Passed positionally to the glmnet estimator as its third and second
        arguments respectively.
    cv
        Passed to the estimator as ``n_splits``.
    is_regression
        Selects ``ElasticNet`` when true, ``LogitNet`` otherwise.
    norm_num
        Passed positionally to the estimator as its first argument and stored
        as ``self.norm_num``.

    Raises
    ------
    ValueError
        If ``penalty_free_indices`` is neither a list nor a numpy array.
    """
    from glmnet import ElasticNet, LogitNet

    if penalty_free_indices is None:
        penalty_free_indices = []
    if not isinstance(penalty_free_indices, (list, np.ndarray)):
        # The message now names the actual parameter instead of 'ols_indices'.
        raise ValueError('penalty_free_indices must be a list or np.array')

    estimator_cls = ElasticNet if is_regression else LogitNet
    self.model = estimator_cls(norm_num, n_lambdas, min_lambda_ratio, n_splits=cv, n_jobs=cpu_count())

    self.norm_num = norm_num
    self.ols_idx = penalty_free_indices
    self.is_regression = is_regression

    # Placeholders, all initialised to None here.
    self.n = None
    self.p = None
    self.coef_path = None
    self.lambdas = None
    self.fdr_grid = None
    self.fdr_analytic_grid = None
    self.n_nonzero_true_coefs = None
    self.mean_n_false_positive_coefs = None
def test_lambda_clip_warning(self):
    """A lambda one unit beyond either end of the fitted path triggers a RuntimeWarning."""
    data, target = self.binomial[0]
    classifier = LogitNet(n_splits=0, random_state=1729)
    classifier = classifier.fit(data, target)

    # (path index, offset): first path value plus one, last path value minus one.
    for position, shift in ((0, 1), (-1, -1)):
        with self.assertWarns(RuntimeWarning):
            classifier.predict(data, lamb=classifier.lambda_path_[position] + shift)
def avg_graphs(X, f_true, method):
    """Simulate labels, fit a family of classifiers and plot bias / variance / MSE curves.

    For each of 10 simulations, binary labels are drawn with success
    probabilities ``f_true``. A model is then fitted for every value of the
    tuning grid (``1..100`` for ``'knn'``, ``exp(-3 .. 7 step 0.1)``
    otherwise) and three error measures are collected per x-axis value. The
    curves returned by ``reorder_dict`` are plotted and the minimum of the
    MSE curve is printed.

    Parameters
    ----------
    X
        Feature matrix; models are fitted on ``X.T``.
    f_true
        True success probabilities, one per sample.
    method
        Must be a member of the module-level ``methods``.
    """
    assert method in methods

    n_simulations = 10
    probs = [[1 - ff, ff] for ff in f_true]
    avg_square_bias = defaultdict(list)
    avg_variance = defaultdict(list)
    avg_mse = defaultdict(list)

    if method == 'knn':
        tuning_grid = np.arange(1, 101, 1)
    else:
        tuning_grid = np.exp(np.arange(-3, 7, 0.1))

    for _ in range(n_simulations):
        y = [np.random.choice(2, p=prob) for prob in probs]
        for tuning in tuning_grid:
            if method == 'knn':
                model = KNeighborsClassifier(n_neighbors=tuning).fit(X.T, y)
            elif method == 'lasso':
                model = LogisticRegression(penalty='l1',
                                           solver='liblinear',
                                           C=tuning).fit(X.T, y)
            else:
                model = LogitNet(alpha=0, lambda_path=[tuning]).fit(X.T, y)

            f_hat = model.predict_proba(X.T)[:, 1]

            # x-axis value: k for knn, number of non-zero coefficients for
            # lasso, log of the tuning value otherwise.
            if method == 'knn':
                x_val = tuning
            elif method == 'lasso':
                x_val = np.count_nonzero(model.coef_)
            else:
                x_val = np.log(tuning)

            avg_square_bias[x_val].append(mean_squared_error(f_true, f_hat.T))
            avg_variance[x_val].append(np.mean(np.var(f_hat)))
            avg_mse[x_val].append(mean_squared_error(f_hat.T, y))

    bias_x, bias_y = reorder_dict(avg_square_bias)
    var_x, var_y = reorder_dict(avg_variance)
    mse_x, mse_y = reorder_dict(avg_mse)

    plt.plot(bias_x, bias_y, label='avg_square_bias')
    plt.plot(var_x, var_y, label='avg_variance')
    plt.plot(mse_x, mse_y, label='avg_MSE')
    plt.title(f"graphs for {method} predictor")

    if method == 'knn':
        x_axis_label = 'k'
    elif method == 'lasso':
        x_axis_label = 'num of non-zero coefficients'
    else:
        x_axis_label = 'log lambda'
    plt.xlabel(x_axis_label)
    plt.legend()
    plt.show()

    print(
        f"for {method}, the optimal MSE is {np.min(mse_y)} and we get it when {x_label_dict[method]} is {mse_x[np.argmin(mse_y)]}"
    )
def test_relative_penalties(self):
    """Removing the penalty on LASSO-selected features must not shrink their estimates."""
    features, labels = self.binomial[0]
    n_features = features.shape[1]

    # Baseline LASSO fit without relative penalties.
    penalised = LogitNet(alpha=1)
    penalised.fit(features, labels)

    # Give zero penalty to the features the baseline fit selected.
    selected = np.nonzero(penalised.coef_[0])
    penalty = np.repeat(1, n_features)
    penalty[selected] = 0

    # Refit with those features unpenalised.
    relaxed = LogitNet(alpha=1)
    relaxed.fit(features, labels, relative_penalties=penalty)

    # Unpenalised estimates should be at least as large in absolute value.
    penalised_abs = np.abs(penalised.coef_[0])
    relaxed_abs = np.abs(relaxed.coef_[0])
    assert np.all(penalised_abs <= relaxed_abs)
def test_use_sample_weights(self):
    """A model fitted with sample weights scores at least as well on weighted micro-F1."""
    x, y = self.multinomial[1]

    # Keep only the last three samples of class 0.
    class_0_positions = np.where(y == 0)[0]
    discard = class_0_positions[:-3]
    retained = np.ones(len(y), dtype=bool)
    retained[discard] = False
    y = y[retained]
    x = x[retained, :]

    sample_weight = class_weight.compute_sample_weight('balanced', y)
    sample_weight[0] = 0.

    baseline = LogitNet(random_state=2, scoring='f1_micro')
    baseline = baseline.fit(x, y)
    baseline_f1 = f1_score(y, baseline.predict(x),
                           sample_weight=sample_weight, average='micro')

    reweighted = LogitNet(random_state=2, scoring='f1_micro')
    reweighted = reweighted.fit(x, y, sample_weight=sample_weight)
    reweighted_f1 = f1_score(y, reweighted.predict(x),
                             sample_weight=sample_weight, average='micro')

    self.assertTrue(reweighted_f1 >= baseline_f1)
def test_relative_penalties(self):
    """Coefficients refitted without penalty are no smaller in magnitude than the LASSO ones."""
    data, target = self.binomial[0]
    feature_count = data.shape[1]

    # first model: no relative penalties applied
    lasso_model = LogitNet(alpha=1)
    lasso_model.fit(data, target)

    # indices of the coefficients the LASSO fit left non-zero
    nonzero_positions = np.nonzero(lasso_model.coef_[0])

    # penalty of 1 everywhere except on those coefficients
    relative_penalty = np.repeat(1, feature_count)
    relative_penalty[nonzero_positions] = 0

    # second model: same settings, selected coefficients unpenalised
    unpenalised_model = LogitNet(alpha=1)
    unpenalised_model.fit(data, target, relative_penalties=relative_penalty)

    # the unpenalised estimates must dominate in absolute value
    lasso_magnitude = np.abs(lasso_model.coef_[0])
    unpenalised_magnitude = np.abs(unpenalised_model.coef_[0])
    assert np.all(lasso_magnitude <= unpenalised_magnitude)
def test_cv_scoring(self):
    """Every scoring method in ``self.scoring`` reaches the 0.85 accuracy threshold."""
    features, labels = self.binomial[0]
    for method in self.scoring:
        model = LogitNet(scoring=method, random_state=52633)
        model = model.fit(features, labels)
        predictions = model.predict(features)
        check_accuracy(labels, predictions, 0.85, scoring=method)
def test_alphas(self):
    """Every alpha in ``self.alphas`` reaches the 0.85 accuracy threshold."""
    features, labels = self.binomial[0]
    for alpha in self.alphas:
        model = LogitNet(alpha=alpha, random_state=41041)
        model = model.fit(features, labels)
        predictions = model.predict(features)
        check_accuracy(labels, predictions, 0.85, alpha=alpha)