def test_LearnerSCCS_confidence_intervals(self):
    """Bootstrap confidence intervals should bracket the fitted coefficients.

    The check is run twice: once with the fixture's lags and once with
    zero lags for every feature.  The original test duplicated the whole
    fit / preprocess / bootstrap / assert sequence; it is factored into a
    local helper here.  The assertion messages also used backslash
    continuations *inside* the string literals, which embedded a long run
    of indentation spaces in the message — fixed by using implicit string
    concatenation instead.
    """

    def check_bounds(n_lags):
        # Fit, preprocess, bootstrap, then verify that every coefficient
        # lies inside its estimated confidence interval.
        lrn = ConvSCCS(n_lags=n_lags, penalized_features=[])
        coeffs, _ = lrn.fit(self.features, self.labels, self.censoring)
        p_features, p_labels, p_censoring = lrn._preprocess_data(
            self.features, self.labels, self.censoring)
        confidence_intervals = lrn._bootstrap(
            p_features, p_labels, p_censoring, np.hstack(coeffs), 5, .90)
        for i, c in enumerate(coeffs):
            self.assertTrue(
                np.all(confidence_intervals.lower_bound[i] <= c),
                "lower bound of the confidence interval"
                " should be <= coeffs at index %i" % i)
            self.assertTrue(
                np.all(c <= confidence_intervals.upper_bound[i]),
                "upper bound of the confidence interval"
                " should be >= coeffs at index %i" % i)

    check_bounds(self.n_lags)
    # Same check with 0 lags for every feature.
    check_bounds(np.zeros_like(self.n_lags, dtype='uint64'))
def test_LearnerSCCS_coefficient_groups(self):
    """_detect_support must return the equality groups of a coeffs vector.

    Builds a coefficient vector with known runs of equal values across
    four features (lags 4, 0, 3, 4) and compares the detected
    (start, end) groups against the hand-computed expectation.
    """
    lags = np.array([4, 0, 3, 4], dtype='uint64')
    feature_count = len(lags)
    total_coeffs = (lags + 1).sum()
    coeffs = np.ones((total_coeffs, ))
    coeffs[1:3] = 2                           # 1st feature
    coeffs[5] = 1                             # 2nd feature
    coeffs[6:8] = 0                           # 3rd feature
    coeffs[10:] = np.array([1, 2, 3, 4, 4])   # 4th feature
    expected_groups = [(1, 3), (3, 5), (6, 8), (8, 10), (13, 15)]
    learner = ConvSCCS(n_lags=lags, penalized_features=np.arange(4))
    learner._set("n_features", feature_count)
    detected_groups = learner._detect_support(coeffs)
    self.assertEqual(expected_groups, detected_groups)
def test_LearnerSCCS_fit(self):
    """Fitted coefficients should approximate the simulated ground truth.

    Simulates a small SCCS dataset with known coefficients, fits an
    unpenalized learner on it, and checks the estimates to one decimal.
    """
    seed = 42
    lags = np.repeat(2, 2).astype('uint64')
    simulator = SimuSCCS(n_cases=800, n_intervals=10, n_features=2,
                         n_lags=lags, verbose=False, seed=seed,
                         exposure_type='multiple_exposures')
    features, _, labels, censoring, true_coeffs = simulator.simulate()
    learner = ConvSCCS(n_lags=lags, penalized_features=[], tol=0,
                       max_iter=10, random_state=seed)
    fitted_coeffs, _ = learner.fit(features, labels, censoring)
    np.testing.assert_almost_equal(np.hstack(fitted_coeffs),
                                   np.hstack(true_coeffs), decimal=1)
def test_LearnerSCCS_score(self):
    """score() with no arguments must equal score() on the training data."""
    learner = ConvSCCS(n_lags=self.n_lags, penalized_features=[],
                       random_state=self.seed)
    learner.fit(self.features, self.labels, self.censoring)
    default_score = learner.score()
    explicit_score = learner.score(self.features, self.labels,
                                   self.censoring)
    self.assertEqual(default_score, explicit_score)
def test_LearnerSCCS_preprocess(self):
    """_prefit should lag-encode the features and drop all-zero-label samples.

    With n_lags = [1, 1, 0] the three original columns expand into five
    (two lagged columns each for features 0 and 1, one for feature 2),
    and the third sample — whose labels are all zero — is dropped from
    the features, labels and censoring alike.

    The original test ran the assertions inside throwaway list
    comprehensions (side effects only); replaced with plain loops, plus
    explicit length checks so a short result cannot silently pass.
    """
    features = [
        np.array([[0, 1, 0], [0, 0, 0], [0, 1, 1]], dtype="float64"),
        np.array([[1, 0, 1], [0, 0, 1], [1, 0, 0]], dtype="float64"),
        np.array([[1, 1, 1], [0, 0, 1], [1, 1, 0]], dtype="float64")
    ]
    sparse_features = [csr_matrix(f, shape=(3, 3)) for f in features]
    labels = [
        np.array([0, 0, 1], dtype="uint64"),
        np.array([0, 1, 0], dtype="uint64"),
        np.array([0, 0, 0], dtype="uint64")
    ]
    censoring = np.array([2, 3, 3], dtype="uint64")
    n_lags = np.array([1, 1, 0], dtype="uint64")
    expected_features = [
        np.array([[0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 0]]),
        np.array([[1, 0, 0, 0, 1], [0, 1, 0, 0, 1], [1, 0, 0, 0, 0]])
    ]
    expected_labels = [
        np.array([0, 0, 1], dtype="uint64"),
        np.array([0, 1, 0], dtype="uint64"),
    ]
    expected_censoring = np.array([2, 3], dtype="uint64")
    lrn = ConvSCCS(n_lags=n_lags, penalized_features=[])
    X, y, c = lrn._prefit(sparse_features, labels, censoring)
    self.assertEqual(len(X), len(expected_features))
    for actual, expected in zip(X, expected_features):
        np.testing.assert_array_equal(actual.toarray(), expected)
    self.assertEqual(len(y), len(expected_labels))
    for actual, expected in zip(y, expected_labels):
        np.testing.assert_array_equal(actual, expected)
    np.testing.assert_array_equal(c, expected_censoring)
def test_LearnerSCCS_fit_KFold_CV(self):
    """Cross-validated fit should not score worse than a fixed-penalty fit."""
    learner = ConvSCCS(n_lags=self.n_lags,
                       penalized_features=np.arange(self.n_features),
                       random_state=self.seed, C_tv=1e-1, C_group_l1=1e-1)
    learner.fit(self.features, self.labels, self.censoring)
    baseline_score = learner.score()
    # Same (log-scale) search range for both penalty strengths.
    tv_search_range = (-5, -1)
    group_l1_search_range = (-5, -1)
    learner.fit_kfold_cv(self.features, self.labels, self.censoring,
                         C_tv_range=tv_search_range,
                         C_group_l1_range=group_l1_search_range,
                         n_cv_iter=4)
    self.assertTrue(learner.score() <= baseline_score)
def test_LearnerSCCS_fit_KFold_CV(self):
    """Fixed-penalty vs cross-validated fit, for all three learner flavors.

    For each of ConvSCCS, BatchConvSCCS and StreamConvSCCS: re-create the
    fixtures, fit with fixed penalties, then run K-fold CV and check the
    CV score is no worse than the fixed-penalty score.
    """
    cv_scores = []

    def run_case(learner):
        # setUp() re-creates self.features/labels/censoring so each
        # learner is evaluated on fresh fixture data.
        self.setUp()
        learner.fit(self.features, self.labels, self.censoring)
        baseline_score = learner.score()
        tv_search_range = (-5, -1)
        group_l1_search_range = (-5, -1)
        learner.fit_kfold_cv(self.features, self.labels, self.censoring,
                             C_tv_range=tv_search_range,
                             C_group_l1_range=group_l1_search_range,
                             n_cv_iter=4)
        cv_scores.append(learner.score())
        self.assertTrue(cv_scores[-1] <= baseline_score)

    # Construct each learner separately so every one gets its own
    # penalized_features array (no shared mutable state between cases).
    for learner_cls in (ConvSCCS, BatchConvSCCS, StreamConvSCCS):
        run_case(
            learner_cls(n_lags=self.n_lags,
                        penalized_features=np.arange(self.n_features),
                        random_state=self.seed, C_tv=1e-1,
                        C_group_l1=1e-1))
experiment_id=experiment_id, version=version, seed=seed, sim_adjacency_matrix=dumps(adjacency_matrix), features_frequency=dumps(exposures_frequencies), ) session.merge(sim_log) session.commit() n_features = n_base_features n_lags = np.repeat(49, n_features).astype("uint64") start = time() lrn = ConvSCCS( n_lags=n_lags, penalized_features=np.arange(n_features), verbose=False ) C_tv_range = (1, 5) C_group_l1_range = (1, 5) fitted_coeffs, cv_track = lrn.fit_kfold_cv( censored_features, labels, censoring, C_tv_range=C_tv_range, C_group_l1_range=C_group_l1_range, confidence_intervals=False, ) # WARNING: no bootstrap in this simulation elapsed_time = time() - start model_id = "ConvSCCS" model_log = ConvSCCSModel(
n_agegrps = len(agegrps) - 1 feat_agegrp = np.zeros((n_intervals, n_agegrps)) for i in range(n_agegrps): feat_agegrp[agegrps[i]:agegrps[i + 1], i] = 1 feat_agegrp = csr_matrix(feat_agegrp) features = [hstack([f, feat_agegrp]).tocsr() for f in features] censored_features = [ hstack([f, feat_agegrp]).tocsr() for f in censored_features ] n_lags = np.hstack([n_lags, np.zeros(n_agegrps)]) start = time() lrn = ConvSCCS( n_lags=n_lags, penalized_features=np.arange(n_features), verbose=False ) C_tv_range = (1, 5) C_group_l1_range = (1, 5) fitted_coeffs, cv_track = lrn.fit_kfold_cv( censored_features, labels, censoring, C_tv_range=C_tv_range, C_group_l1_range=C_group_l1_range, confidence_intervals=True ) elapsed_time = time() - start model_id = "ConvSCCS" model_log = ConvSCCSModel(
# C_TV_range, C_L1_range, # confidence_intervals=True, # n_samples_bootstrap=20, n_cv_iter=50) # elapsed_time = time() - start # print("Elapsed time (model training): %.2f seconds \n" % elapsed_time) # print("Best model hyper parameters: \n") # print("C_tv : %f \n" % cv_track.best_model['C_tv']) # print("C_group_l1 : %f \n" % cv_track.best_model['C_group_l1']) # cv_track.plot_cv_report(35, 45) # plt.show() # confidence_intervals = cv_track.best_model['confidence_intervals'] # using the parameters resulting from cross-validation learner = ConvSCCS(n_lags=n_lags.astype('uint64'), penalized_features=np.arange(n_features), random_state=42, C_tv=270.2722840570933, C_group_l1=5216.472772625124) _, confidence_intervals = learner.fit(features, labels, censoring, confidence_intervals=True, n_samples_bootstrap=20) # Plot estimated parameters # get bootstrap confidence intervals refitted_coeffs = confidence_intervals['refit_coeffs'] lower_bound = confidence_intervals['lower_bound'] upper_bound = confidence_intervals['upper_bound']