def refit_and_predict(cut_points_estimates, X_train, X_test, Y_train,
                      delta_train, Y_test, delta_test):
    """Refit an unpenalized Cox model on features binarized at the given
    estimated cut-points and evaluate it out-of-sample.

    Returns ``(c_index, marker, lp_train)``: the orientation-corrected
    concordance index on the test set, the test-set linear predictors and
    the train-set linear predictors.
    """
    # One binarizer fitted on train+test so both sets share the encoding.
    encoder = FeaturesBinarizer(method='given',
                                bins_boundaries=cut_points_estimates,
                                remove_first=True)
    encoder.fit(pd.concat([X_train, X_test]))
    bin_train = encoder.transform(X_train)
    bin_test = encoder.transform(X_test)

    # Unpenalized Cox refit; a fixed step is used, so the solver's
    # linesearch is switched off.
    cox = CoxRegression(penalty='none', tol=1e-5, solver='agd',
                        verbose=False, max_iter=100, step=0.3,
                        warm_start=True)
    cox._solver_obj.linesearch = False
    cox.fit(bin_train, Y_train, delta_train)

    beta_hat = cox.coeffs
    marker = bin_test.dot(beta_hat)
    lp_train = bin_train.dot(beta_hat)

    # The concordance index orientation is arbitrary: fold it above 1/2.
    c_index = concordance_index(Y_test, marker, delta_test)
    c_index = max(c_index, 1 - c_index)
    return c_index, marker, lp_train
def get_times2(n_simu, n_samples, n_features, n_cut_points):
    """Return the wall-clock time (seconds) of one binacox fit
    (features binarization + binarsity-penalized Cox regression) on a
    freshly simulated dataset.

    Relies on the module-level ``cov_corr`` and ``sparsity`` settings;
    the simulation is seeded with ``n_simu`` for reproducibility.
    """
    print(" n_simu=%s" % n_simu)
    simulator = SimuCoxRegWithCutPoints(
        n_samples=n_samples, n_features=n_features, seed=n_simu,
        verbose=False, n_cut_points=n_cut_points, shape=2, scale=.1,
        cov_corr=cov_corr, sparsity=sparsity)
    features, durations, indicators, _, _, _ = simulator.simulate()

    # Binacox method: time binarization + penalized fit as a whole.
    t_start = time()
    feat_binarizer = FeaturesBinarizer(n_cuts=50)
    bin_features = feat_binarizer.fit_transform(features)
    cox = CoxRegression(penalty='binarsity', tol=1e-5, solver='agd',
                        verbose=False, max_iter=100, step=0.3,
                        blocks_start=feat_binarizer.blocks_start,
                        blocks_length=feat_binarizer.blocks_length,
                        C=25, warm_start=True)
    # a fixed step is used, so the solver's linesearch is switched off
    cox._solver_obj.linesearch = False
    cox.fit(bin_features, durations, indicators)
    return time() - t_start
def test_CoxRegression_solver_step(self):
    """...Test CoxRegression setting of step parameter of solver
    """
    for solver in self.solvers:
        # the constructor must forward `step` to the wrapped solver
        learner = CoxRegression(solver=solver, step=self.float_1)
        self.assertEqual((learner.step, learner._solver_obj.step),
                         (self.float_1, self.float_1))
        # and the property setter must keep both in sync afterwards
        learner.step = self.float_2
        self.assertEqual((learner.step, learner._solver_obj.step),
                         (self.float_2, self.float_2))
def get_times1(n_simu, n_samples, n_features, n_cut_points):
    """Compare running times of binacox vs. the auto-cutoff method on one
    simulated dataset.

    Returns ``(n_samples, time_bina, time_ac_all, time_ac_grid)`` where
    ``time_bina`` is the binacox fit time, ``time_ac_all`` the auto-cutoff
    time over all admissible values and ``time_ac_grid`` the auto-cutoff
    time restricted to the binarization grid.

    Relies on the module-level ``cov_corr`` and ``sparsity`` settings.
    """
    print(" n_simu=%s" % n_simu)
    seed = n_simu
    simu = SimuCoxRegWithCutPoints(n_samples=n_samples,
                                   n_features=n_features, seed=seed,
                                   verbose=False,
                                   n_cut_points=n_cut_points,
                                   shape=2, scale=.1, cov_corr=cov_corr,
                                   sparsity=sparsity)
    X, Y, delta, cut_points, beta_star, S = simu.simulate()

    # Binacox method
    # BUG FIX: `tic` was never assigned before `time_bina = tac - tic`,
    # so the timing either raised NameError or silently used a stale
    # module-level `tic`. Start the clock here, mirroring get_times2.
    tic = time()
    n_cuts = 50
    binarizer = FeaturesBinarizer(n_cuts=n_cuts)
    X_bin = binarizer.fit_transform(X)
    blocks_start = binarizer.blocks_start
    blocks_length = binarizer.blocks_length
    boundaries = binarizer.boundaries['0']
    solver = 'agd'
    learner = CoxRegression(penalty='binarsity', tol=1e-5, solver=solver,
                            verbose=False, max_iter=100, step=0.3,
                            blocks_start=blocks_start,
                            blocks_length=blocks_length, C=25,
                            warm_start=True)
    # a fixed step is used, so the solver's linesearch is switched off
    learner._solver_obj.linesearch = False
    learner.fit(X_bin, Y, delta)
    tac = time()
    time_bina = tac - tic

    # Auto Cutoff Method over all admissible values (central 1-2*eps mass)
    X = np.array(X)
    epsilon = 10
    p1 = np.percentile(X, epsilon)
    p2 = np.percentile(X, 100 - epsilon)
    values_to_test = X[np.where((X <= p2) & (X >= p1))]
    tic = time()
    get_p_values_j(X, 0, Y, delta, values_to_test, epsilon)
    tac = time()
    time_ac_all = tac - tic

    # Auto Cutoff Method restricted to the binarization grid
    tic = time()
    p1 = np.percentile(X, epsilon)
    p2 = np.percentile(X, 100 - epsilon)
    values_to_test = boundaries[
        np.where((boundaries <= p2) & (boundaries >= p1))]
    get_p_values_j(X, 0, Y, delta, values_to_test, epsilon)
    tac = time()
    time_ac_grid = tac - tic

    return n_samples, time_bina, time_ac_all, time_ac_grid
def test_CoxRegression_penalty_elastic_net_ratio(self):
    """...Test CoxRegression setting of parameter of elastic_net_ratio
    """
    ratio_a, ratio_b = 0.6, 0.3
    for penalty in self.penalties:
        if penalty == 'elasticnet':
            # the ratio is accepted and forwarded to the prox object
            learner = CoxRegression(penalty=penalty, C=self.float_1,
                                    elastic_net_ratio=ratio_a)
            self.assertEqual(learner.C, self.float_1)
            self.assertEqual(learner.elastic_net_ratio, ratio_a)
            self.assertEqual(learner._prox_obj.strength,
                             1. / self.float_1)
            self.assertEqual(learner._prox_obj.ratio, ratio_a)
            # setting the ratio afterwards keeps C and updates the prox
            learner.elastic_net_ratio = ratio_b
            self.assertEqual(learner.C, self.float_1)
            self.assertEqual(learner.elastic_net_ratio, ratio_b)
            self.assertEqual(learner._prox_obj.ratio, ratio_b)
        else:
            # any other penalty must warn when a ratio is supplied
            msg = ('^Penalty "%s" has no elastic_net_ratio attribute$'
                   % penalty)
            extra = ({'blocks_start': [0], 'blocks_length': [1]}
                     if penalty == 'binarsity' else {})
            with self.assertWarnsRegex(RuntimeWarning, msg):
                CoxRegression(penalty=penalty, elastic_net_ratio=0.8,
                              **extra)
            learner = CoxRegression(penalty=penalty, **extra)
            with self.assertWarnsRegex(RuntimeWarning, msg):
                learner.elastic_net_ratio = ratio_a
def test_CoxRegression_settings(self):
    """...Test CoxRegression basic settings
    """
    # solver: each name must map onto its solver class...
    solver_class_map = {'gd': GD, 'agd': AGD}
    for solver in self.solvers:
        learner = CoxRegression(solver=solver)
        self.assertIsInstance(learner._solver_obj,
                              solver_class_map[solver])
    # ...and an unknown name must raise
    msg = '^``solver`` must be one of agd, gd, got wrong_name$'
    with self.assertRaisesRegex(ValueError, msg):
        CoxRegression(solver='wrong_name')

    # prox: each penalty must map onto its prox class...
    prox_class_map = {
        'none': ProxZero,
        'l1': ProxL1,
        'l2': ProxL2Sq,
        'elasticnet': ProxElasticNet,
        'tv': ProxTV,
        'binarsity': ProxBinarsity
    }
    for penalty in self.penalties:
        extra = ({'blocks_start': [0], 'blocks_length': [1]}
                 if penalty == 'binarsity' else {})
        learner = CoxRegression(penalty=penalty, **extra)
        self.assertIsInstance(learner._prox_obj, prox_class_map[penalty])
    # ...and an unknown penalty must raise
    msg = '^``penalty`` must be one of binarsity, elasticnet, l1, l2, none, ' \
          'tv, got wrong_name$'
    with self.assertRaisesRegex(ValueError, msg):
        CoxRegression(penalty='wrong_name')
def test_CoxRegression_score(self):
    """...Test CoxRegression score
    """
    # score on the training data itself
    features, times, censoring = Test.get_train_data()
    learner = CoxRegression()
    learner.fit(features, times, censoring)
    self.assertAlmostEqual(learner.score(), 3.856303803547875)

    # score on fresh data from another seed
    features, times, censoring = Test.get_train_data(seed=123)
    self.assertAlmostEqual(
        learner.score(features, times, censoring), 5.556509086276002)

    # scoring before fitting must raise
    msg = '^You must fit the model first$'
    learner = CoxRegression()
    with self.assertRaisesRegex(RuntimeError, msg):
        learner.score()

    # each argument set to None must raise a dedicated error
    msg = '^Passed ``features`` is None$'
    learner = CoxRegression().fit(features, times, censoring)
    with self.assertRaisesRegex(ValueError, msg):
        learner.score(None, times, censoring)

    msg = '^Passed ``times`` is None$'
    learner = CoxRegression().fit(features, times, censoring)
    with self.assertRaisesRegex(ValueError, msg):
        # BUG FIX: the first argument was mistakenly `times`; pass
        # `features` so that only ``times`` is None, consistent with the
        # sibling checks (the tested error is the same either way).
        learner.score(features, None, censoring)

    msg = '^Passed ``censoring`` is None$'
    learner = CoxRegression().fit(features, times, censoring)
    with self.assertRaisesRegex(ValueError, msg):
        learner.score(features, times, None)
def test_CoxRegression_fit(self):
    """...Test CoxRegression fit with different solvers and penalties
    """
    raw_features, times, censoring = Test.get_train_data()
    # reference coefficients, one entry per penalty
    expected_coeffs = {
        'none': np.array([
            -0.03068462, 0.03940001, 0.16758354, -0.24838003, 0.16940664,
            0.9650363, -0.14818724, -0.0802245, -1.52869811, 0.0414509
        ]),
        'l2': np.array([
            -0.02403681, 0.03455527, 0.13470436, -0.21654892, 0.16637723,
            0.83125941, -0.08555382, -0.12914753, -1.35294435, 0.02312935
        ]),
        'l1': np.array([
            0., 1.48439371e-02, 1.03806171e-01, -1.57313537e-01,
            1.40448847e-01, 8.05306416e-01, -5.41296030e-02,
            -1.07753576e-01, -1.37612207e+00, 6.43289248e-05
        ]),
        'elasticnet': np.array([
            0., 0.01011823, 0.10530518, -0.16885214, 0.14373715,
            0.82547312, -0.06122141, -0.09479487, -1.39454662, 0.00312597
        ]),
        'tv': np.array([
            0.03017556, 0.03714465, 0.0385349, -0.10169967, 0.15783755,
            0.64860815, -0.00617636, -0.22235137, -1.07938977, -0.07181225
        ]),
        'binarsity': np.array([
            0.03794176, -0.04473702, 0.00339763, 0.00339763, -0.16493989,
            0.05497996, 0.05497996, 0.05497996, -0.08457476, -0.08457476,
            0.0294825, 0.13966702, 0.10251257, 0.02550264, -0.07207419,
            -0.05594102, -0.10018038, -0.10018038, 0.10018038, 0.10018038,
            -0.47859686, -0.06685181, -0.00850803, 0.55395669, 0.00556327,
            -0.00185442, -0.00185442, -0.00185442, 0.26010429, 0.09752455,
            -0.17881442, -0.17881442, 0.932516, 0.32095387, -0.49766315,
            -0.75580671, 0.0593833, -0.01433773, 0.01077109, -0.05581666
        ])
    }
    for penalty in self.penalties:
        if penalty == 'binarsity':
            # binarize features (3 cuts per feature) for binarsity only
            binarizer = FeaturesBinarizer(n_cuts=3)
            features = binarizer.fit_transform(raw_features)
        else:
            features = raw_features
        for solver in self.solvers:
            kwargs = {
                'penalty': penalty,
                'tol': 0,
                'solver': solver,
                'verbose': False,
                'max_iter': 10
            }
            if penalty != 'none':
                kwargs['C'] = 50
            if penalty == 'binarsity':
                kwargs['blocks_start'] = binarizer.blocks_start
                kwargs['blocks_length'] = binarizer.blocks_length
            learner = CoxRegression(**kwargs)
            learner.fit(features, times, censoring)
            # loose tolerance: only 10 iterations were run
            np.testing.assert_array_almost_equal(
                expected_coeffs[penalty], learner.coeffs, decimal=1)
def test_CoxRegression_solver_basic_settings(self):
    """...Test CoxRegression setting of basic parameters of solver
    """
    # each (attribute, constructor value, new value) triple is checked
    # the same way: the constructor and the setter must both propagate
    # the value to the wrapped solver object
    checks = (('tol', self.float_1, self.float_2),
              ('max_iter', self.int_1, self.int_2),
              ('verbose', True, False),
              ('verbose', False, True),
              ('print_every', self.int_1, self.int_2),
              ('record_every', self.int_1, self.int_2))
    for solver in self.solvers:
        for name, initial, updated in checks:
            learner = CoxRegression(solver=solver, **{name: initial})
            self.assertEqual(getattr(learner, name), initial)
            self.assertEqual(getattr(learner._solver_obj, name), initial)
            setattr(learner, name, updated)
            self.assertEqual(getattr(learner, name), updated)
            self.assertEqual(getattr(learner._solver_obj, name), updated)
def test_CoxRegression_penalty_C(self):
    """...Test CoxRegression setting of parameter of C
    """
    for penalty in self.penalties:
        extra = ({'blocks_start': [0], 'blocks_length': [1]}
                 if penalty == 'binarsity' else {})
        if penalty != 'none':
            # C is accepted and the prox strength is its inverse
            learner = CoxRegression(penalty=penalty, C=self.float_1,
                                    **extra)
            self.assertEqual(learner.C, self.float_1)
            self.assertEqual(learner._prox_obj.strength,
                             1. / self.float_1)
            learner.C = self.float_2
            self.assertEqual(learner.C, self.float_2)
            self.assertEqual(learner._prox_obj.strength,
                             1. / self.float_2)
            # a non-positive C must be rejected at construction
            msg = '^``C`` must be positive, got -1$'
            with self.assertRaisesRegex(ValueError, msg):
                CoxRegression(penalty=penalty, C=-1, **extra)
        else:
            # penalty 'none' has no C: warn on construction and on set
            msg = '^You cannot set C for penalty "%s"$' % penalty
            with self.assertWarnsRegex(RuntimeWarning, msg):
                CoxRegression(penalty=penalty, C=self.float_1)
            learner = CoxRegression(penalty=penalty)
            with self.assertWarnsRegex(RuntimeWarning, msg):
                learner.C = self.float_1
            # even then, a non-positive C is rejected first
            msg = '^``C`` must be positive, got -2$'
            with self.assertRaisesRegex(ValueError, msg):
                learner.C = -2
def test_CoxRegression_warm_start(self):
    """...Test CoxRegression warm start
    """
    features, times, censoring = Test.get_train_data()
    for warm_start in (True, False):
        for solver in self.solvers:
            learner = CoxRegression(solver=solver, max_iter=2,
                                    warm_start=warm_start, tol=0,
                                    penalty='none')
            learner.fit(features, times, censoring)
            score_first = learner.score()
            learner.fit(features, times, censoring)
            score_second = learner.score()
            if warm_start:
                # warm start resumes from the previous coefficients, so
                # the unpenalized objective must keep decreasing
                self.assertLess(score_second, score_first)
            else:
                # cold start: both fits are identical
                self.assertAlmostEqual(score_second, score_first)
def fit_and_score(features, features_bin, times, censoring, blocks_start,
                  blocks_length, boundaries, features_names, idx_train,
                  idx_test, validation_data, C):
    """Fit binacox at strength ``C`` on the training fold, extract the
    estimated cut-points, refit on the re-binarized features and score.

    Returns ``(score, score_validation)``: the score on the test fold and,
    when ``validation_data`` is given as ``(X, Y, delta)``, the score on
    that validation set (otherwise None).
    """
    if features_names is None:
        features_names = [str(j) for j in range(features.shape[1])]

    # Split all arrays on the fold indices.
    X_train, X_test = features_bin[idx_train], features_bin[idx_test]
    Y_train, Y_test = times[idx_train], times[idx_test]
    delta_train, delta_test = censoring[idx_train], censoring[idx_test]

    # First pass: binarsity-penalized fit used to detect cut-points.
    detector = CoxRegression(penalty='binarsity', tol=1e-5, verbose=False,
                             max_iter=100, step=0.3,
                             blocks_start=blocks_start,
                             blocks_length=blocks_length, warm_start=True)
    detector._solver_obj.linesearch = False  # fixed step is used
    detector.C = C
    detector.fit(X_train, Y_train, delta_train)
    beta = detector.coeffs

    # Jumps between within-block coefficient groups give the cut-points.
    cut_points_estimates = {}
    for j, start in enumerate(blocks_start):
        beta_j = beta[start:start + blocks_length[j]]
        if not np.any(beta_j):
            # whole block shrunk to zero: no cut-point detected
            estimate_j = np.array([-np.inf, np.inf])
        else:
            groups_j = get_groups(beta_j)
            jumps_j = np.where(groups_j[1:] - groups_j[:-1] != 0)[0] + 1
            if jumps_j.size == 0:
                estimate_j = np.array([-np.inf, np.inf])
            else:
                estimate_j = boundaries[features_names[j]][jumps_j]
                # pad with infinities so the bins cover the whole line
                if estimate_j[0] != -np.inf:
                    estimate_j = np.insert(estimate_j, 0, -np.inf)
                if estimate_j[-1] != np.inf:
                    estimate_j = np.append(estimate_j, np.inf)
        cut_points_estimates[features_names[j]] = estimate_j

    # Second pass: re-binarize on the estimated cut-points and refit with
    # a negligible penalty (C=1e10).
    binarizer = FeaturesBinarizer(method='given',
                                  bins_boundaries=cut_points_estimates)
    binarized_features = binarizer.fit_transform(features)
    blocks_start = binarizer.blocks_start
    blocks_length = binarizer.blocks_length
    X_bin_train = binarized_features[idx_train]
    X_bin_test = binarized_features[idx_test]
    refit = CoxRegression(penalty='binarsity', tol=1e-5, verbose=False,
                          max_iter=100, step=0.3,
                          blocks_start=blocks_start,
                          blocks_length=blocks_length, warm_start=True,
                          C=1e10)
    refit._solver_obj.linesearch = False
    refit.fit(X_bin_train, Y_train, delta_train)
    score = refit.score(X_bin_test, Y_test, delta_test)

    if validation_data is not None:
        X_validation = validation_data[0]
        # NOTE(review): fit_transform re-fits the binarizer on the
        # validation features; with method='given' the boundaries are
        # fixed, so this should coincide with a plain transform — confirm.
        X_bin_validation = binarizer.fit_transform(X_validation)
        Y_validation = validation_data[1]
        delta_validation = validation_data[2]
        score_validation = refit.score(X_bin_validation, Y_validation,
                                       delta_validation)
    else:
        score_validation = None
    return score, score_validation
# Single train/test split, then univariate Cox screening: each feature is
# scored by its (orientation-corrected) concordance index on the training
# fold.
rs = ShuffleSplit(n_splits=1, test_size=test_size)
for train_index, test_index in rs.split(X):
    # split covariates, times and censoring indicators
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    Y_train, Y_test = Y[train_index], Y[test_index]
    delta_train, delta_test = delta[train_index], delta[test_index]

    # 2) screening cox, top-P features
    n_features = X_train.shape[1]
    screening_cox = pd.Series(index=X_train.columns)
    learner = CoxRegression(tol=1e-5, solver='agd', verbose=False,
                            penalty='none', max_iter=100)
    for j, feat_name in enumerate(X_train.columns):
        # progress indicator on a single console line
        stdout.write("\rscreening: %d/%s" % (j + 1, n_features))
        stdout.flush()
        X_j = X_train[[feat_name]]
        learner.fit(X_j, Y_train, delta_train)
        coeffs = learner.coeffs
        marker = X_j.dot(coeffs)
        c_index = concordance_index(Y_train, marker, delta_train)
        # concordance orientation is arbitrary: keep the side above 1/2
        c_index = max(c_index, 1 - c_index)
        screening_cox[feat_name] = c_index
def fit_Cox(features, T, E, verbose=True):
    """Fit a Cox regression model and return the fitted learner.

    Parameters
    ----------
    features : covariate matrix passed to ``CoxRegression.fit``.
    T : event/censoring times.
    E : event indicators.
    verbose : bool, default=True
        Forwarded to ``CoxRegression``; default keeps the previous
        hard-coded behavior.

    Returns
    -------
    The fitted ``CoxRegression`` instance. (The original version discarded
    the fitted model and returned None; callers ignoring the return value
    are unaffected.)
    """
    cox_m = CoxRegression(verbose=verbose)
    cox_m.fit(features, T, E)
    return cox_m
# binarize data n_cuts = 50 binarizer = FeaturesBinarizer(n_cuts=n_cuts) X_bin = binarizer.fit_transform(X) blocks_start = binarizer.blocks_start blocks_length = binarizer.blocks_length boundaries = binarizer.boundaries tic = time() solver = 'agd' learner = CoxRegression(penalty='binarsity', tol=1e-5, solver=solver, verbose=False, max_iter=100, step=0.3, blocks_start=blocks_start, blocks_length=blocks_length, warm_start=True) learner._solver_obj.linesearch = False # cross-validation n_folds = 10 grid_size = 30 grid_C = np.logspace(0, 3, grid_size) scores_cv = pd.DataFrame(columns=['ll_test', 'test_std']) for i, C in enumerate(grid_C): stdout.write("\rbinacox n_samples: %s/%s, " "n_simu: %s/%s, " "CV: %d%%" %