def test_fit_custom_kernel(self):
    alphas = numpy.exp(numpy.linspace(numpy.log(0.001), numpy.log(0.5), 5))
    svm_grid = ParameterGrid({"alpha": alphas})

    transform = ClinicalKernelTransform(fit_once=True)
    transform.prepare(self.x)

    base_estimators = []
    for i, params in enumerate(svm_grid):
        model = FastSurvivalSVM(max_iter=100, random_state=0, **params)
        base_estimators.append(("svm_linear_%d" % i, model))

    for i, params in enumerate(svm_grid):
        model = FastKernelSurvivalSVM(kernel=transform.pairwise_kernel,
                                      max_iter=45, tol=1e-5,
                                      random_state=0, **params)
        base_estimators.append(("svm_kernel_%d" % i, model))

    cv = KFold(n_splits=3, shuffle=True, random_state=0)
    meta = EnsembleSelection(base_estimators, n_estimators=0.4,
                             scorer=score_cindex, cv=cv, n_jobs=4)
    meta.fit(self.x.values, self.y)

    self.assertEqual(len(meta), 10)
    self.assertTupleEqual(meta.scores_.shape, (10,))

    p = meta.predict(self.x.values)
    score = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
    expected_score = numpy.array([0.7978084, 59938, 15178, 33, 119])
    assert_array_almost_equal(score, expected_score)
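# For reference, a standalone version of the clinical-kernel setup this test
# exercises might look like the sketch below. Assumptions: the whas500
# dataset (suggested by the 'fstat'/'lenfol' field names above); the alpha
# value is arbitrary, not taken from the test.
from sksurv.datasets import load_whas500
from sksurv.kernels import ClinicalKernelTransform
from sksurv.svm import FastKernelSurvivalSVM

x, y = load_whas500()
transform = ClinicalKernelTransform(fit_once=True)
transform.prepare(x)

model = FastKernelSurvivalSVM(kernel=transform.pairwise_kernel,
                              alpha=0.05, tol=1e-5, random_state=0)
model.fit(x.values, y)
risk_scores = model.predict(x.values)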
def _create_regression_ensemble():
    aft_grid = ParameterGrid({"alpha": 2. ** numpy.arange(-2, 12, 2)})
    svm_grid = ParameterGrid({"alpha": 2. ** numpy.arange(-12, 0, 2)})

    base_estimators = []
    for i, params in enumerate(aft_grid):
        model = IPCRidge(max_iter=1000, **params)
        base_estimators.append(("aft_%d" % i, model))

    for i, params in enumerate(svm_grid):
        model = FastSurvivalSVM(rank_ratio=0, fit_intercept=True,
                                max_iter=100, random_state=1, **params)
        base_estimators.append(("svm_%d" % i, model))

    cv = KFold(n_splits=4, shuffle=True, random_state=0)
    meta = EnsembleSelectionRegressor(base_estimators, n_estimators=0.4,
                                      scorer=_score_rmse, cv=cv, n_jobs=1)
    return meta
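# A hedged usage sketch for the factory above. Assumptions: whas500-style
# data, and that `_score_rmse` is defined elsewhere in this module.
from sksurv.datasets import load_whas500
from sksurv.column import encode_categorical

x, y = load_whas500()
x = encode_categorical(x)

meta = _create_regression_ensemble()
meta.fit(x.values, y)
# rank_ratio=0 puts FastSurvivalSVM in pure regression mode, so the base
# models predict on the survival-time scale rather than producing risk ranks
pred = meta.predict(x.values)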
def _create_ensemble(self, **kwargs):
    boosting_grid = ParameterGrid({
        "n_estimators": [100, 250],
        "subsample": [1.0, 0.75, 0.5],
    })
    svm_grid = ParameterGrid({"alpha": 2. ** numpy.arange(-9, 5, 2)})

    base_estimators = []
    for i, params in enumerate(boosting_grid):
        model = ComponentwiseGradientBoostingSurvivalAnalysis(
            random_state=0, **params)
        base_estimators.append(("gbm_%d" % i, model))

    for i, params in enumerate(svm_grid):
        model = FastSurvivalSVM(max_iter=100, random_state=0, **params)
        base_estimators.append(("svm_%d" % i, model))

    cv = KFold(n_splits=4, shuffle=True, random_state=0)
    meta = EnsembleSelection(base_estimators, n_estimators=0.4,
                             scorer=score_cindex, cv=cv, **kwargs)
    return meta
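# `score_cindex` and `_score_rmse` are not shown in this excerpt. A plausible
# reconstruction of the concordance-based scorer (hypothetical helper,
# following the scorer(estimator, X_test, y_test) signature that
# EnsembleSelection expects) would be:
from sksurv.metrics import concordance_index_censored

def score_cindex(est, X_test, y_test, **predict_params):
    prediction = est.predict(X_test, **predict_params)
    res = concordance_index_censored(y_test['fstat'], y_test['lenfol'],
                                     prediction)
    return res[0]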
# Clip negative follow-up times to zero
for idx, item in enumerate(data_y['time_to_event']):
    if item < 0:
        data_y['time_to_event'][idx] = 0

# data_y
# df.groupby('status').count()

# Part 2: Fast Training of Support Vector Machines for Survival Analysis
from sklearn.model_selection import ShuffleSplit, GridSearchCV

from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM

## create estimator
estimator = FastSurvivalSVM(optimizer="rbtree", max_iter=1000, tol=1e-6,
                            random_state=0)

pd.DataFrame(data_y)['status'].count()

## define a function for evaluating the performance of models during grid
## search using Harrell's concordance index
def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['status'], y['time_to_event'],
                                        prediction)
    return result[0]

param_grid = {'alpha': [0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000]}
cv = ShuffleSplit(n_splits=200, test_size=0.3, random_state=0)
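# The cell stops before the search itself; a plausible continuation is
# sketched below. `data_x` is an assumed name for the feature frame paired
# with `data_y` above.
gcv = GridSearchCV(estimator, param_grid,
                   scoring=score_survival_model,
                   n_jobs=-1, refit=False, cv=cv)
gcv.fit(encode_categorical(data_x), data_y)
print(gcv.best_score_, gcv.best_params_)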
def apply_survival_regression(col_list, sub_restricted_tmp_df, regression_df,
                              folder, ith_method, additional_cols, loc,
                              tcga_y, test_res_df, extra_pred_train=None,
                              extra_pred_test=None, repeats=0):
    # `train_index`, `test_index`, `preprocessing`, and `my_ci_pvalue` come
    # from the enclosing module scope
    X = pd.get_dummies(sub_restricted_tmp_df[col_list])
    X_train = X.loc[train_index, X.columns[2:]]
    X_test = X.loc[test_index, X.columns[2:]]
    y_train, y_test = tcga_y[train_index], tcga_y[test_index]

    # drop constant columns, using the training set to decide which to keep
    X_train = X_train.loc[:, X_train.nunique() != 1]
    X_test = X_test.loc[:, X_train.columns]

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)

    XX_train = X.loc[train_index]
    XX_test = X.loc[test_index]
    XX_train = XX_train.loc[:, XX_train.nunique() != 1]
    XX_test = XX_test.loc[:, XX_train.columns]

    # survival svm
    lin_svm = FastSurvivalSVM(rank_ratio=0.8, fit_intercept=True, max_iter=200)
    lin_svm.fit(X_train_s, y_train)

    T_pred_train = lin_svm.predict(X_train_s)
    ci_train, pval_train = my_ci_pvalue(XX_train['survival_days'],
                                        T_pred_train,
                                        XX_train['binary_vital_status'],
                                        repeats)
    T_pred_test = lin_svm.predict(X_test_s)
    ci_test, pval_test = my_ci_pvalue(XX_test['survival_days'],
                                      T_pred_test,
                                      XX_test['binary_vital_status'],
                                      repeats)

    variable_base_name = [
        r.split('_{}_'.format(folder))[1] if ith_method in r else r
        for r in X_train.columns
    ]
    coef_dict = dict(zip(['{}_coef'.format(c) for c in variable_base_name],
                         lin_svm.coef_))

    t = pd.DataFrame(
        {
            'cancer_loc': loc,
            'ith_method': ith_method,
            'folder': folder,
            'additional_cols': additional_cols[0],
            'regression_method': 'linear_survival_svm',
            'train_score': ci_train,
            'pval_train': pval_train,
            'test_score': ci_test,
            'pval_test': pval_test,
            **coef_dict
        },
        index=[0])
    regression_df = pd.concat((regression_df, t), sort=True)

    t = pd.DataFrame(
        {
            'ith_method': ith_method,
            'folder': folder,
            'additional_cols': additional_cols[0],
            'train_score': ci_train,
            'pval_train': pval_train,
            'test_score': ci_test,
            'pval_test': pval_test,
            **{obs_idx: T_pred_test[obs_idx]
               for obs_idx in range(len(T_pred_test))}
        },
        index=[0])
    test_res_df = pd.concat((test_res_df, t), sort=True)

    if extra_pred_test is not None:
        for i, c in enumerate(extra_pred_test):
            combined_test = T_pred_test + extra_pred_test[i]
            combined_train = T_pred_train + extra_pred_train[i]
            ci_test, pval_test = my_ci_pvalue(XX_test['survival_days'],
                                              combined_test,
                                              XX_test['binary_vital_status'],
                                              0)
            ci_train, pval_train = my_ci_pvalue(XX_train['survival_days'],
                                                combined_train,
                                                XX_train['binary_vital_status'],
                                                0)
            t = pd.DataFrame(
                {
                    'cancer_loc': loc,
                    'ith_method': ith_method,
                    'folder': folder,
                    'additional_cols': additional_cols[i + 1],
                    'regression_method': 'linear_survival_svm',
                    'train_score': ci_train,
                    'pval_train': pval_train,
                    'test_score': ci_test,
                    'pval_test': pval_test
                },
                index=[0])
            regression_df = pd.concat((regression_df, t), sort=True)

            t = pd.DataFrame(
                {
                    'ith_method': ith_method,
                    'folder': folder,
                    'additional_cols': additional_cols[i + 1],
                    'train_score': ci_train,
                    'pval_train': pval_train,
                    'test_score': ci_test,
                    'pval_test': pval_test,
                    **{obs_idx: combined_test[obs_idx]
                       for obs_idx in range(len(combined_test))}
                },
                index=[0])
            test_res_df = pd.concat((test_res_df, t), sort=True)

    return regression_df, test_res_df, T_pred_test, T_pred_train
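# Hypothetical invocation of the function above. Every name below is a
# placeholder; the real caller must also define `train_index`, `test_index`,
# and `my_ci_pvalue` at module scope before this function runs.
regression_df = pd.DataFrame()
test_res_df = pd.DataFrame()
regression_df, test_res_df, pred_test, pred_train = apply_survival_regression(
    col_list=feature_cols,
    sub_restricted_tmp_df=patient_df,
    regression_df=regression_df,
    folder='expression',
    ith_method='math_score',
    additional_cols=['none'],
    loc='BRCA',
    tcga_y=y_structured,   # structured array of (event, time) pairs
    test_res_df=test_res_df,
    repeats=100)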
def RandomGridSearchRFC_Fixed(X, Y, splits, model, survival):
    """
    This function looks for the best set of parameters for the chosen method.

    Input:
        X: training set
        Y: labels of training set (structured survival array if survival=True)
        splits: number of cross-validation splits, used to make sure the
            parameters are stable
        model: one of 'svm', 'cart', 'rf', 'xgboost', 'lr', 'cox',
            'survSVM', 'gb'
        survival: whether to score with the concordance index

    Output:
        clf.best_params_: dictionary with the parameters,
            to use e.g.: param_svm['kernel']
    """
    start_svm = time.time()

    if model == 'svm':
        clf = svm.SVC()
        tuned_parameters = {
            'C': [0.01, 1, 10],
            'kernel': ['rbf', 'linear'],
            # 'kernel': ['linear', 'rbf', 'sigmoid'],
            # 'degree': [1, 3, 5, 10],
            # 'decision_function_shape': ['ovo', 'ovr'],
            # 'cache_size': [500, 1000, 1500, 2000],
            'shrinking': [False, True],
            # 'probability': [False, True]
        }
    elif model == 'cart':
        clf = tree.DecisionTreeClassifier()
        tuned_parameters = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [10, 20],
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }
    elif model == 'rf':
        clf = ensemble.RandomForestClassifier()
        tuned_parameters = {
            'n_estimators': [200, 500, 1000],
            # 'max_features': ['auto', 'sqrt', 'log2', 1, 4, 8],
            'max_depth': [10, 20],
            # 'criterion': ['gini', 'entropy'],
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }
    elif model == 'xgboost':
        clf = XGBClassifier()
        tuned_parameters = {
            'booster': ['gbtree'],
            'max_depth': [5, 10, 20],
            'reg_lambda': [0, 1],
            'reg_alpha': [0, 1],
            'subsample': [0.5, 1]
        }
    elif model == 'lr':
        clf = linear_model.LogisticRegression()
        tuned_parameters = {'solver': ['liblinear', 'sag', 'saga']}
    elif model == 'cox':
        clf = CoxnetSurvivalAnalysis()
        tuned_parameters = {
            'n_alphas': [50, 100, 200],
            'l1_ratio': [0.1, 0.5, 1],
        }
    elif model == 'survSVM':
        clf = FastSurvivalSVM()
        tuned_parameters = {
            'alpha': [0.5, 1],
            'rank_ratio': [0.5, 1],
            'max_iter': [20, 40, 80],
            'optimizer': ['rbtree', 'avltree'],
        }
    elif model == 'gb':
        clf = GradientBoostingSurvivalAnalysis()
        tuned_parameters = {
            'learning_rate': [0.1, 0.3],
            'n_estimators': [100, 200, 400],
            'max_depth': [3, 6, 12]
        }

    if survival:
        # stratify folds on the event indicator and score with the
        # concordance index; ROC AUC is not defined for survival targets
        scoring = make_scorer(CI, greater_is_better=True)
        y_for_cv = np.array([t[0] for t in Y])
        cv = list(StratifiedKFold(n_splits=splits).split(X, y_for_cv))
    else:
        scoring = 'roc_auc'
        cv = list(StratifiedKFold(n_splits=splits).split(X, Y))

    print(' ...performing x-validation')
    clf = GridSearchCV(clf, tuned_parameters, scoring=scoring, cv=cv,
                       verbose=10)
    clf.fit(X, Y)

    end_svm = time.time()
    print("Total time to process: ", end_svm - start_svm)

    return clf.best_params_, clf
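# Example call (sketch; `X_tr`, `X_te`, `y_tr` are placeholder names, and
# `CI` is assumed to be this module's own concordance helper):
best_params, search = RandomGridSearchRFC_Fixed(X_tr, y_tr, splits=5,
                                                model='survSVM', survival=True)
print(best_params)
risk_scores = search.predict(X_te)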
regr_best = CoxnetSurvivalAnalysis(alphas=gcv.best_params_["alphas"],
                                   l1_ratio=0.8,
                                   alpha_min_ratio=0.1,
                                   max_iter=int(3e5)).fit(X, Y)

y_regr = regr_best.predict(X_lb)
ci_lb = concordance_index_censored(Y_lb["vitalStatus"],
                                   Y_lb["overallSurvival"],
                                   y_regr)[0]
print("concordance index = %0.4f" % ci_lb)

# In[ ]:

# mask records with zero follow-up time, which the SVM's regression
# objective cannot use
zero_mask = np.array([Y[ii][1] == 0 for ii in range(len(Y))])

surv_mdl = FastSurvivalSVM(rank_ratio=0.8, fit_intercept=True,
                           optimizer="rbtree", tol=1e-4, max_iter=100,
                           random_state=0)
param_grid = {'alpha': np.logspace(-2, 2, num=100)}
cv = KFold(n_splits=5, shuffle=True, random_state=0)
grid_cv = GridSearchCV(surv_mdl, param_grid,
                       scoring=score_survival_model, n_jobs=-1, cv=cv)
grid_cv.fit(X[~zero_mask], Y[~zero_mask])

plot_gridcv_results(grid_cv, param_grid["alpha"])

surv_mdl_best = FastSurvivalSVM(alpha=grid_cv.best_params_["alpha"],
                                rank_ratio=0.8,
                                # remaining settings assumed to mirror
                                # `surv_mdl` above
                                fit_intercept=True, optimizer="rbtree",
                                tol=1e-4, max_iter=100, random_state=0)
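# A natural continuation (sketch): refit on the usable records and evaluate
# on the held-out X_lb/Y_lb from the Coxnet block above.
surv_mdl_best.fit(X[~zero_mask], Y[~zero_mask])
y_svm = surv_mdl_best.predict(X_lb)
ci_svm = concordance_index_censored(Y_lb["vitalStatus"],
                                    Y_lb["overallSurvival"],
                                    y_svm)[0]
print("concordance index = %0.4f" % ci_svm)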
print("\n") print("%.1f%% of records are censored" % (n_censored / y.shape[0] * 100)) # Dibujando plt.figure(figsize=(9, 6)) val, bins, patches = plt.hist( (y["Survival_in_days"][y["Status"]], y["Survival_in_days"][~y["Status"]]), bins=30, stacked=True) plt.legend(patches, ["Time of Death", "Time of Censoring"]) # First, we need to create an initial model with default parameters # that is subsequently used in the grid search. estimator = FastSurvivalSVM(optimizer="rbtree", max_iter=1000, tol=1e-6, random_state=0) # Creando la metrica def score_survival_model(model, X, y): prediction = model.predict(X) result = concordance_index_censored(y['Status'], y['Survival_in_days'], prediction) return result[0] param_grid = {'alpha': 2.**np.arange(-12, 13, 2)} cv = ShuffleSplit(n_splits=200, test_size=0.5, random_state=0) gcv = GridSearchCV(estimator, param_grid,