def fit_and_prepare(x_train, y_train, test_df):
    """Prepare survival targets, fit a Cox PH model, and encode the test set.

    Returns a tuple ``(estimator, x_test, x_train, y_train)``.
    NOTE: category-dtype casts are applied to x_train/test_df in place,
    matching the original behavior.
    """
    # -- Prepare y: event indicator must be boolean, then convert the frame
    #    into the structured record array scikit-survival expects.
    y_train.specific_death = y_train.specific_death.astype(bool)
    y_train = y_train.to_records(index=False)

    # -- Prepare X: every object-dtype column is treated as categorical in
    #    both the training and the test frame.
    object_columns = x_train.columns[x_train.dtypes == object].tolist()
    for name in object_columns:
        x_train[name] = x_train[name].astype('category')
        test_df[name] = test_df[name].astype('category')

    # -- Fit: one-hot encode the categoricals and train the Cox model.
    encoder = OneHotEncoder()
    estimator = CoxPHSurvivalAnalysis()
    estimator.fit(encoder.fit_transform(x_train), y_train)

    # Encode the test variables with the encoder fitted on the training data.
    x_test = encoder.transform(test_df)
    return (estimator, x_test, x_train, y_train)
def fit_and_score_features(X, y):
    """Fit one univariate Cox model per feature and return its c-index score."""
    model = CoxPHSurvivalAnalysis()
    scores = np.empty(X.shape[1])
    for idx in range(X.shape[1]):
        # Keep the column 2-D: (n_samples, 1) as required by fit().
        column = X[:, idx:idx + 1]
        model.fit(column, y)
        scores[idx] = model.score(column, y)
    return scores
def test_brier_coxph():
    """Regression test: Brier score of a Cox PH fit on the GBSG2 dataset."""
    X, y = load_gbsg2()
    # Encode the ordinal tumour grade as the integer length of its label.
    X.loc[:, "tgrade"] = X.loc[:, "tgrade"].map(len).astype(int)
    Xt = OneHotEncoder().fit_transform(X)
    model = CoxPHSurvivalAnalysis(ties="efron").fit(Xt, y)
    # Predicted survival probability per sample at t = 1825 days (5 years).
    preds = [fn(1825) for fn in model.predict_survival_function(Xt)]
    _, score = brier_score(y, y, preds, 1825)
    assert round(abs(score[0] - 0.208817407492645), 5) == 0
def __init__(self):
    """Assemble the gene-filter -> scaler -> Cox PH regression pipeline."""
    self.regr = Pipeline([
        ('genes_filter', GenesFilter()),
        ('scaler', StandardScaler()),
        ('regressor', CoxPHSurvivalAnalysis()),
    ])
def mpss_ph_sksurv(self): """ Performs proportional hazards regression using sksurv package. :return: Feature importance """ # Reformat for sksurv package x_train = pd.DataFrame(self.x_train) y_structured = [ (ll, s) for ll, s in zip(self.y_train.astype(bool), self.scores) ] y_structured = np.array(y_structured, dtype=[('class', 'bool_'), ('score', 'single')]) # Remove any feature columns that are all 0 values, otherwise cannot run regression x_train_nonzero = x_train.loc[:, (x_train != 0).any(axis=0)] # Run proportional hazards regression estimator = CoxPHSurvivalAnalysis(alpha=0.1, verbose=1) estimator.fit(x_train_nonzero, y_structured) prediction = estimator.predict(x_train_nonzero) # Estimate p-values for each feature f, pvals = f_regression(x_train_nonzero, [x[1] for x in y_structured]) approximate_se = pd.DataFrame(pd.Series( pvals, index=x_train_nonzero.columns).sort_values(ascending=False), columns=['p']).reset_index() # Calculate concordance indicating the goodness of fit concordance = concordance_index_censored(self.y_train.astype(bool), self.scores, prediction) print('concordance', concordance[0]) # Dataframe with coefficients, absolute value of coefficients, and p-values importance = pd.DataFrame(estimator.coef_, columns=['coef']) importance['coef_abs'] = [math.fabs(c) for c in importance['coef']] importance['feature'] = importance.index.values importance = importance.merge(approximate_se, left_on='feature', right_on='index').drop('index', axis=1) # Sort feature importance importance = importance.sort_values( 'coef_abs', ascending=False).reset_index(drop=True) return importance
def cox(name):
    """Fit a Cox PH model on the named dataset and return its c-index."""
    path = os.path.join(DATA_DIR, filename_dict[name])
    raw_data = pd.read_csv(path)
    formatted_x, formatted_y = sksurv_data_formatting(raw_data)
    x_train, x_test, y_train, y_test = train_test_split(
        formatted_x, formatted_y, test_size=0.25, random_state=RANDOM_STATE)
    model = CoxPHSurvivalAnalysis()
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    # concordance_index_censored returns (c-index, concordant, discordant,
    # tied_risk, tied_time); only the c-index itself is reported.
    result = concordance_index_censored(
        y_test["Status"], y_test["Survival_in_days"], prediction)
    return result[0]
def train_coxph(data_df, r_splits):
    """Train and evaluate a Cox PH model over the provided resampling splits.

    For each split: fit on the training fold, record the concordance index on
    the full and 30-day test sets, and the time-dependent AUC at 30/60/365 days.

    Returns:
        (c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365),
        each a list with one entry per split.
    """
    c_index_at = []
    c_index_30 = []
    time_auc_30 = []
    time_auc_60 = []
    time_auc_365 = []
    # Map horizon (days) -> accumulator list. Replaces the original
    # eval("time_auc_" + str(time_x)) lookup, which was fragile and unsafe.
    auc_lists = {30: time_auc_30, 60: time_auc_60, 365: time_auc_365}
    for i in range(len(r_splits)):
        print("\nIteration %s" % (i))
        # DATA PREP
        df_train, df_val, df_test, df_test_30 = prepare_datasets(
            data_df, r_splits[i][2], r_splits[i][1], r_splits[i][0])
        (data_x, data_y), (val_x, val_y), (test_x, test_y), (test_30_x, test_30_y) = df2array(
            data_df, df_train, df_val, df_test, df_test_30)
        # Small ridge penalty keeps the Newton solver numerically stable.
        estimator = CoxPHSurvivalAnalysis(alpha=1e-04)
        estimator.fit(data_x, data_y)
        c_index_at.append(estimator.score(test_x, test_y))
        c_index_30.append(estimator.score(test_30_x, test_30_y))
        for time_x in [30, 60, 365]:
            t_auc, t_mean_auc = cumulative_dynamic_auc(
                data_y, test_y, estimator.predict(test_x), time_x)
            auc_lists[time_x].append(t_auc[0])
        print("C-index_30:", c_index_30[i])
        print("C-index_AT:", c_index_at[i])
        print("time_auc_30", time_auc_30[i])
        print("time_auc_60", time_auc_60[i])
        print("time_auc_365", time_auc_365[i])
    return c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365
def test_predict_proba(self):
    """predict_proba must be absent when probabilities=False."""
    base_estimators = [('coxph', CoxPHSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM(random_state=0))]
    meta = Stacking(_PredictDummy(), base_estimators, probabilities=False)
    self.assertRaisesRegex(
        AttributeError,
        "'_PredictDummy' object has no attribute 'predict_proba'",
        getattr, meta, "predict_proba")
def test_predict_proba():
    """predict_proba must be absent when probabilities=False (pytest style)."""
    base_estimators = [('coxph', CoxPHSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM(random_state=0))]
    meta = Stacking(_PredictDummy(), base_estimators, probabilities=False)
    with pytest.raises(AttributeError,
                       match="'_PredictDummy' object has no attribute 'predict_proba'"):
        getattr(meta, "predict_proba")
def test_fit(self):
    """Fitting trains both base estimators; per-estimator predictions stack."""
    base_estimators = [('coxph', CoxPHSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM(random_state=0))]
    meta = Stacking(MeanEstimator(), base_estimators, probabilities=False)
    self.assertEqual(2, len(meta))
    meta.fit(self.x.values, self.y)
    p = meta._predict_estimators(self.x.values)
    # One prediction column per base estimator.
    self.assertTupleEqual((self.x.shape[0], 2), p.shape)
def test_set_params(self):
    """set_params must route double-underscore params to base estimators."""
    base_estimators = [('coxph', CoxPHSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM(random_state=0))]
    meta = Stacking(_PredictDummy(), base_estimators, probabilities=False)
    meta.set_params(coxph__alpha=1.0, svm__alpha=0.4132)
    params = meta.get_params()
    self.assertEqual(1.0, params["coxph__alpha"])
    self.assertEqual(0.4132, params["svm__alpha"])
def test_score(make_whas500):
    """Stacked model's c-index on WHAS500 matches the reference value."""
    whas500 = make_whas500(with_mean=False, with_std=False, to_numeric=True)
    base_estimators = [('coxph', CoxPHSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM(random_state=0))]
    meta = Stacking(MeanEstimator(), base_estimators, probabilities=False)
    meta.fit(whas500.x, whas500.y)
    c_index = meta.score(whas500.x, whas500.y)
    assert round(abs(c_index - 0.7848807), 5) == 0
def test_fit(make_whas500):
    """Fitting on WHAS500 trains both base estimators (pytest style)."""
    whas500 = make_whas500(with_mean=False, with_std=False, to_numeric=True)
    base_estimators = [('coxph', CoxPHSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM(random_state=0))]
    meta = Stacking(MeanEstimator(), base_estimators, probabilities=False)
    assert len(meta) == 2
    meta.fit(whas500.x, whas500.y)
    p = meta._predict_estimators(whas500.x)
    # One prediction column per base estimator.
    assert p.shape == (whas500.x.shape[0], 2)
def test_predict(self):
    """Stacked predictions reach the expected concordance statistics."""
    base_estimators = [('coxph', CoxPHSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM(random_state=0))]
    meta = Stacking(MeanEstimator(), base_estimators, probabilities=False)
    meta.fit(self.x.values, self.y)
    # result is different if randomForestSRC has not been compiled with OpenMP support
    p = meta.predict(self.x.values)
    actual_cindex = concordance_index_censored(self.y['fstat'],
                                               self.y['lenfol'], p)
    expected_cindex = numpy.array([0.7848807, 58983, 16166, 0, 119])
    assert_array_almost_equal(expected_cindex, actual_cindex)
def test_predict(make_whas500):
    """Stacked predictions on WHAS500 match the reference concordance."""
    whas500 = make_whas500(with_mean=False, with_std=False, to_numeric=True)
    base_estimators = [('coxph', CoxPHSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM(random_state=0))]
    meta = Stacking(MeanEstimator(), base_estimators, probabilities=False)
    meta.fit(whas500.x, whas500.y)
    # result is different if randomForestSRC has not been compiled with OpenMP support
    p = meta.predict(whas500.x)
    assert_cindex_almost_equal(whas500.y['fstat'], whas500.y['lenfol'], p,
                               (0.7848807, 58983, 16166, 0, 14))
def __init__(self):
    """Configure the wrapper around sksurv's Cox proportional hazards model."""
    super(CoxPH, self).__init__()  # super().__init__()
    self.name = 'CoxPH'
    # Small ridge penalty for numerical stability (otherwise error occured,
    # per the original author).
    self.model = CoxPHSurvivalAnalysis(alpha=0.01)
    self.direction = 1
    self.prob_FLAG = True
    self.explained = "*Cox proportional model"
    self.image_name = "Cox.png"
    self.image_size = (500, 500)
def __init__(self, alpha=10.0):
    """Cox PH wrapper with a ridge-regression penalty of strength ``alpha``."""
    super(CoxPHRidge, self).__init__()  # super().__init__()
    self.alpha = alpha
    self.name = 'CoxPHRidge'
    # ridge regression penalty
    self.model = CoxPHSurvivalAnalysis(alpha=self.alpha)
    self.direction = 1
    self.prob_FLAG = True
    self.explained = "*Cox proportional model with ridge regression"
    self.image_name = "CoxPHRidge.png"
    self.image_size = (500, 500)
def traditional_surv_analysis(datas, opts):
    """Fit classical survival estimators; return (train_scores, test_scores).

    Scores are concordance indexes keyed by estimator name.
    """
    # Flatten datasets to plain ndarrays; fold the validation split (if any)
    # into the training pool.
    train_X, train_Y = datas["train"].xs.numpy(), datas["train"].ys.numpy()
    test_X, test_Y = datas["test"].xs.numpy(), datas["test"].ys.numpy()
    if "val" in datas.keys():
        train_X = np.concatenate([train_X, datas["val"].xs])
        train_Y = np.concatenate([train_Y, datas["val"].ys])
    # Structured outcome arrays: column 1 = event flag, column 0 = time.
    train_Y = Surv.from_arrays(train_Y[:, 1].astype("bool"), train_Y[:, 0])
    test_Y = Surv.from_arrays(test_Y[:, 1].astype("bool"), test_Y[:, 0])
    # Estimator zoo to compare.
    estimators = {
        "CoxPH": CoxPHSurvivalAnalysis(),
        "CGBSA": CGBSA(n_estimators=500, random_state=opts.random_seed),
        "GBSA": GBSA(n_estimators=500, random_state=opts.random_seed),
        "FKSVM": FKSVM(random_state=opts.random_seed),
        "FSVM": FSVM(random_state=opts.random_seed),
    }
    # Training pass.
    for name, estimator in estimators.items():
        print("%s training." % name)
        estimator.fit(train_X, train_Y)
    # Evaluation pass.
    train_scores = {}
    test_scores = {}
    for name, estimator in estimators.items():
        print("%s evaluation." % name)
        train_scores[name] = estimator.score(train_X, train_Y)
        test_scores[name] = estimator.score(test_X, test_Y)
    return train_scores, test_scores
# Report the fitted linear model's parameters and train/test scores.
print("Linear.coef_: {}".format(Linear.coef_))
print("Linear.intercept_ : {}".format(Linear.intercept_))
print("훈련 세트의 정확도 : {:.2f}".format(Linear.score(X_train, y_train)))
print("테스트 세트의 정확도 : {:.2f}".format(Linear.score(X_test, y_test)))
# %%
from sklearn.linear_model import Lasso
# NOTE(review): this rebinds the name `Lasso` from the class to a fitted
# instance, so re-running this cell would fail; `normalize=` is also removed
# in recent scikit-learn -- verify the pinned version.
Lasso = Lasso(alpha=0.000000000005, normalize=True).fit(X_train, y_train)
print("Lasso.coef_: {}".format(Lasso.coef_))
print("Lasso.intercept_ : {}".format(Lasso.intercept_))
print("훈련 세트의 정확도 : {:.2f}".format(Lasso.score(X_train, y_train)))
print("테스트 세트의 정확도 : {:.2f}".format(Lasso.score(X_test, y_test)))
# %%
# NOTE(review): this fit call looks broken -- np.array(...) receives the
# DataFrame as its dtype argument, and fit() gets a single positional
# argument instead of (X, y); sksurv also expects a structured (event, time)
# array for y, and CoxPHSurvivalAnalysis exposes no intercept_ -- confirm.
Cox = CoxPHSurvivalAnalysis().fit(
    np.array(list(map(int, y_train)), df_LR[['w_SS']]))
print("Cox.coef_: {}".format(Cox.coef_))
print("Cox.intercept_ : {}".format(Cox.intercept_))
print("훈련 세트의 정확도 : {:.2f}".format(Cox.score(X_train, y_train)))
print("테스트 세트의 정확도 : {:.2f}".format(Cox.score(X_test, y_test)))
# %%
# Candidate normalized population / traffic / vehicle-registration features.
X = df_LR[[
    "정규화_인구", "정규화_교통량_07", "정규화_교통량_15", "정규화_혼잡빈도강도합",
    "정규화_혼잡시간강도합", "정규화_자동차등록", "정규화_전기자동차등록"
]]
# %%
X = X.astype(float)
# NOTE(review): y is a plain int array here; CoxPHSurvivalAnalysis.fit
# requires a structured array with event indicator and time -- confirm intent.
Cox = CoxPHSurvivalAnalysis().fit(X, np.array(list(map(int, y_train))))
# %%
# Commented-out feature-engineering experiments retained from the notebook.
#data_x_1['date_weekday'] = data_x_1['date'].dt.weekday
#data_x_1['date_hour'] = data_x_1['date'].dt.hour
#data_x_1['date_minute'] = data_x_1['date'].dt.minute
#data_x_1['date_second'] = data_x_1['date'].dt.second
#data_x_1.drop(columns=['date','created', 'install_diff','device_brand','install_seconds','user_agent'], inplace=True)
#data_x_1_numeric = pd.get_dummies(data_x_1, dummy_na=True, prefix_sep='=')
#%%
# Structured outcome array (event flag + time in seconds) for the first 100
# rows. FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24 --
# use `np.bool_` (the scalar type) instead.
data_y_1 = np.fromiter(zip(
    data_full_1.head(100)["status_censored"],
    data_full_1.head(100)["in_seconds"]),
    dtype=[('status_censored', np.bool_), ('in_seconds', np.float64)])
#%%
# Fit a ridge-penalised Cox PH model and report its training c-index.
estimator = CoxPHSurvivalAnalysis(alpha=0.1)
estimator.fit(data_x_1_numeric.head(100), data_y_1)
estimator.score(data_x_1_numeric.head(100), data_y_1)
#%%


def fit_and_score_features(X, y, alpha=0.1):
    """Score each feature with its own univariate Cox model (c-index).

    :param X: 2-D array of features, shape (n_samples, n_features).
    :param y: structured survival outcome array.
    :param alpha: ridge penalty passed to CoxPHSurvivalAnalysis.
    :return: 1-D array of per-feature concordance indexes.
    """
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis(alpha=alpha)
    for j in range(n_features):
        # Keep the column 2-D: (n_samples, 1) as required by fit().
        Xj = X[:, j:j + 1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores
# %% params = { 'regressor__model__gamma': (1e-6, 1e+6, 'log-uniform'), 'regressor__model__learning_rate': (1e-6, 1e+1, 'log-uniform'), 'regressor__model__max_depth': (1, 8), # integer valued parameter 'regressor__model__min_child_weight': (10, 500, 'log-uniform'), # categorical parameter 'regressor__model__n_estimators': (1, 8), # integer valued parameter 'regressor__model__reg_alpha': (1, 8, 'log-uniform'), # integer valued parameter 'regressor__model__reg_lambda': (1, 8, 'log-uniform'), # integer valued parameter 'regressor__model__subsample': (1, 8, 'log-uniform'), # integer valued parameter } #%% # Since sksurv output log hazard ratios (here relative to 0 on predictors) # we must use 'output_margin=True' for comparability. estimator = CoxPHSurvivalAnalysis().fit(data_x, data_y) gbm = xgb.XGBRegressor(objective='survival:cox', booster='gblinear', base_score=1, n_estimators=1000) search = BayesSearchCV(gbm, params, n_iter=3, cv=3) search.fit(data_x, data_y_xgb) #%% prediction_sksurv = estimator.predict(data_x) predictions_xgb = search.predict(data_x) d = pd.DataFrame({'xgb': predictions_xgb, 'sksurv': prediction_sksurv}) d.head()
def plot_cumulative_dynamic_auc(risk_score, label, color=None):
    # Plot the time-dependent AUC curve for one risk score, evaluated at the
    # module-level `times` grid against the global y_train / y_test splits.
    auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)
    plt.plot(times, auc, marker="o", color=color, label=label)
    plt.xlabel("days from enrollment")
    plt.ylabel("time-dependent AUC")
    # Dashed horizontal line marks the mean AUC over the time grid.
    plt.axhline(mean_auc, color=color, linestyle="--")
    plt.legend()


# Use each numeric column's raw values directly as a risk score.
for i, col in enumerate(num_columns):
    plot_cumulative_dynamic_auc(x_test[:, i], col, color="C{}".format(i))
    ret = concordance_index_ipcw(y_train, y_test, x_test[:, i], tau=times[-1])

from sksurv.datasets import load_veterans_lung_cancer

# Second example: Cox PH pipeline on the veterans lung-cancer dataset.
va_x, va_y = load_veterans_lung_cancer()
cph = make_pipeline(OneHotEncoder(), CoxPHSurvivalAnalysis())
cph.fit(va_x, va_y)
# Weekly evaluation grid between day 7 and day 183.
va_times = np.arange(7, 183, 7)
# estimate performance on training data, thus use `va_y` twice.
va_auc, va_mean_auc = cumulative_dynamic_auc(va_y, va_y, cph.predict(va_x),
                                             va_times)
plt.plot(va_times, va_auc, marker="o")
plt.axhline(va_mean_auc, linestyle="--")
plt.xlabel("days from enrollment")
plt.ylabel("time-dependent AUC")
plt.grid(True)
# Merge the train and test pools, then restrict to the selected column window.
trainData = np.concatenate([trainData, testData], 0)
trainData = trainData[:, lowerbound:upperbound]
# First two columns are the outcome (day, 1-year label); the rest are features.
data_y = trainData[:, :2]
data_x = trainData[:, 2:]
x, y = data_x.shape
# Tiny random jitter keeps the design matrix non-singular for the Cox fit.
data_x += 0.001 * np.random.random((x, y))
gf_day = list(trainData[:, 0])
gf_1year_label = list(trainData[:, 1])
gf_1year_label = list(map(lambda x: x == 1, gf_1year_label))
# Structured (event, time) array as required by scikit-survival.
dt = np.dtype('bool,float')
data_y = [(gf_1year_label[i], gf_day[i]) for i in range(len(gf_1year_label))]
data_y = np.array(data_y, dtype=dt)
t1 = time()
estimator = CoxPHSurvivalAnalysis()
estimator.fit(data_x[:train_num], data_y[:train_num])
print('fitting estimate cost {} seconds'.format(int(time() - t1)))
print(estimator.score(data_x[train_num:], data_y[train_num:]))
# NOTE(review): the triple-quoted block below disables an older example; it is
# left exactly as found and continues past this chunk.
'''
data_x, data_y = load_veterans_lung_cancer()
#pd.DataFrame.from_records(data_y[[11, 5, 32, 13, 23]], index=range(1, 6))
time, survival_prob = kaplan_meier_estimator(data_y["Status"], data_y["Survival_in_days"])
plt.step(time, survival_prob, where="post")
plt.ylabel("est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
print(data_x["Treatment"].value_counts())
T = T * 7 # weeks --> days # DataFrame --> structured array dataframe = {'censor': C, 'time': T} # series --> DataFrame df_ss_tc = pd.DataFrame(dataframe) s = df_ss_tc.dtypes y_ss = np.array([tuple(x) for x in df_ss_tc.values], dtype=list(zip(s.index, s))) #%% # Conventional CPH #%% X_ss = df.copy(deep=True) cph = CoxPHSurvivalAnalysis() c_index['cph'] = cross_val_score(cph, X_ss, y_ss, cv=k_fold, scoring=c_index_scorer, verbose=1) # %% # Random Survival Forests (RSFs) #%% X_rsf = df.copy(deep=True) rsf = RandomSurvivalForest(random_state=SEED, n_jobs=-1, verbose=True) rsf.criterion = 'log_rank'
# Boolean event indicator derived from the lapse flag.
E = df['LapseIndicator'].apply(lambda x: True if x == 1 else False)
df2['E'] = E
df2['T'] = T
# Split df2 into features X and the structured survival outcome y, using
# 'E' as the event column and 'T' as the time column.
X, y = get_x_y(df2, ['E', 'T'], pos_label=True)
# Treat every column except age as categorical before one-hot encoding.
for c in X.columns.values:
    if c != 'AGE AT DOC':
        X[c] = X[c].astype('category')
data_x_numeric = OneHotEncoder().fit_transform(X)
#%%
# Generous n_iter gives the Newton optimiser room to converge.
estimator = CoxPHSurvivalAnalysis(verbose=True, n_iter=10000)
estimator.fit(data_x_numeric, y)
#%%
print()
# Fitted log hazard ratios per encoded feature.
print(pd.Series(estimator.coef_, index=data_x_numeric.columns))
print()
# Concordance index on the training data.
print(estimator.score(data_x_numeric, y))
print()
# Univariate c-index per feature (helper defined elsewhere in this file).
scores = fit_and_score_features(data_x_numeric.values, y)
print(
    pd.Series(scores,
              index=data_x_numeric.columns).sort_values(ascending=False))
#%%
# # The model that I will try first is a CoxPH model that expresses the hazard ration depending on the features and associated parameters that I want to evaluate. # # ### CoxPH model with features selection # # Since I need to test my model I will subdivise my dataset into smaller folds and perform cross-validation with 3 folds : the model will be fitted on 2 out of the 3 folds, tested on the tenth and the score (concordance index used in CoxPH models) will be calculated. This process is repeated 3 times and comparing the scores will give us an insight on the best model. # In[536]: from sksurv.linear_model import CoxPHSurvivalAnalysis from sklearn import model_selection output_train = output_train_ini[['Event', 'SurvivalTime']] output_train['Event'] = output_train['Event'].astype('bool') estimator = CoxPHSurvivalAnalysis(alpha=0.1) def create_apply_model(est, feature, target, nb_cv): #adding a little noise on the data to be sure that they are independant and do not raise linalg error n = np.size(feature, 0) p = np.size(feature, 1) cv_results = model_selection.cross_validate(est, feature.values + 0.00001 * np.random.rand(n, p), target.to_records(index=False), cv=nb_cv, return_estimator=True) best_estimator = cv_results['estimator'][np.where( cv_results['test_score'] == np.max(cv_results['test_score']))[0][0]]
## Correct the follow uo days less than 0 to 0 for idx, item in enumerate(data_y['time_to_event']): if item < 0: data_y['time_to_event'][idx] = 0 data_y df.groupby('status').count() ### Part 2: Cox-PH model with Ridge Penalty # tuning parameter alpha over grid search according to c-index from sksurv.linear_model import CoxPHSurvivalAnalysis from sklearn.model_selection import GridSearchCV coxph = CoxPHSurvivalAnalysis() grid_values = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]} grid_c = GridSearchCV(coxph, param_grid=grid_values, scoring=None) grid_c.fit(data_x, data_y) print('Grid best parameter (max c-index): ', grid_c.best_params_) print('Grid best score (c-index): ', grid_c.best_score_) # Apply Cox-PH model based on 3-fold 10-repeated CV using optimal alpha selected from grid search: from sklearn.model_selection import RepeatedKFold rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=0) # 3-fold 10-repeated CV
# ------------------- Compute C-Index for Individual features -------------------- results = {} logger.info("Start C-Index computation for each feature.") for anno in feat.keys(): n_feat = len(feat[anno]) for k_feat, f_name in enumerate(feat[anno].keys()): rater_dict = {'rank': [], 'icc': [], 'c-index': []} for rater in range(num_raters[anno]): x = feat[anno][f_name][:, rater][:, None] # (n_samples, m_features) y = survial_data[['event', 'time']] # Model predictor = CoxPHSurvivalAnalysis(alpha=0, n_iter=1e9) # Evaluation c_indexes = stratified_cross_val(predictor, x, y, n_splits=5).tolist() rater_dict['c-index'].append(c_indexes) # Option 1: Stability ranking based on this dataset rater_dict['rank'].append(sorted_rad_stab[f_name]) rater_dict['icc'].append(rad_metric_val[f_name]) # Option 2: Stability ranking based on all datasets # rater_dict['rank'].append(sorted_rad_stab_mul[f_name]) # rater_dict['icc'].append(rad_metric_val_mul[f_name]) logger.info(
# NOTE(review): the line below completes a call (likely
# kaplan_meier_estimator) whose opening parenthesis precedes this chunk.
data_y["Survival_in_days"][mask])
plt.step(time_cell, survival_prob_cell, where="post",
         label="%s (n = %d)" % (value, mask.sum()))
plt.ylabel("est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")

from sksurv.preprocessing import OneHotEncoder
# One-hot encode the categorical covariates for the Cox model.
data_x_numeric = OneHotEncoder().fit_transform(data_x)
data_x_numeric.head()

from sksurv.linear_model import CoxPHSurvivalAnalysis
estimator = CoxPHSurvivalAnalysis()
estimator.fit(data_x_numeric, data_y)
# Fitted log hazard ratios per encoded column.
pd.Series(estimator.coef_, index=data_x_numeric.columns)

# Four hypothetical patients for survival-function prediction.
x_new = pd.DataFrame.from_dict({
    1: [65, 0, 0, 1, 60, 1, 0, 1],
    2: [65, 0, 0, 1, 60, 1, 0, 0],
    3: [65, 0, 1, 0, 60, 1, 0, 0],
    4: [65, 0, 1, 0, 60, 1, 0, 1]},
    columns=data_x_numeric.columns, orient='index')
x_new

import numpy as np
pred_surv = estimator.predict_survival_function(x_new)
# NOTE(review): the leading entries below continue a feature-set list whose
# opening bracket precedes this chunk.
data[['tum_area_from_vol', 'dens_ADCT2w']],
data[['init_tum_area']],
data[['TTum330_alphaG1120']]
]
# Structured outcome: boolean 6-month biochemical recurrence + integer delay.
y = np.zeros(76, dtype={
    'names': ('bio_rec_6', 'bio_rec_6_delay'),
    'formats': ('bool', 'int')
})
y['bio_rec_6'] = data[['bio_rec_6']].to_numpy().ravel()
y['bio_rec_6_delay'] = data[['bio_rec_6_delay']].to_numpy().ravel()
nTest = 38
auc = np.zeros((len(tx), 3))
meanAuc = np.zeros((N, len(tx)))
cph = CoxPHSurvivalAnalysis()
# Repeated stratified splits on the recurrence label.
sss = StratifiedShuffleSplit(n_splits=N, test_size=nTest)
for i, x in enumerate(tx):
    x = x.to_numpy()
    j = 0
    for train, test in sss.split(x, y['bio_rec_6']):
        xtrain, ytrain = x[train], y[train]
        xtest, ytest = x[test], y[test]
        cph.fit(xtrain, ytrain)
        ypred = cph.predict(xtrain)
        # Three evaluation times spanning the observed delay range.
        time = np.linspace(
            np.min(ytrain['bio_rec_6_delay']) + 1,
            np.max(ytrain['bio_rec_6_delay']) - 1, 3)
        # NOTE(review): this call continues past this chunk.
        auc[i, :], meanAuc[j, i] = cumulative_dynamic_auc(ytrain, ytrain,
                                                          ypred,