def train_coxph(data_df, r_splits):
    """Train and evaluate a Cox proportional-hazards model over CV splits.

    For each split in ``r_splits``: prepares train/val/test data via the
    project helpers ``prepare_datasets`` and ``df2array``, fits a
    ``CoxPHSurvivalAnalysis`` model, and records concordance indexes and
    time-dependent AUCs at 30/60/365-day horizons.

    :param data_df: full dataset (DataFrame) consumed by the helpers.
    :param r_splits: sequence of splits; each split is indexable, with
        split[2]/split[1]/split[0] passed to ``prepare_datasets``
        (order as in the original call — confirm against the helper).
    :return: tuple of five lists, one entry per split:
        (c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365)
    """
    c_index_at = []
    c_index_30 = []
    # Horizon (days) -> per-split AUC values. Replaces the original
    # eval("time_auc_" + str(time_x)) dispatch, which was fragile and unsafe.
    time_aucs = {30: [], 60: [], 365: []}

    for i, split in enumerate(r_splits):
        print("\nIteration %s" % (i))

        # DATA PREP
        df_train, df_val, df_test, df_test_30 = prepare_datasets(
            data_df, split[2], split[1], split[0])
        (data_x, data_y), (val_x, val_y), (test_x, test_y), (test_30_x, test_30_y) = df2array(
            data_df, df_train, df_val, df_test, df_test_30)

        # Small ridge penalty for numerical stability of the Cox fit.
        estimator = CoxPHSurvivalAnalysis(alpha=1e-04)
        estimator.fit(data_x, data_y)

        c_index_at.append(estimator.score(test_x, test_y))
        c_index_30.append(estimator.score(test_30_x, test_30_y))

        # Risk scores are horizon-independent; compute once outside the loop.
        risk_scores = estimator.predict(test_x)
        for time_x, auc_list in time_aucs.items():
            t_auc, t_mean_auc = cumulative_dynamic_auc(
                data_y, test_y, risk_scores, time_x)
            auc_list.append(t_auc[0])

        print("C-index_30:", c_index_30[i])
        print("C-index_AT:", c_index_at[i])
        print("time_auc_30", time_aucs[30][i])
        print("time_auc_60", time_aucs[60][i])
        print("time_auc_365", time_aucs[365][i])

    return c_index_at, c_index_30, time_aucs[30], time_aucs[60], time_aucs[365]
def mpss_ph_sksurv(self):
    """Perform proportional hazards regression using the sksurv package.

    Fits ``CoxPHSurvivalAnalysis`` on ``self.x_train`` against a structured
    survival target built from ``self.y_train`` (event indicator) and
    ``self.scores`` (continuous score used as the "time" field), then
    approximates per-feature p-values with a univariate F-test.

    :return: DataFrame with columns ['coef', 'coef_abs', 'feature', 'p'],
        sorted by absolute coefficient magnitude (descending).
    """
    # Reformat for sksurv package: structured array of (event, score) pairs.
    x_train = pd.DataFrame(self.x_train)
    y_structured = np.array(
        list(zip(self.y_train.astype(bool), self.scores)),
        dtype=[('class', 'bool_'), ('score', 'single')])

    # Remove any feature columns that are all 0 values, otherwise cannot run
    # regression.
    x_train_nonzero = x_train.loc[:, (x_train != 0).any(axis=0)]

    # Run proportional hazards regression (alpha: ridge penalty).
    estimator = CoxPHSurvivalAnalysis(alpha=0.1, verbose=1)
    estimator.fit(x_train_nonzero, y_structured)
    prediction = estimator.predict(x_train_nonzero)

    # Estimate p-values for each feature via univariate F-regression against
    # the score field (an approximation, not Cox model standard errors).
    f, pvals = f_regression(x_train_nonzero, y_structured['score'])
    approximate_se = pd.DataFrame(pd.Series(
        pvals, index=x_train_nonzero.columns).sort_values(ascending=False),
        columns=['p']).reset_index()

    # Calculate concordance indicating the goodness of fit.
    concordance = concordance_index_censored(self.y_train.astype(bool),
                                             self.scores, prediction)
    print('concordance', concordance[0])

    # Dataframe with coefficients, absolute value of coefficients, and
    # p-values. Vectorized .abs() replaces the per-element math.fabs loop.
    importance = pd.DataFrame(estimator.coef_, columns=['coef'])
    importance['coef_abs'] = importance['coef'].abs()
    importance['feature'] = importance.index.values
    importance = importance.merge(approximate_se, left_on='feature',
                                  right_on='index').drop('index', axis=1)

    # Sort feature importance by effect size.
    importance = importance.sort_values(
        'coef_abs', ascending=False).reset_index(drop=True)
    return importance
def cox(name):
    """Fit a Cox proportional-hazards model on the named dataset.

    Loads the CSV registered for ``name`` in ``filename_dict``, formats it
    for sksurv, trains on a 75/25 split, and scores the held-out set.

    :param name: key into the module-level ``filename_dict``.
    :return: concordance index (c-index) on the test split.
    """
    csv_path = os.path.join(DATA_DIR, filename_dict[name])
    formatted_x, formatted_y = sksurv_data_formatting(pd.read_csv(csv_path))

    x_train, x_test, y_train, y_test = train_test_split(
        formatted_x, formatted_y, test_size=0.25, random_state=RANDOM_STATE)

    model = CoxPHSurvivalAnalysis()
    model.fit(x_train, y_train)
    risk_scores = model.predict(x_test)

    c_index, *_ = concordance_index_censored(
        y_test["Status"], y_test["Survival_in_days"], risk_scores)
    return c_index
import numpy as np

# Plot estimated survival curves for each new sample.
# NOTE(review): assumes `estimator`, `x_new`, and `plt` are defined earlier
# in the original script (this is a fragment) — confirm against the full file.
pred_surv = estimator.predict_survival_function(x_new)
time_points = np.arange(1, 1000)
for i, surv_func in enumerate(pred_surv):
    # Step plot: survival probability is piecewise-constant between events.
    plt.step(time_points, surv_func(time_points), where="post",
             label="Sample %d" % (i + 1))
plt.ylabel("est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")

# Concordance index of the fitted model on the training data.
from sksurv.metrics import concordance_index_censored

prediction = estimator.predict(data_x_numeric)
result = concordance_index_censored(data_y["Status"],
                                    data_y["Survival_in_days"], prediction)
# First element of the tuple is the c-index itself.
result[0]

# estimator.score computes the same c-index directly.
estimator.score(data_x_numeric, data_y)

# Feature selection
import numpy as np

def fit_and_score_features(X, y):
    # Score each feature by fitting a univariate Cox model on it alone.
    # NOTE(review): this function is TRUNCATED in this chunk — the loop body
    # ends mid-way; the remainder (fit/score per column) is not visible here.
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis()
    for j in range(n_features):
        # Single-column 2-D slice keeps the (n_samples, 1) shape for fit().
        Xj = X[:, j:j+1]
        # NOTE(review): this chunk starts mid-expression — it closes a
        # comprehension begun outside this view that stacks one array per
        # selected feature. Scope of the enclosing loop is not visible here.
        for f_name in selected_features[n_split]
    ])
# Transpose feature-major stack to (n_samples, n_features).
x = np.swapaxes(x, 0, 1)  # (n_samples, n_features)
x_train, x_test = x[split['train']], x[split['test']]
# NOTE(review): 'survial_data' looks like a typo for 'survival_data', but it
# must match the name defined elsewhere in the file — do not rename here only.
y_train, y_test = survial_data[
    split['train']], survial_data[split['test']]
# Model
# NOTE(review): alpha=0 means no regularization and n_iter=1e9 effectively
# disables the iteration cap — presumably intentional ("run to convergence"),
# but confirm; a float n_iter may also be rejected by some sksurv versions.
predictor = CoxPHSurvivalAnalysis(alpha=0, n_iter=1e9)
try:
    predictor.fit(x_train, y_train[['event', 'time']])
    c_indexes.append(
        predictor.score(x_test, y_test[['event', 'time']]))
    # High-risk threshold is the median TRAIN risk, applied to the test set.
    risk_score_train = predictor.predict(x_train)
    risk_score = predictor.predict(x_test)
    high_risk_masks.append(
        risk_score > np.median(risk_score_train))
    y_tests.append(y_test)
except Exception as e:
    # Best-effort: a failed fit records NaN so splits stay aligned.
    logger.warning("Error {}".format(str(e)))
    # NOTE(review): np.NaN was removed in NumPy 2.0 — prefer np.nan.
    c_indexes.append(np.NaN)
# ----------------------- Kaplan-Meier --------------------------------
# Pool test subjects from all splits, partitioned by the high-risk mask.
high_risk_mask = np.concatenate(high_risk_masks)
y_tests = np.concatenate(y_tests)
y_high_risk, y_low_risk = y_tests[high_risk_mask], y_tests[
    ~high_risk_mask]
    # NOTE(review): this chunk starts mid-dict — the opening of the search
    # space ('params'?) is outside this view. The original inline labels
    # ("categorical"/"integer valued") did not match the declared ranges and
    # are corrected below.
    'regressor__model__min_child_weight': (10, 500, 'log-uniform'),  # log-uniform float range
    'regressor__model__n_estimators': (1, 8),  # integer valued parameter
    'regressor__model__reg_alpha': (1, 8, 'log-uniform'),  # log-uniform range
    'regressor__model__reg_lambda': (1, 8, 'log-uniform'),  # log-uniform range
    # NOTE(review): subsample is normally a fraction in (0, 1] — a (1, 8)
    # log-uniform range looks wrong; confirm against the XGBoost docs.
    'regressor__model__subsample': (1, 8, 'log-uniform'),  # log-uniform range
}
#%%
# Since sksurv output log hazard ratios (here relative to 0 on predictors)
# we must use 'output_margin=True' for comparability.
estimator = CoxPHSurvivalAnalysis().fit(data_x, data_y)
# XGBoost Cox objective with a linear booster, for comparison with sksurv.
gbm = xgb.XGBRegressor(objective='survival:cox',
                       booster='gblinear',
                       base_score=1,
                       n_estimators=1000)
# Bayesian hyperparameter search over the space defined above.
search = BayesSearchCV(gbm, params, n_iter=3, cv=3)
search.fit(data_x, data_y_xgb)
#%%
# Side-by-side predictions from both models.
prediction_sksurv = estimator.predict(data_x)
predictions_xgb = search.predict(data_x)
d = pd.DataFrame({'xgb': predictions_xgb,
                  'sksurv': prediction_sksurv})
d.head()
# %%
# NOTE(review): this saves the base (unfitted) `gbm`, not the tuned
# `search.best_estimator_` — confirm that is the intended artifact.
context.io.save('xente_xgb', gbm)
# %%