from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import cumulative_dynamic_auc


def train_coxph(data_df, r_splits):
    c_index_at = []
    c_index_30 = []
    time_auc_30 = []
    time_auc_60 = []
    time_auc_365 = []
    # Map each horizon to its result list instead of rebuilding names with eval().
    time_aucs = {30: time_auc_30, 60: time_auc_60, 365: time_auc_365}
    for i in range(len(r_splits)):
        print("\nIteration %s" % (i))

        # DATA PREP (prepare_datasets and df2array are project helpers)
        df_train, df_val, df_test, df_test_30 = prepare_datasets(
            data_df, r_splits[i][2], r_splits[i][1], r_splits[i][0])
        (data_x, data_y), (val_x, val_y), (test_x, test_y), (test_30_x, test_30_y) = \
            df2array(data_df, df_train, df_val, df_test, df_test_30)

        estimator = CoxPHSurvivalAnalysis(alpha=1e-04)
        estimator.fit(data_x, data_y)

        c_index_at.append(estimator.score(test_x, test_y))
        c_index_30.append(estimator.score(test_30_x, test_30_y))

        for time_x in [30, 60, 365]:
            t_auc, t_mean_auc = cumulative_dynamic_auc(
                data_y, test_y, estimator.predict(test_x), time_x)
            time_aucs[time_x].append(t_auc[0])

        print("C-index_30:", c_index_30[i])
        print("C-index_AT:", c_index_at[i])
        print("time_auc_30", time_auc_30[i])
        print("time_auc_60", time_auc_60[i])
        print("time_auc_365", time_auc_365[i])

    return c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365
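# A minimal usage sketch, assuming `data_df` is the prepared dataframe and
# `r_splits` a list of per-iteration index triples as consumed above:
import numpy as np

c_at, c_30, auc_30, auc_60, auc_365 = train_coxph(data_df, r_splits)
print("mean C-index (all-time): {:.3f}".format(np.mean(c_at)))
print("mean C-index (30-day): {:.3f}".format(np.mean(c_30)))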
import numpy as np


# Univariate screening: fit a Cox model on each feature alone and record its
# concordance index.
def fit_and_score_features(X, y):
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis()
    for j in range(n_features):
        Xj = X[:, j:j + 1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores
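# A minimal usage sketch for fit_and_score_features, using the whas500
# dataset bundled with scikit-survival:
import pandas as pd
from sksurv.datasets import load_whas500
from sksurv.preprocessing import OneHotEncoder

X, y = load_whas500()
X = OneHotEncoder().fit_transform(X)
scores = fit_and_score_features(X.values, y)
print(pd.Series(scores, index=X.columns).sort_values(ascending=False))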
# trainData and train_num are assumed to be defined upstream: column 0 is the
# time-to-event in days, column 1 the one-year event label, the rest features.
from time import time

data_y = trainData[:, :2]
data_x = trainData[:, 2:]
x, y = data_x.shape
data_x += 0.001 * np.random.random((x, y))  # small jitter, presumably to avoid a singular fit
gf_day = list(trainData[:, 0])
gf_1year_label = list(trainData[:, 1])
gf_1year_label = list(map(lambda v: v == 1, gf_1year_label))
dt = np.dtype('bool,float')
data_y = [(gf_1year_label[i], gf_day[i]) for i in range(len(gf_1year_label))]
data_y = np.array(data_y, dtype=dt)

t1 = time()
estimator = CoxPHSurvivalAnalysis()
estimator.fit(data_x[:train_num], data_y[:train_num])
print('fitting the estimator cost {} seconds'.format(int(time() - t1)))
print(estimator.score(data_x[train_num:], data_y[train_num:]))
'''
data_x, data_y = load_veterans_lung_cancer()
# pd.DataFrame.from_records(data_y[[11, 5, 32, 13, 23]], index=range(1, 6))
time, survival_prob = kaplan_meier_estimator(data_y["Status"], data_y["Survival_in_days"])
plt.step(time, survival_prob, where="post")
plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")

print(data_x["Treatment"].value_counts())
for treatment_type in ("standard", "test"):
    mask_treat = data_x["Treatment"] == treatment_type
'''
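# The hand-rolled structured array above can be built more directly with
# scikit-survival's helper; a minimal sketch, assuming the same column layout
# of trainData as in the snippet above:
from sksurv.util import Surv

data_y = Surv.from_arrays(event=trainData[:, 1] == 1, time=trainData[:, 0])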
# Plot the predicted survival functions for a few samples.
time_points = np.arange(1, 1000)
for i, surv_func in enumerate(pred_surv):
    plt.step(time_points, surv_func(time_points), where="post",
             label="Sample %d" % (i + 1))
plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")

# Concordance index, computed directly and via the estimator's score method.
from sksurv.metrics import concordance_index_censored

prediction = estimator.predict(data_x_numeric)
result = concordance_index_censored(data_y["Status"], data_y["Survival_in_days"], prediction)
result[0]

estimator.score(data_x_numeric, data_y)

# Feature selection
import numpy as np


def fit_and_score_features(X, y):
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis()
    for j in range(n_features):
        Xj = X[:, j:j + 1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores
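# For context: `pred_surv` in the plotting snippet above is presumably the
# output of the estimator's survival-function API; a minimal sketch, assuming
# the `estimator` and `data_x_numeric` from this snippet:
pred_surv = estimator.predict_survival_function(data_x_numeric.iloc[:5])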
# %%
from sklearn.linear_model import Lasso

# Note: the normalize parameter was removed in scikit-learn 1.2; on newer
# versions, standardize the features beforehand instead.
lasso = Lasso(alpha=5e-12, normalize=True).fit(X_train, y_train)
print("lasso.coef_: {}".format(lasso.coef_))
print("lasso.intercept_: {}".format(lasso.intercept_))
print("Training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))

# %%
# CoxPHSurvivalAnalysis takes X first and a structured (event, time) array as
# y (e.g. built with sksurv.util.Surv.from_arrays); a plain int array such as
# np.array(list(map(int, y_train))) will be rejected. The model also exposes
# no intercept_, since the baseline hazard absorbs it.
Cox = CoxPHSurvivalAnalysis().fit(df_LR[['w_SS']], np.array(list(map(int, y_train))))
print("Cox.coef_: {}".format(Cox.coef_))
print("Training set score: {:.2f}".format(Cox.score(X_train, y_train)))
print("Test set score: {:.2f}".format(Cox.score(X_test, y_test)))

# %%
X = df_LR[[
    "정규화_인구", "정규화_교통량_07", "정규화_교통량_15", "정규화_혼잡빈도강도합",
    "정규화_혼잡시간강도합", "정규화_자동차등록", "정규화_전기자동차등록"
]]

# %%
X = X.astype(float)
Cox = CoxPHSurvivalAnalysis().fit(X, np.array(list(map(int, y_train))))

# %%
np.array(df_LR[['w_SS']])

#%%
from sksurv.datasets import load_whas500
# This snippet presumably sits inside a cross-validation loop; feat,
# selected_features, split, survial_data, c_indexes, high_risk_masks, y_tests
# and logger come from the surrounding code.
x = np.array([
    feat[anno][f_name][:, rater]
    for f_name in selected_features[n_split]
])
x = np.swapaxes(x, 0, 1)  # (n_samples, n_features)
x_train, x_test = x[split['train']], x[split['test']]
y_train, y_test = survial_data[split['train']], survial_data[split['test']]

# Model
predictor = CoxPHSurvivalAnalysis(alpha=0, n_iter=int(1e9))
try:
    predictor.fit(x_train, y_train[['event', 'time']])
    c_indexes.append(predictor.score(x_test, y_test[['event', 'time']]))
    risk_score_train = predictor.predict(x_train)
    risk_score = predictor.predict(x_test)
    # A sample is "high risk" if its risk score exceeds the training median.
    high_risk_masks.append(risk_score > np.median(risk_score_train))
    y_tests.append(y_test)
except Exception as e:
    logger.warning("Error {}".format(str(e)))
    c_indexes.append(np.nan)

# ----------------------- Kaplan-Meier --------------------------------
high_risk_mask = np.concatenate(high_risk_masks)
y_tests = np.concatenate(y_tests)
y_high_risk, y_low_risk = y_tests[high_risk_mask], y_tests[~high_risk_mask]
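# The high/low-risk split above feeds a Kaplan-Meier comparison; a minimal
# sketch, assuming the structured fields are named 'event' and 'time' as in
# the fit call above:
import matplotlib.pyplot as plt
from sksurv.nonparametric import kaplan_meier_estimator

for label, group in (("high risk", y_high_risk), ("low risk", y_low_risk)):
    t, s = kaplan_meier_estimator(group['event'], group['time'])
    plt.step(t, s, where="post", label=label)
plt.legend(loc="best")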
from sklearn.model_selection import GridSearchCV

# Tune alpha via grid search; scoring=None falls back to the estimator's own
# score method, i.e. Harrell's concordance index.
coxph = CoxPHSurvivalAnalysis()
grid_values = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}
grid_c = GridSearchCV(coxph, param_grid=grid_values, scoring=None)
grid_c.fit(data_x, data_y)
print('Grid best parameter (max c-index): ', grid_c.best_params_)
print('Grid best score (c-index): ', grid_c.best_score_)

# Apply the Cox PH model with the optimal alpha from the grid search,
# evaluated with 3-fold CV repeated 10 times:
from sklearn.model_selection import RepeatedKFold

rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=0)
c_index_train, c_index_test = [], []
for train_index, test_index in rkf.split(data_x):
    x_train, x_test = data_x[train_index], data_x[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]
    coxph = CoxPHSurvivalAnalysis(
        alpha=float(grid_c.best_params_['alpha'])).fit(x_train, y_train)
    c_index_train.append(coxph.score(x_train, y_train))
    c_index_test.append(coxph.score(x_test, y_test))

print("Averaged c-index from 3-fold 10-repeated CV (training): {:.3f}".format(
    np.mean(c_index_train)))
print("Averaged c-index from 3-fold 10-repeated CV (test): {:.3f}".format(
    np.mean(c_index_test)))
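# A small variation, assuming the same data: pass the RepeatedKFold splitter
# to GridSearchCV directly, so that alpha selection uses the identical
# resampling scheme, and report the spread of the test scores as well:
import numpy as np
from sklearn.model_selection import GridSearchCV, RepeatedKFold

rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=0)
grid_c = GridSearchCV(CoxPHSurvivalAnalysis(),
                      param_grid={'alpha': np.linspace(0.1, 1.0, 10)},
                      cv=rkf)
grid_c.fit(data_x, data_y)
print("best alpha: {}, c-index: {:.3f} +/- {:.3f}".format(
    grid_c.best_params_['alpha'],
    grid_c.best_score_,
    grid_c.cv_results_['std_test_score'][grid_c.best_index_]))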
# Treat every column except age as categorical before one-hot encoding.
for c in X.columns.values:
    if c != 'AGE AT DOC':
        X[c] = X[c].astype('category')
data_x_numeric = OneHotEncoder().fit_transform(X)

#%%
estimator = CoxPHSurvivalAnalysis(verbose=True, n_iter=10000)
estimator.fit(data_x_numeric, y)

#%%
print()
print(pd.Series(estimator.coef_, index=data_x_numeric.columns))
print()
print(estimator.score(data_x_numeric, y))
print()
scores = fit_and_score_features(data_x_numeric.values, y)
print(pd.Series(scores, index=data_x_numeric.columns).sort_values(ascending=False))

#%%
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

# SelectKBest accepts any score function that returns one score per feature,
# so fit_and_score_features plugs in directly.
pipe = Pipeline([('encode', OneHotEncoder()),
                 ('select', SelectKBest(fit_and_score_features, k=3)),
                 ('model', CoxPHSurvivalAnalysis(verbose=True, n_iter=10000))])
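#%%
# A usage sketch for the pipeline above, assuming the X and y from the
# preceding cells: grid-search the number of selected features k, then report
# the best setting.
import numpy as np
from sklearn.model_selection import GridSearchCV

param_grid = {'select__k': np.arange(1, data_x_numeric.shape[1] + 1)}
gcv = GridSearchCV(pipe, param_grid)
gcv.fit(X, y)
print(gcv.best_params_, gcv.best_score_)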
#data_x_1['date_minute'] = data_x_1['date'].dt.minute
#data_x_1['date_second'] = data_x_1['date'].dt.second
#data_x_1.drop(columns=['date', 'created', 'install_diff', 'device_brand', 'install_seconds', 'user_agent'], inplace=True)
#data_x_1_numeric = pd.get_dummies(data_x_1, dummy_na=True, prefix_sep='=')

#%%
# Structured survival target; np.bool was removed in NumPy 1.24, so use the
# builtin bool for the event field.
data_y_1 = np.fromiter(zip(data_full_1.head(100)["status_censored"],
                           data_full_1.head(100)["in_seconds"]),
                       dtype=[('status_censored', bool),
                              ('in_seconds', np.float64)])

#%%
estimator = CoxPHSurvivalAnalysis(alpha=0.1)
estimator.fit(data_x_1_numeric.head(100), data_y_1)
estimator.score(data_x_1_numeric.head(100), data_y_1)

#%%
# Regularized variant of the univariate screening helper.
def fit_and_score_features(X, y, alpha=0.1):
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis(alpha=alpha)
    for j in range(n_features):
        Xj = X[:, j:j + 1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores

#%%
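# A usage sketch for the regularized variant above, assuming the data from
# the preceding cells:
import pandas as pd

scores = fit_and_score_features(data_x_1_numeric.head(100).values, data_y_1)
print(pd.Series(scores, index=data_x_1_numeric.columns)
      .sort_values(ascending=False))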