Beispiel #1
0
def train_coxph(data_df, r_splits):
    """Fit and evaluate one Cox proportional-hazards model per split.

    For every entry of ``r_splits`` the data is prepared with
    ``prepare_datasets`` / ``df2array``, a ``CoxPHSurvivalAnalysis(alpha=1e-04)``
    model is fitted on the first array pair returned by ``df2array``, and the
    model is scored on the test and the "test_30" array pairs. Progress and
    per-iteration metrics are printed.

    Args:
        data_df: Full data frame handed to ``prepare_datasets`` and ``df2array``.
        r_splits: Sequence of splits; elements ``[2]``, ``[1]`` and ``[0]`` of
            each split are passed (in that order) to ``prepare_datasets``.
            NOTE(review): presumably train / validation / test indices --
            confirm against ``prepare_datasets``.

    Returns:
        Tuple ``(c_index_at, c_index_30, time_auc_30, time_auc_60,
        time_auc_365)`` of lists with one entry per split.
    """
    c_index_at = []
    c_index_30 = []
    # One result list per evaluation horizon. A dict lookup replaces the former
    # eval("time_auc_<horizon>") name lookup.
    time_auc = {30: [], 60: [], 365: []}

    for i, split in enumerate(r_splits):
        print("\nIteration %s" % (i))
        # DATA PREP
        df_train, df_val, df_test, df_test_30 = prepare_datasets(
            data_df, split[2], split[1], split[0])

        # The validation arrays are produced by df2array but not used here.
        (data_x, data_y), _, (test_x, test_y), (test_30_x, test_30_y) = df2array(
            data_df, df_train, df_val, df_test, df_test_30)

        estimator = CoxPHSurvivalAnalysis(alpha=1e-04)
        estimator.fit(data_x, data_y)

        c_index_at.append(estimator.score(test_x, test_y))
        c_index_30.append(estimator.score(test_30_x, test_30_y))

        # The predictions do not depend on the horizon, so compute them once.
        risk_scores = estimator.predict(test_x)
        for horizon in (30, 60, 365):
            auc_values, _ = cumulative_dynamic_auc(
                data_y, test_y, risk_scores, horizon)
            # A single time point is requested, so keep the first AUC value.
            time_auc[horizon].append(auc_values[0])

        print("C-index_30:", c_index_30[i])
        print("C-index_AT:", c_index_at[i])

        print("time_auc_30", time_auc[30][i])
        print("time_auc_60", time_auc[60][i])
        print("time_auc_365", time_auc[365][i])

    return c_index_at, c_index_30, time_auc[30], time_auc[60], time_auc[365]
def fit_and_score_features(X, y):
    """Score every feature column on its own with a Cox model.

    For each column of ``X`` a ``CoxPHSurvivalAnalysis`` model is fitted on
    that single column and scored on the same data.

    Args:
        X: Two-dimensional array supporting ``X.shape`` and ``X[:, a:b]``.
        y: Target passed unchanged to ``fit`` and ``score``.

    Returns:
        One-dimensional float array with one score per column of ``X``.
    """
    column_count = X.shape[1]
    model = CoxPHSurvivalAnalysis()
    per_feature = []
    for col in range(column_count):
        # Slice (rather than index) so the column stays two-dimensional.
        single = X[:, col:col + 1]
        model.fit(single, y)
        per_feature.append(model.score(single, y))
    return np.array(per_feature, dtype=float)
Beispiel #3
0
# Split trainData column-wise: the first two columns hold the outcome and the
# remaining columns hold the features.
data_y = trainData[:, :2]
data_x = trainData[:, 2:]
x, y = data_x.shape
# Add a small random jitter to every feature value, in place.
# NOTE(review): if trainData is a NumPy array, data_x is a view, so this also
# modifies trainData's feature columns -- confirm that is intended.
data_x += 0.001 * np.random.random((x, y))

# Column 0 is used as the time value, column 1 (== 1) as the boolean flag.
gf_day = list(trainData[:, 0])
gf_1year_label = [label == 1 for label in trainData[:, 1]]

# Rebuild data_y as a structured array of (bool, float) records.
dt = np.dtype('bool,float')
data_y = np.array(list(zip(gf_1year_label, gf_day)), dtype=dt)

# Fit on the first train_num rows and report the elapsed whole seconds.
t1 = time()
estimator = CoxPHSurvivalAnalysis()
estimator.fit(data_x[:train_num], data_y[:train_num])
elapsed_seconds = int(time() - t1)
print('fitting estimate cost {} seconds'.format(elapsed_seconds))

# Score on the remaining rows.
print(estimator.score(data_x[train_num:], data_y[train_num:]))
'''
data_x, data_y = load_veterans_lung_cancer()

#pd.DataFrame.from_records(data_y[[11, 5, 32, 13, 23]], index=range(1, 6))


time, survival_prob = kaplan_meier_estimator(data_y["Status"], data_y["Survival_in_days"])
plt.step(time, survival_prob, where="post")
plt.ylabel("est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")

print(data_x["Treatment"].value_counts())

for treatment_type in ("standard", "test"):
    mask_treat = data_x["Treatment"] == treatment_type
time_points = np.arange(1, 1000)
for i, surv_func in enumerate(pred_surv):
    plt.step(time_points, surv_func(time_points), where="post",
             label="Sample %d" % (i + 1))
plt.ylabel("est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")


from sksurv.metrics import concordance_index_censored

prediction = estimator.predict(data_x_numeric)
result = concordance_index_censored(data_y["Status"], data_y["Survival_in_days"], prediction)
result[0]

estimator.score(data_x_numeric, data_y)


# Feature selection
import numpy as np

def fit_and_score_features(X, y):
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis()
    for j in range(n_features):
        Xj = X[:, j:j+1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores
Beispiel #5
0
# %%
# Fit a Lasso regression on the training split and print its coefficients,
# intercept and train/test scores.
from sklearn.linear_model import Lasso
# NOTE(review): rebinding `Lasso` to the fitted instance shadows the imported
# class, so this cell cannot be re-run without importing the class again.
# NOTE(review): `normalize` is not accepted by recent scikit-learn releases --
# confirm the installed version still supports it.
Lasso = Lasso(alpha=0.000000000005, normalize=True).fit(X_train, y_train)
print("Lasso.coef_: {}".format(Lasso.coef_))
print("Lasso.intercept_ : {}".format(Lasso.intercept_))
# The Korean labels below read "training-set accuracy" and "test-set accuracy";
# the value printed is whatever the estimator's score() returns.
print("훈련 세트의 정확도 : {:.2f}".format(Lasso.score(X_train, y_train)))
print("테스트 세트의 정확도 : {:.2f}".format(Lasso.score(X_test, y_test)))

# %%

# NOTE(review): the closing parenthesis looks misplaced. As written,
# df_LR[['w_SS']] is the second positional argument of np.array (its dtype
# slot) and fit() receives a single argument. Confirm the intended X / y pair.
Cox = CoxPHSurvivalAnalysis().fit(
    np.array(list(map(int, y_train)), df_LR[['w_SS']]))
print("Cox.coef_: {}".format(Cox.coef_))
# NOTE(review): confirm that CoxPHSurvivalAnalysis exposes `intercept_`.
print("Cox.intercept_ : {}".format(Cox.intercept_))
print("훈련 세트의 정확도 : {:.2f}".format(Cox.score(X_train, y_train)))
print("테스트 세트의 정확도 : {:.2f}".format(Cox.score(X_test, y_test)))

# %%
# Select the feature columns from df_LR. The column names are Korean; roughly:
# normalized population, traffic volume (07 / 15), congestion frequency and
# time intensity sums, registered cars, registered electric cars
# (translation to be confirmed).
X = df_LR[[
    "정규화_인구", "정규화_교통량_07", "정규화_교통량_15", "정규화_혼잡빈도강도합", "정규화_혼잡시간강도합",
    "정규화_자동차등록", "정규화_전기자동차등록"
]]

# %%
X = X.astype(float)
# NOTE(review): y is a plain int array built from y_train; scikit-survival
# estimators normally expect a structured (event, time) array -- confirm.
Cox = CoxPHSurvivalAnalysis().fit(X, np.array(list(map(int, y_train))))
# %%
# Bare expression: its value is only shown when run as an interactive cell.
np.array(df_LR[['w_SS']])
#%%
from sksurv.datasets import load_whas500
Beispiel #6
0
                        feat[anno][f_name][:, rater]
                        for f_name in selected_features[n_split]
                    ])
                    x = np.swapaxes(x, 0, 1)  # (n_samples, n_features)

                    x_train, x_test = x[split['train']], x[split['test']]
                    y_train, y_test = survial_data[
                        split['train']], survial_data[split['test']]

                    # Model
                    predictor = CoxPHSurvivalAnalysis(alpha=0, n_iter=1e9)

                    try:
                        predictor.fit(x_train, y_train[['event', 'time']])
                        c_indexes.append(
                            predictor.score(x_test, y_test[['event', 'time']]))
                        risk_score_train = predictor.predict(x_train)
                        risk_score = predictor.predict(x_test)
                        high_risk_masks.append(
                            risk_score > np.median(risk_score_train))
                        y_tests.append(y_test)
                    except Exception as e:
                        logger.warning("Error {}".format(str(e)))
                        c_indexes.append(np.NaN)

                # ----------------------- Kaplan-Meier --------------------------------
                high_risk_mask = np.concatenate(high_risk_masks)

                y_tests = np.concatenate(y_tests)
                y_high_risk, y_low_risk = y_tests[high_risk_mask], y_tests[
                    ~high_risk_mask]
# Grid-search the `alpha` hyper-parameter of the Cox model. With scoring=None
# the search ranks candidates by the estimator's own score() method.
coxph = CoxPHSurvivalAnalysis()
grid_values = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

grid_c = GridSearchCV(estimator=coxph, param_grid=grid_values, scoring=None)
grid_c.fit(data_x, data_y)

print('Grid best parameter (max c-index): ', grid_c.best_params_)
print('Grid best score (c-index): ', grid_c.best_score_)

# Re-evaluate the selected alpha with 3-fold cross-validation repeated 10 times.

from sklearn.model_selection import RepeatedKFold

rkf = RepeatedKFold(n_splits=3, n_repeats=10, random_state=0)

c_index_train = []
c_index_test = []

# The winning alpha does not change between folds, so read it once.
best_alpha = float(grid_c.best_params_['alpha'])

for train_index, test_index in rkf.split(data_x):
    x_train, y_train = data_x[train_index], data_y[train_index]
    x_test, y_test = data_x[test_index], data_y[test_index]

    model = CoxPHSurvivalAnalysis(alpha=best_alpha)
    coxph = model.fit(x_train, y_train)

    c_index_train.append(coxph.score(x_train, y_train))
    c_index_test.append(coxph.score(x_test, y_test))

mean_train_score = np.mean(c_index_train)
mean_test_score = np.mean(c_index_test)
print("Averaged c-index from 3-fold 10 repeated CV(training): {:.3f}".format(
    mean_train_score))
print("Averaged c-index from 3-fold 10 repeated CV(test): {:.3f}".format(
    mean_test_score))
    # Cast every column except 'AGE AT DOC' to the pandas 'category' dtype.
    for c in X.columns.values:
        if c != 'AGE AT DOC':
            X[c] = X[c].astype('category')

    # One-hot encode X. The result is used below through .columns / .values,
    # so the encoder presumably returns a DataFrame -- confirm which
    # OneHotEncoder is imported.
    data_x_numeric = OneHotEncoder().fit_transform(X)
    #%%

    # Fit a Cox model on all encoded features.
    estimator = CoxPHSurvivalAnalysis(verbose=True, n_iter=10000)
    estimator.fit(data_x_numeric, y)
    #%%

    # Print the fitted coefficients labelled by encoded column name.
    print()
    print(pd.Series(estimator.coef_, index=data_x_numeric.columns))
    print()

    # Score on the same data the model was fitted on.
    print(estimator.score(data_x_numeric, y))
    print()

    # Score each encoded column on its own and list them best first.
    scores = fit_and_score_features(data_x_numeric.values, y)
    print(
        pd.Series(scores,
                  index=data_x_numeric.columns).sort_values(ascending=False))
    #%%

    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import Pipeline

    # Pipeline: encode, keep the 3 columns with the best single-feature score,
    # then fit the Cox model on them. The pipeline is only built here.
    pipe = Pipeline([('encode', OneHotEncoder()),
                     ('select', SelectKBest(fit_and_score_features, k=3)),
                     ('model', CoxPHSurvivalAnalysis(verbose=True,
                                                     n_iter=10000))])
Beispiel #9
0
#data_x_1['date_minute'] = data_x_1['date'].dt.minute
#data_x_1['date_second'] = data_x_1['date'].dt.second
#data_x_1.drop(columns=['date','created', 'install_diff','device_brand','install_seconds','user_agent'], inplace=True)
#data_x_1_numeric = pd.get_dummies(data_x_1, dummy_na=True, prefix_sep='=')

#%%
# Build the structured target from the first 100 rows of data_full_1, with the
# fields 'status_censored' (bool) and 'in_seconds' (float64).
_first_rows = data_full_1.head(100)
data_y_1 = np.fromiter(
    zip(_first_rows["status_censored"], _first_rows["in_seconds"]),
    # The built-in bool replaces the np.bool alias, which was removed in
    # NumPy 1.24 and raises AttributeError there.
    dtype=[('status_censored', bool),
           ('in_seconds', np.float64)])

#%%
# Fit and score on the same first 100 rows of the encoded features.
_first_features = data_x_1_numeric.head(100)
estimator = CoxPHSurvivalAnalysis(alpha=0.1)
estimator.fit(_first_features, data_y_1)
# Bare expression: the score is only shown when run as an interactive cell.
estimator.score(_first_features, data_y_1)


#%%
def fit_and_score_features(X, y, alpha=0.1):
    """Score every feature column on its own with a Cox model.

    For each column of ``X`` a ``CoxPHSurvivalAnalysis(alpha=alpha)`` model is
    fitted on that single column and scored on the same data.

    Args:
        X: Two-dimensional array supporting ``X.shape`` and ``X[:, a:b]``.
        y: Target passed unchanged to ``fit`` and ``score``.
        alpha: Value forwarded to the ``alpha`` argument of the model.

    Returns:
        One-dimensional float array with one score per column of ``X``.
    """
    column_count = X.shape[1]
    model = CoxPHSurvivalAnalysis(alpha=alpha)
    per_feature = []
    for col in range(column_count):
        # Slice (rather than index) so the column stays two-dimensional.
        single = X[:, col:col + 1]
        model.fit(single, y)
        per_feature.append(model.score(single, y))
    return np.array(per_feature, dtype=float)


#%%