Example #1
0
def train_coxph(data_df, r_splits):
  """Fit and evaluate a Cox proportional hazards model once per split.

  For every entry of ``r_splits`` the data is partitioned with
  ``prepare_datasets`` and converted with ``df2array``. A
  ``CoxPHSurvivalAnalysis(alpha=1e-04)`` model is fitted on the first
  ``(x, y)`` pair returned by ``df2array`` and then scored on the test pair
  and on the "test_30" pair. A cumulative/dynamic AUC is additionally
  computed on the test pair at the horizons 30, 60 and 365.

  Args:
    data_df: Full dataset, forwarded to ``prepare_datasets`` and ``df2array``.
    r_splits: Sequence of splits. Each entry is indexed at positions 0, 1
      and 2; they are handed to ``prepare_datasets`` in the order 2, 1, 0.

  Returns:
    Tuple ``(c_index_at, c_index_30, time_auc_30, time_auc_60,
    time_auc_365)`` of lists holding one value per split. All lists are
    empty when ``r_splits`` is empty.
  """
  c_index_at = []
  c_index_30 = []
  # One result list per horizon, looked up by key. This replaces building
  # variable names as strings and resolving them with eval().
  time_auc = {30: [], 60: [], 365: []}

  for i, split in enumerate(r_splits):
    print("\nIteration %s"%(i))
    #DATA PREP
    df_train, df_val, df_test, df_test_30 = prepare_datasets(data_df, split[2], split[1], split[0])

    # The validation pair is produced by df2array but not used here.
    (data_x, data_y), _validation, (test_x, test_y), (test_30_x, test_30_y) = df2array(data_df, df_train, df_val, df_test, df_test_30)

    estimator = CoxPHSurvivalAnalysis(alpha=1e-04)
    estimator.fit(data_x, data_y)

    c_index_at.append(estimator.score(test_x, test_y))
    c_index_30.append(estimator.score(test_30_x, test_30_y))

    # The risk scores do not depend on the horizon, so predict only once.
    risk_scores = estimator.predict(test_x)
    for horizon, auc_values in time_auc.items():
      t_auc, _ = cumulative_dynamic_auc(data_y, test_y, risk_scores, horizon)
      auc_values.append(t_auc[0])

    print("C-index_30:", c_index_30[-1])
    print("C-index_AT:", c_index_at[-1])

    print("time_auc_30", time_auc[30][-1])
    print("time_auc_60", time_auc[60][-1])
    print("time_auc_365", time_auc[365][-1])

  return c_index_at, c_index_30, time_auc[30], time_auc[60], time_auc[365]
Example #2
0
    def mpss_ph_sksurv(self):
        """
        Performs proportional hazards regression using sksurv package.

        Reads ``self.x_train`` as the feature matrix, ``self.y_train`` (cast
        to bool) as the event indicator and ``self.scores`` as the second
        field of the structured outcome array. Feature columns that contain
        only zeros are dropped before fitting. The concordance of the fitted
        model on the training data is printed.

        Every row of the result is labelled with the column label of the
        feature it belongs to, so coefficients and p-values stay paired even
        when all-zero columns have been removed.

        :return: Feature importance as a DataFrame with the columns
            ``coef``, ``coef_abs``, ``feature`` and ``p``, sorted by
            ``coef_abs`` in descending order.
        """
        # Reformat for sksurv package
        x_train = pd.DataFrame(self.x_train)
        events = self.y_train.astype(bool)
        y_structured = np.array(list(zip(events, self.scores)),
                                dtype=[('class', 'bool_'),
                                       ('score', 'single')])

        # Remove any feature columns that are all 0 values, otherwise cannot run regression
        x_train_nonzero = x_train.loc[:, (x_train != 0).any(axis=0)]

        # Run proportional hazards regression
        estimator = CoxPHSurvivalAnalysis(alpha=0.1, verbose=1)
        estimator.fit(x_train_nonzero, y_structured)
        prediction = estimator.predict(x_train_nonzero)

        # Estimate p-values for each feature (univariate F-test against the
        # 'score' field); the result is ordered like x_train_nonzero.columns.
        _, pvals = f_regression(x_train_nonzero, y_structured['score'])

        # Calculate concordance indicating the goodness of fit
        concordance = concordance_index_censored(events, self.scores,
                                                 prediction)
        print('concordance', concordance[0])

        # Dataframe with coefficients, absolute value of coefficients, and p-values.
        # coef_ and pvals are both ordered like x_train_nonzero.columns, so
        # they are combined positionally and labelled with the real column
        # labels. Merging on the positional index instead mis-pairs or drops
        # features as soon as a column was removed above.
        coefficients = np.asarray(estimator.coef_)
        importance = pd.DataFrame({
            'coef': coefficients,
            'coef_abs': np.abs(coefficients),
            'feature': x_train_nonzero.columns.values,
            'p': pvals,
        })

        # Sort feature importance
        importance = importance.sort_values(
            'coef_abs', ascending=False).reset_index(drop=True)
        return importance
Example #3
0
def cox(name):
    """Fit a default Cox PH model on a named dataset and score it.

    The CSV file registered for ``name`` in ``filename_dict`` is read from
    ``DATA_DIR`` and converted with ``sksurv_data_formatting``. A quarter of
    the rows is held out (seeded with ``RANDOM_STATE``), the model is fitted
    on the rest, and the held-out predictions are compared with the
    ``"Status"`` and ``"Survival_in_days"`` fields of the held-out outcome.

    :param name: Key into ``filename_dict`` selecting the dataset file.
    :return: First element of the ``concordance_index_censored`` result.
    """
    csv_path = os.path.join(DATA_DIR, filename_dict[name])
    features, outcome = sksurv_data_formatting(pd.read_csv(csv_path))

    # 75 % of the rows for fitting, 25 % for evaluation.
    train_x, test_x, train_y, test_y = train_test_split(
        features, outcome, test_size=0.25, random_state=RANDOM_STATE)

    model = CoxPHSurvivalAnalysis()
    model.fit(train_x, train_y)
    risk_scores = model.predict(test_x)

    concordance = concordance_index_censored(
        test_y["Status"], test_y["Survival_in_days"], risk_scores)
    return concordance[0]
import numpy as np

# Plot one predicted survival step curve per sample in x_new, evaluated at
# the integer time points 1..999.
pred_surv = estimator.predict_survival_function(x_new)
time_points = np.arange(1, 1000)
for i, surv_func in enumerate(pred_surv):
    plt.step(time_points, surv_func(time_points), where="post",
             label="Sample %d" % (i + 1))
# Raw string: "\h" is not a valid escape sequence in a normal string literal
# and triggers an invalid-escape warning; the label text itself is unchanged.
plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")


from sksurv.metrics import concordance_index_censored

# Scores predicted by the fitted estimator for every row of data_x_numeric.
prediction = estimator.predict(data_x_numeric)
# Compare the predictions with the "Status" and "Survival_in_days" fields of
# data_y.
result = concordance_index_censored(data_y["Status"], data_y["Survival_in_days"], prediction)
# First element of the returned result. This is a bare expression, so the
# value is only shown in an interactive/notebook session.
result[0]

# The estimator's own score on the same inputs.
# NOTE(review): presumably the same concordance value as result[0] - confirm.
estimator.score(data_x_numeric, data_y)


# Feature selection
import numpy as np

def fit_and_score_features(X, y):
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis()
    for j in range(n_features):
        Xj = X[:, j:j+1]
Example #5
0
                        for f_name in selected_features[n_split]
                    ])
                    x = np.swapaxes(x, 0, 1)  # (n_samples, n_features)

                    x_train, x_test = x[split['train']], x[split['test']]
                    y_train, y_test = survial_data[
                        split['train']], survial_data[split['test']]

                    # Model
                    predictor = CoxPHSurvivalAnalysis(alpha=0, n_iter=1e9)

                    try:
                        predictor.fit(x_train, y_train[['event', 'time']])
                        c_indexes.append(
                            predictor.score(x_test, y_test[['event', 'time']]))
                        risk_score_train = predictor.predict(x_train)
                        risk_score = predictor.predict(x_test)
                        high_risk_masks.append(
                            risk_score > np.median(risk_score_train))
                        y_tests.append(y_test)
                    except Exception as e:
                        logger.warning("Error {}".format(str(e)))
                        c_indexes.append(np.NaN)

                # ----------------------- Kaplan-Meier --------------------------------
                high_risk_mask = np.concatenate(high_risk_masks)

                y_tests = np.concatenate(y_tests)
                y_high_risk, y_low_risk = y_tests[high_risk_mask], y_tests[
                    ~high_risk_mask]
Example #6
0
    'regressor__model__min_child_weight': (10, 500, 'log-uniform'),  # categorical parameter
    'regressor__model__n_estimators': (1, 8),  # integer valued parameter
    'regressor__model__reg_alpha': (1, 8, 'log-uniform'),  # integer valued parameter
    'regressor__model__reg_lambda': (1, 8, 'log-uniform'),  # integer valued parameter
    'regressor__model__subsample': (1, 8, 'log-uniform'),  # integer valued parameter

}
#%%
# Since sksurv outputs log hazard ratios (here relative to 0 on predictors),
# the xgboost predictions must be requested with 'output_margin=True' to be
# comparable.
# Reference model: sksurv Cox PH with default settings, fitted on data_y.
estimator = CoxPHSurvivalAnalysis().fit(data_x, data_y)
# xgboost counterpart: linear booster trained with the Cox objective.
gbm = xgb.XGBRegressor(objective='survival:cox',
                       booster='gblinear',
                       base_score=1,
                       n_estimators=1000)

# Bayesian hyper-parameter search over `params` (3 iterations, 3-fold CV).
# Note that it is fitted on data_y_xgb, not on data_y.
# NOTE(review): presumably data_y_xgb is the same outcome re-encoded for
# xgboost's survival:cox objective - confirm where it is built.
search = BayesSearchCV(gbm, params, n_iter=3, cv=3)
search.fit(data_x, data_y_xgb)

#%%
# Put both models' predictions side by side on the same scale.
# sksurv's Cox model predicts log hazard ratios, whereas xgboost with the
# 'survival:cox' objective predicts hazard ratios unless the untransformed
# margin is requested. The refitted best estimator is called directly so that
# output_margin=True can be passed to its predict().
prediction_sksurv = estimator.predict(data_x)
predictions_xgb = search.best_estimator_.predict(data_x, output_margin=True)
d = pd.DataFrame({'xgb': predictions_xgb,
                  'sksurv': prediction_sksurv})
d.head()

# %%
# Persist the xgboost model under the key 'xente_xgb'.
# NOTE(review): `gbm` is the estimator instance handed to BayesSearchCV; no
# direct gbm.fit(...) call is visible in this file. If the tuned, fitted model
# is what should be stored, this probably wants search.best_estimator_ -
# confirm before relying on the saved artifact.
context.io.save('xente_xgb', gbm)

# %%