Example #1
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.preprocessing import OneHotEncoder


def fit_and_prepare(x_train, y_train, test_df):

    # 3.1. Prepare Y-----
    y_train["specific_death"] = y_train["specific_death"].astype(bool)

    # Transform it into a structured array
    y_train = y_train.to_records(index=False)

    # 3.2. Prepare X-----
    # obtain the x variables that are categorical
    categorical_feature_mask = x_train.dtypes == object

    # Filter categorical columns using mask and turn it into a list
    categorical_cols = x_train.columns[categorical_feature_mask].tolist()

    # Ensure categorical columns are category type
    for col in categorical_cols:
        x_train[col] = x_train[col].astype('category')
        test_df[col] = test_df[col].astype('category')

    # 3.3. Fit model-----
    # initiate
    encoder = OneHotEncoder()
    estimator = CoxPHSurvivalAnalysis()

    # fit model
    estimator.fit(encoder.fit_transform(x_train), y_train)

    # transform the test variables to match the train
    x_test = encoder.transform(test_df)

    return (estimator, x_test, x_train, y_train)
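
# Usage sketch (not part of the original): file names are illustrative
# assumptions; the labels DataFrame is expected to contain the boolean event
# column "specific_death" first, followed by the survival time.
import pandas as pd

x_train = pd.read_csv("train_features.csv")   # hypothetical paths
y_train = pd.read_csv("train_labels.csv")
test_df = pd.read_csv("test_features.csv")

estimator, x_test, x_train, y_train = fit_and_prepare(x_train, y_train, test_df)
risk_scores = estimator.predict(x_test)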
Example #2
def train_coxph(data_df, r_splits):
  c_index_at = []
  c_index_30 = []

  time_auc_30 = []
  time_auc_60 = []
  time_auc_365 = []

  for i in range(len(r_splits)):
    print("\nIteration %s"%(i))
    #DATA PREP
    df_train, df_val, df_test, df_test_30 = prepare_datasets(data_df, r_splits[i][2], r_splits[i][1], r_splits[i][0])

    (data_x, data_y), (val_x, val_y), (test_x, test_y), (test_30_x, test_30_y) = df2array(data_df, df_train, df_val, df_test, df_test_30)

    estimator = CoxPHSurvivalAnalysis(alpha=1e-04)
    estimator.fit(data_x, data_y)

    c_index_at.append(estimator.score(test_x, test_y))
    c_index_30.append(estimator.score(test_30_x, test_30_y))

    # compute cumulative/dynamic AUC at fixed horizons (no eval() needed)
    for time_x, auc_list in [(30, time_auc_30), (60, time_auc_60), (365, time_auc_365)]:
      t_auc, t_mean_auc = cumulative_dynamic_auc(data_y, test_y, estimator.predict(test_x), time_x)
      auc_list.append(t_auc[0])

    print("C-index_30:", c_index_30[i])
    print("C-index_AT:", c_index_at[i])

    print("time_auc_30", time_auc_30[i])
    print("time_auc_60", time_auc_60[i])
    print("time_auc_365", time_auc_365[i]) 

  return c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365
Example #3
import numpy as np

from sksurv.linear_model import CoxPHSurvivalAnalysis


# Fit a univariate Cox model per feature and score it by concordance index
def fit_and_score_features(X, y):
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis()
    for j in range(n_features):
        Xj = X[:, j:j+1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores
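
# Usage sketch (an assumption, mirroring the sksurv tutorial): rank features
# by their univariate concordance index; data_x_numeric/data_y refer to the
# one-hot encoded survival data prepared as in the snippets further below.
scores = fit_and_score_features(data_x_numeric.values, data_y)
print(pd.Series(scores, index=data_x_numeric.columns).sort_values(ascending=False))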
Example #4
    def mpss_ph_sksurv(self):
        """

        Performs proportional hazards regression using sksurv package.

        :return: Feature importance
        """
        # Reformat for sksurv package
        x_train = pd.DataFrame(self.x_train)
        y_structured = [
            (ll, s) for ll, s in zip(self.y_train.astype(bool), self.scores)
        ]
        y_structured = np.array(y_structured,
                                dtype=[('class', 'bool_'),
                                       ('score', 'single')])

        # Remove any feature columns that are all 0 values, otherwise cannot run regression
        x_train_nonzero = x_train.loc[:, (x_train != 0).any(axis=0)]

        # Run proportional hazards regression
        estimator = CoxPHSurvivalAnalysis(alpha=0.1, verbose=1)
        estimator.fit(x_train_nonzero, y_structured)
        prediction = estimator.predict(x_train_nonzero)

        # Estimate p-values for each feature with a univariate F-test on the scores
        f_stat, pvals = f_regression(x_train_nonzero, y_structured['score'])
        pvalues_df = pd.DataFrame(pd.Series(
            pvals, index=x_train_nonzero.columns).sort_values(ascending=False),
                                  columns=['p']).reset_index()

        # Calculate concordance indicating the goodness of fit
        concordance = concordance_index_censored(self.y_train.astype(bool),
                                                 self.scores, prediction)
        print('concordance', concordance[0])

        # Dataframe with coefficients, absolute value of coefficients, and p-values
        importance = pd.DataFrame(estimator.coef_, columns=['coef'])
        importance['coef_abs'] = importance['coef'].abs()
        # align each coefficient with the nonzero feature column it belongs to
        importance['feature'] = x_train_nonzero.columns
        importance = importance.merge(pvalues_df,
                                      left_on='feature',
                                      right_on='index').drop('index', axis=1)

        # Sort feature importance
        importance = importance.sort_values(
            'coef_abs', ascending=False).reset_index(drop=True)
        return importance
Example #5
def cox(name):

    filename = filename_dict[name]
    raw_data = pd.read_csv(os.path.join(DATA_DIR, filename))
    formatted_x, formatted_y = sksurv_data_formatting(raw_data)

    x_train, x_test, y_train, y_test = train_test_split(
        formatted_x, formatted_y, test_size=0.25, random_state=RANDOM_STATE)

    estimator = CoxPHSurvivalAnalysis()
    estimator.fit(x_train, y_train)

    prediction = estimator.predict(x_test)
    result = concordance_index_censored(y_test["Status"],
                                        y_test["Survival_in_days"], prediction)

    return result[0]
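
# Hypothetical call (filename_dict, DATA_DIR, and the 'veterans' key are
# defined outside this excerpt; the key is purely illustrative):
c_index = cox('veterans')
print("c-index: %.3f" % c_index)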
Example #6
trainData = trainData[:, lowerbound:upperbound]

data_y = trainData[:, :2]
data_x = trainData[:, 2:]
n_rows, n_cols = data_x.shape
# add tiny random jitter so no feature column is constant (avoids a singular matrix)
data_x += 0.001 * np.random.random((n_rows, n_cols))
gf_day = list(trainData[:, 0])
gf_1year_label = [v == 1 for v in trainData[:, 1]]
dt = np.dtype('bool,float')
data_y = [(gf_1year_label[i], gf_day[i]) for i in range(len(gf_1year_label))]
data_y = np.array(data_y, dtype=dt)

t1 = time()
estimator = CoxPHSurvivalAnalysis()
estimator.fit(data_x[:train_num], data_y[:train_num])
print('fitting estimate cost {} seconds'.format(int(time() - t1)))
print(estimator.score(data_x[train_num:], data_y[train_num:]))
# the remainder of the original file was commented out with a triple-quoted
# string; it is kept below and closed so the snippet still parses:
'''
data_x, data_y = load_veterans_lung_cancer()

#pd.DataFrame.from_records(data_y[[11, 5, 32, 13, 23]], index=range(1, 6))


time, survival_prob = kaplan_meier_estimator(data_y["Status"], data_y["Survival_in_days"])
plt.step(time, survival_prob, where="post")
plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")

print(data_x["Treatment"].value_counts())
'''
Example #7
#%%
fig, ax = viz.plot_c_index(c_index)
fig.savefig(PATH_RESULTS / ('c-index.pdf'), dpi=300, bbox_inches='tight')

# %%
# Explainability using SHAP.
# Due to the long computational time required for kernel SHAP, we will
# only compare the reference case (CPH) and the best-performing ML model (XGB).
#
# First, we need to split our data and fit the models (using their
# best parameters).

#%%
X_ss_train, X_ss_test, y_ss_train, y_ss_test = train_test_split(
    X_ss, y_ss, test_size=1 / n_splits, random_state=SEED)
cph.fit(X_ss_train, y_ss_train)

#%%
X_xgb_train, X_xgb_test, y_xgb_train, y_xgb_test = train_test_split(
    X_xgb, y_xgb, test_size=1 / n_splits, random_state=SEED)
xgb.set_params(**best_params_xgb)
xgb.fit(X_xgb_train, y_xgb_train)

# %%
# Then, we compute the SHAP values.
#
# In the case of CPH (i.e., when using SHAP's Kernel Explainer),
# this can be VERY slow. Be careful! Therefore, we will compute it just once,
# save it to disk, and load it back afterwards.
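
#%%
# A sketch of the elided step (an assumption, not the original code): Kernel
# SHAP on the fitted CPH model, cached to disk so the slow computation runs
# only once. shap_path and the k-means background summary are illustrative.
import joblib
import shap

shap_path = PATH_RESULTS / 'shap_cph.joblib'
if shap_path.exists():
    shap_values_cph = joblib.load(shap_path)
else:
    # KernelExplainer treats cph.predict as a black box; summarizing the
    # background data keeps the number of model evaluations manageable.
    background = shap.kmeans(X_ss_train, 10)
    explainer = shap.KernelExplainer(cph.predict, background)
    shap_values_cph = explainer.shap_values(X_ss_test)
    joblib.dump(shap_values_cph, shap_path)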

Example #8
# Kaplan-Meier curves per cell type; the loop header is missing from the
# excerpt and is reconstructed here following the sksurv tutorial:
for value in data_x["Celltype"].unique():
    mask = data_x["Celltype"] == value
    time_cell, survival_prob_cell = kaplan_meier_estimator(
        data_y["Status"][mask], data_y["Survival_in_days"][mask])
    plt.step(time_cell, survival_prob_cell, where="post",
             label="%s (n = %d)" % (value, mask.sum()))

plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")

from sksurv.preprocessing import OneHotEncoder

data_x_numeric = OneHotEncoder().fit_transform(data_x)
data_x_numeric.head()

from sksurv.linear_model import CoxPHSurvivalAnalysis

estimator = CoxPHSurvivalAnalysis()
estimator.fit(data_x_numeric, data_y)

pd.Series(estimator.coef_, index=data_x_numeric.columns)

x_new = pd.DataFrame.from_dict({
    1: [65, 0, 0, 1, 60, 1, 0, 1],
    2: [65, 0, 0, 1, 60, 1, 0, 0],
    3: [65, 0, 1, 0, 60, 1, 0, 0],
    4: [65, 0, 1, 0, 60, 1, 0, 1]},
    orient='index', columns=data_x_numeric.columns)
x_new

import numpy as np

pred_surv = estimator.predict_survival_function(x_new)
time_points = np.arange(1, 1000)
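
# The excerpt stops after defining the time grid; in the sksurv tutorial the
# predicted survival functions are then evaluated and plotted roughly so:
for i, surv_func in enumerate(pred_surv):
    plt.step(time_points, surv_func(time_points), where="post",
             label="Sample %d" % (i + 1))
plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")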
Example #9
                    x = np.asarray([
                        feat[anno][f_name][:, rater]
                        for f_name in selected_features[n_split]
                    ])
                    x = np.swapaxes(x, 0, 1)  # (n_samples, n_features)

                    x_train, x_test = x[split['train']], x[split['test']]
                    y_train, y_test = survival_data[
                        split['train']], survival_data[split['test']]

                    # Model (n_iter must be an integer, not a float)
                    predictor = CoxPHSurvivalAnalysis(alpha=0, n_iter=int(1e9))

                    try:
                        predictor.fit(x_train, y_train[['event', 'time']])
                        c_indexes.append(
                            predictor.score(x_test, y_test[['event', 'time']]))
                        risk_score_train = predictor.predict(x_train)
                        risk_score = predictor.predict(x_test)
                        high_risk_masks.append(
                            risk_score > np.median(risk_score_train))
                        y_tests.append(y_test)
                    except Exception as e:
                        logger.warning("Error {}".format(str(e)))
                        c_indexes.append(np.nan)

                # ----------------------- Kaplan-Meier --------------------------------
                high_risk_mask = np.concatenate(high_risk_masks)

                y_tests = np.concatenate(y_tests)
Example #10
#data_x_1['date_hour'] = data_x_1['date'].dt.hour
#data_x_1['date_minute'] = data_x_1['date'].dt.minute
#data_x_1['date_second'] = data_x_1['date'].dt.second
#data_x_1.drop(columns=['date','created', 'install_diff','device_brand','install_seconds','user_agent'], inplace=True)
#data_x_1_numeric = pd.get_dummies(data_x_1, dummy_na=True, prefix_sep='=')

#%%
data_y_1 = np.fromiter(zip(
    data_full_1.head(100)["status_censored"],
    data_full_1.head(100)["in_seconds"]),
                       dtype=[('status_censored', bool),
                              ('in_seconds', np.float64)])

#%%
estimator = CoxPHSurvivalAnalysis(alpha=0.1)
estimator.fit(data_x_1_numeric.head(100), data_y_1)
estimator.score(data_x_1_numeric.head(100), data_y_1)


#%%
def fit_and_score_features(X, y, alpha=0.1):
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis(alpha=alpha)
    for j in range(n_features):
        Xj = X[:, j:j + 1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores
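
# Usage sketch (an assumption): score each feature on the same 100-row
# subset that the single model above was fitted on.
scores = fit_and_score_features(data_x_1_numeric.head(100).values, data_y_1, alpha=0.1)
print(pd.Series(scores, index=data_x_1_numeric.columns).sort_values(ascending=False))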

Example #11
for i in range(0, len(_event)):
    x, y = kaplan_meier_estimator(_event[i], _time[i])
    plt.step(x, y, where="post", label=str(i))

plt.legend()

plt.show()

_train_l = numpy.array(list(_train_l), dtype='bool,f4')

_test_l = numpy.array(list(_test_l), dtype='bool,f4')

# create the Cox proportional hazards model
estimator = CoxPHSurvivalAnalysis()

estimator.fit(_train_d, _train_l)

# create the elastic net penalized Cox model (Coxnet)
clf = CoxnetSurvivalAnalysis(n_alphas=5, tol=0.1)

# train model
clf.fit(_train_d, _train_l)

result = []
# evaluate the concordance index at every fitted alpha
# (tft / timet, the test event indicator and time, come from elided code)
for v in clf.alphas_:
    res = clf.predict(_test_d, alpha=v)
    result.append(concordance_index_censored(tft, timet, res))

# predict risk scores for the test set (at the default alpha)
clf.predict(_test_d)
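
# Possible follow-up (not in the original): keep the alpha whose predictions
# achieve the highest concordance index on the test set.
c_index_per_alpha = [r[0] for r in result]
best_alpha = clf.alphas_[int(numpy.argmax(c_index_per_alpha))]
print("best alpha:", best_alpha, "c-index:", max(c_index_per_alpha))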
Example #12
# Import libraries for feature scaling, fitting, and evaluation of the survival model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored
from sksurv.util import Surv

# Load flux data as the X variable
X = pd.read_csv('fluxes.csv')
# Load survival data as the Y variable, converting it to the structured
# array format sksurv expects (boolean event indicator, float time)
Y = Surv.from_dataframe('Event', 'Time', pd.read_csv('survival_data.csv'))

# Define the training and test sets and specify the proportion of data to be used as the test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 1)

# Perform feature scaling on the flux data
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fit the training data to the Cox proportional hazards model
estimator = CoxPHSurvivalAnalysis()
estimator.fit(X_train, Y_train)

# Evaluate the fit of the survival model using Harrell's concordance index
prediction = estimator.predict(X_test)
result = concordance_index_censored(Y_test['Event'], Y_test['Time'], prediction)