def test_uno_auc_times_failure(uno_auc_times_failure_data):
    """Invalid evaluation times must raise ValueError with the fixture's message."""
    y_train, y_test, times, match = uno_auc_times_failure_data

    risk_scores = numpy.random.randn(y_test.shape[0])
    with pytest.raises(ValueError, match=match):
        cumulative_dynamic_auc(y_train, y_test, risk_scores, times)
Exemple #2
0
def train_coxph(data_df, r_splits):
  """Train a CoxPH model on each split and collect C-indices and time-dependent AUCs.

  # Arguments
      data_df: full dataset DataFrame.
      r_splits: list of splits; r_splits[i] holds the index sets handed to
          prepare_datasets (passed as r_splits[i][2], [1], [0] below).
  # Returns
      c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365 lists,
      one entry per split.
  """
  c_index_at = []
  c_index_30 = []

  # One AUC list per evaluation horizon (days). A dict keyed by horizon
  # replaces the original eval() on dynamically built variable names.
  time_aucs = {30: [], 60: [], 365: []}

  for i in range(len(r_splits)):
    print("\nIteration %s"%(i))
    #DATA PREP
    df_train, df_val, df_test, df_test_30 = prepare_datasets(data_df, r_splits[i][2], r_splits[i][1], r_splits[i][0])

    (data_x, data_y), (val_x, val_y), (test_x, test_y), (test_30_x, test_30_y) = df2array(data_df, df_train, df_val, df_test, df_test_30)

    estimator = CoxPHSurvivalAnalysis(alpha=1e-04)
    estimator.fit(data_x, data_y)

    c_index_at.append(estimator.score(test_x, test_y))
    c_index_30.append(estimator.score(test_30_x, test_30_y))

    for time_x in [30, 60, 365]:
      t_auc, t_mean_auc = cumulative_dynamic_auc(data_y, test_y, estimator.predict(test_x), time_x)
      time_aucs[time_x].append(t_auc[0])

    print("C-index_30:", c_index_30[i])
    print("C-index_AT:", c_index_at[i])

    print("time_auc_30", time_aucs[30][i])
    print("time_auc_60", time_aucs[60][i])
    print("time_auc_365", time_aucs[365][i])

  return c_index_at, c_index_30, time_aucs[30], time_aucs[60], time_aucs[365]
Exemple #3
0
def evaluate_performance(T_train, c_train, T_test, c_test, prediction, time_horizon,
                         num_causes=2, cause_names=["Cause 1", "Cause 2"]):
    """Evaluate cause-specific predictions with Harrell's C, Uno's C and dynamic AUC.

    # Arguments
        T_train, T_test: pandas Series of survival times.
        c_train, c_test: pandas Series of cause labels (1-indexed; 0 = censored).
        prediction: mapping from cause label (1-indexed) to predicted survival
            probabilities per test sample.
        time_horizon: timepoint at which the dynamic AUC is evaluated.
        num_causes: number of competing causes to evaluate.
        cause_names: display names, one per cause.
    """
    Harell_c_index = []
    UNO_c_index = []
    dynamic_auc = []

    for cause in range(num_causes):
        event = cause + 1  # cause labels are 1-indexed in c_train / c_test

        # Structured arrays (event indicator, time) as required by sksurv.
        # .iloc[k] replaces the original positional .loc[index[k]] lookup.
        y_train = np.array([((c_train.iloc[k] == event), T_train.iloc[k])
                            for k in range(len(T_train))],
                           dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
        y_test = np.array([((c_test.iloc[k] == event), T_test.iloc[k])
                           for k in range(len(T_test))],
                          dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

        Harell_c_index.append(concordance_index(T_test, prediction[event],
                                                event_observed=(c_test == event) * 1))
        # Truncation time for Uno's C: latest observed training time.
        tau = max(y_train['Survival_in_days'])
        # 1 - survival probability serves as the risk score.
        ci_tau = concordance_index_ipcw(y_train, y_test, 1 - prediction[event], tau=tau)[0]

        UNO_c_index.append(ci_tau)
        try:
            dynamic_auc_val = cumulative_dynamic_auc(y_train, y_test, 1 - prediction[event],
                                                     times=[time_horizon])[0][0]
        except ValueError:
            # Best-effort: AUC can be undefined (e.g. no events before the horizon).
            print('*warning: exception while calculating dynamic_auc, dynamic_auc is not calculated*')
            dynamic_auc_val = "-"
        dynamic_auc.append(dynamic_auc_val)
        print("--- Cause: {} -> [C-index: {:0.4f} ] [Dynamic AUC-ROC: {} ]".format(
            cause_names[cause],
            UNO_c_index[-1],
            '{:0.4f}'.format(dynamic_auc[-1]) if dynamic_auc[-1] != "-"  else "-"))
def dynamic_auc(y_train, y_test, y_pred, year=3):
    """Dynamic or Time-Dependent AUC
     is the average of how often a model says X is greater than Y when,
     in the observed data, X is indeed greater than Y.
    Computed with sksurv.metrics.cumulative_dynamic_auc (Uno's estimator):
    https://scikit-survival.readthedocs.io/en/stable/api/generated/sksurv.metrics.cumulative_dynamic_auc.html

    Parameters
    ----------
    y_train :  pandas.DataFrame
        DataFrame with annotation of samples. Two columns are mandatory:
        Event (binary labels), Time to event (float time to event).
    y_test :  pandas.DataFrame
        DataFrame with annotation of samples. Two columns are mandatory:
        Event (binary labels), Time to event (float time to event).
    y_pred : array-like
        List of predicted risk scores.
    year: float
        Timepoint at which to calculate the AUC score
    Returns
    -------
    float [0, 1]
        dynamic auc for specified year
    """
    # Convert DataFrames to the structured-array format sksurv expects.
    structured_y_train = structure_y_to_sksurv(y_train)
    structured_y_test = structure_y_to_sksurv(y_test)

    # [0][0]: first (only) entry of the AUC array for the single timepoint.
    return cumulative_dynamic_auc(
        structured_y_train,
        structured_y_test,
        y_pred,
        [year],
    )[0][0]
        def functionToOptimize(**params):
            """Objective for Bayesian optimization: negative mean 4-fold CV AUC.

            params['alphas'] arrives as an exponent; the Coxnet model is fit
            with alphas = [2 ** params['alphas']].
            """

            self.counter += 1
            print(f"Bayesian Optimization model: {2 ** params['alphas']}; time: {self.counter}")



            model = CoxnetSurvivalAnalysis(l1_ratio = 1.0, max_iter = 1000000)
            params["alphas"] = [2 ** params["alphas"]]



            model.set_params(**params)

            cvAucMeans = []
            for trainIndex, testIndex in KFold(n_splits = 4).split(self.data.values):

                # NOTE(review): tags are indexed with trainIndex[:, None]
                # (a column vector) and then flattened below - confirm this
                # matches the layout of self.data.tags.
                trainX, trainY = self.data.values[trainIndex,], self.data.tags[trainIndex[:, None],]
                testX, testY = self.data.values[testIndex,:], self.data.tags[testIndex[:, None],]

                trainY = np.reshape(trainY, -1)
                testY = np.reshape(testY, -1)

                model.fit(trainX, trainY)

                # Evaluation grid: 15 percentiles of the fold's test times (5th-81st).
                times = np.percentile(testY["Time_in_days"], np.linspace(5, 81, 15))
                # testY is passed as both train and test survival data, i.e.
                # censoring weights are estimated on the held-out fold itself.
                _, meanAuc = cumulative_dynamic_auc(testY, testY,
                                                      model.predict(testX),
                                                      times)
                cvAucMeans.append(meanAuc)

            # Negated because the optimizer maximizes the objective.
            return -np.mean(cvAucMeans)
Exemple #6
0
def test_uno_auc_time_dependent_without_censoring(
        uno_auc_time_dependent_without_censoring_data):
    """AUC and iAUC match the fixture's expectations when there is no censoring."""
    y, times, estimate, expected, expect_iauc = uno_auc_time_dependent_without_censoring_data

    auc, iauc = cumulative_dynamic_auc(y, y, estimate, times)
    assert_array_almost_equal(auc, expected)
    # The original compared iauc with itself (the fixture value was shadowed
    # by the function result), which was vacuously true.
    assert round(abs(iauc - expect_iauc), 6) == 0
def output_simulations(surv, df_train, x_test, df_test, name):
    """ Compute the output of the model on the test set
    # Arguments
        surv: predicted survival DataFrame (time grid x samples).
        df_train: training dataset with 'status' and 'yy' columns
        x_test: 20 simulated input variables
        df_test: test dataset with 'status' and 'yy' columns
        name: name of the model
    # Returns
        results_test: AUC and Uno C-index at median survival time
    """

    surv_train = skSurv.from_arrays(event=df_train['status'],
                                    time=df_train['yy'])
    surv_test = skSurv.from_arrays(event=df_test['status'], time=df_test['yy'])

    # Censoring rate (%) in the test set.
    n_test = df_test['status'].shape[0]
    cens_test = 100. - df_test['status'].sum() * 100. / n_test

    # linspace(0, 50, 2) yields the 0th and 50th percentiles; index 1 is the median.
    time_med = np.percentile(surv_test['time'], np.linspace(0, 50, 2))
    t_med = time_med[1]

    # Negated survival probability at t_med serves as the risk score.
    risk = -determine_surv_prob(surv, t_med)
    auc_med = float(cumulative_dynamic_auc(surv_train, surv_test, risk, t_med)[0])
    unoc = float(concordance_index_ipcw(surv_train, surv_test, risk, t_med)[0])

    return pd.DataFrame({
        't_med': t_med,
        'auc_med': [auc_med],
        'unoc': [unoc],
        'cens_rate': [cens_test]
    })
def plot_cumulative_dynamic_auc(risk_score, label, color=None):
    # Plot the time-dependent AUC curve for one risk score.
    # Reads module-level y_train, y_test and times (evaluation time grid).
    auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)

    # AUC curve plus its mean as a dashed reference line on the current axes.
    plt.plot(times, auc, marker="o", color=color, label=label)
    plt.xlabel("days from enrollment")
    plt.ylabel("time-dependent AUC")
    plt.axhline(mean_auc, color=color, linestyle="--")
    plt.legend()
    def rocInfo(self):
        """Compute time-dependent ROC AUC curves for every dataset of the model.

        Returns (auc, mean, times): per-dataset AUC arrays, mean AUCs and the
        evaluation time grids.
        """
        self.upgradeInfo("Calculating model ROC curves")

        from sksurv.metrics import cumulative_dynamic_auc

        auc, mean, times = {}, {}, {}

        for name, dataset in self.model.dataset.items():

            # Unique event times, sorted; the extremes are dropped so the
            # estimator is evaluated strictly inside the observed range.
            grid = sorted(set(dataset["tags"]["Time_in_days"]))[1:-1]
            times[name] = grid

            predictions = self.model.model.predict(dataset["values"])
            # tags serve as both train and test survival data.
            auc[name], mean[name] = cumulative_dynamic_auc(
                dataset["tags"], dataset["tags"], predictions, grid)

        return auc, mean, times
Exemple #10
0
def cox_dynamic_auc(df, ann, n, year=3):
    """Select n features with the highest time-dependent auc on one-factor Cox regression.

    Parameters
    ----------
    df : pandas.DataFrame
        A pandas DataFrame whose rows represent samples
        and columns represent features.
    ann : pandas.DataFrame
        DataFrame with annotation of samples. Two columns are mandatory:
        Event (binary labels) and Time to event (float time to event).
    n : int
        Number of features to select.
    year: float
        Timepoint for which to calculate AUC score
    Returns
    -------
    list
        List of n features associated with the highest auc.
    """
    ann = ann[['Event', 'Time to event']]

    structured_y = structure_y_to_sksurv(ann)
    columns = df.columns

    scores = []
    for column in columns:
        # One-factor Cox model fit on a single feature column.
        df_j = df[[column]]
        model = CoxRegression()
        model.fit(df_j, ann)
        preds = model.predict(df_j)
        auc, _ = cumulative_dynamic_auc(structured_y, structured_y, preds,
                                        [year])
        # Single timepoint requested, so the AUC array has one entry.
        scores.append(auc[0])

    # Rank features by AUC, highest first.
    scores, features = zip(
        *sorted(zip(scores, columns), key=lambda x: x[0], reverse=True))

    return features[:n]
def plot_cumulative_dynamic_auc(risk_score, label, color=None):
    # Plot the time-dependent AUC curve for one risk score.
    # Reads module-level y_train, y_test and times (evaluation time grid).
    auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)

    # AUC curve plus its mean as a dashed reference line on the current axes.
    plt.plot(times, auc, marker="o", color=color, label=label)
    plt.xlabel("days from enrollment")
    plt.ylabel("time-dependent AUC")
    plt.axhline(mean_auc, color=color, linestyle="--")
    plt.legend()


# Plot the time-dependent AUC of each numeric feature used directly as a
# risk score, and compute Uno's C-index over the same time grid.
for i, col in enumerate(num_columns):
    plot_cumulative_dynamic_auc(x_test[:, i], col, color="C{}".format(i))
    # NOTE(review): ret is overwritten on every iteration and never read
    # after the loop - presumably meant to be collected or printed; confirm.
    ret = concordance_index_ipcw(y_train, y_test, x_test[:, i], tau=times[-1])

from sksurv.datasets import load_veterans_lung_cancer

va_x, va_y = load_veterans_lung_cancer()

# Cox PH pipeline with one-hot encoding of categorical covariates.
cph = make_pipeline(OneHotEncoder(), CoxPHSurvivalAnalysis())
cph.fit(va_x, va_y)

# Weekly evaluation grid: days 7 through 182.
va_times = np.arange(7, 183, 7)
# estimate performance on training data, thus use `va_y` twice.
va_auc, va_mean_auc = cumulative_dynamic_auc(va_y, va_y, cph.predict(va_x),
                                             va_times)

plt.plot(va_times, va_auc, marker="o")
plt.axhline(va_mean_auc, linestyle="--")
plt.xlabel("days from enrollment")
plt.ylabel("time-dependent AUC")
plt.grid(True)
Exemple #12
0
def train_deepsurv(data_df, r_splits):
  """Train a DeepSurv (MLP + CoxPH) model on each split and collect metrics.

  # Arguments
      data_df: full dataset DataFrame.
      r_splits: list of splits; r_splits[i] holds the index sets handed to
          prepare_datasets (passed as r_splits[i][2], [1], [0] below).
  # Returns
      c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365 lists,
      one entry per split.
  """
  epochs = 100
  verbose = True

  num_nodes = [32]
  out_features = 1
  batch_norm = True
  dropout = 0.6
  output_bias = False

  c_index_at = []
  c_index_30 = []

  # One AUC list per evaluation horizon (days). A dict keyed by horizon
  # replaces the original eval() on dynamically built variable names.
  time_aucs = {30: [], 60: [], 365: []}

  for i in range(len(r_splits)):
    print("\nIteration %s"%(i))

    #DATA PREP
    df_train, df_val, df_test, df_test_30 = prepare_datasets(data_df, r_splits[i][2], r_splits[i][1], r_splits[i][0])

    # Standardize every column except identifiers and labels.
    xcols = list(df_train.columns)

    for col_name in ["subject_id", "event", "duration"]:
      if col_name in xcols:
        xcols.remove(col_name)

    cols_standardize = xcols

    standardize = [([col], StandardScaler()) for col in cols_standardize]

    x_mapper = DataFrameMapper(standardize)

    x_train = x_mapper.fit_transform(df_train).astype('float32')
    x_val = x_mapper.transform(df_val).astype('float32')
    x_test = x_mapper.transform(df_test).astype('float32')
    x_test_30 = x_mapper.transform(df_test_30).astype('float32')

    labtrans = CoxTime.label_transform()
    get_target = lambda df: (df['duration'].values, df['event'].values)
    y_train = labtrans.fit_transform(*get_target(df_train))
    y_val = labtrans.transform(*get_target(df_val))

    durations_test, events_test = get_target(df_test)
    durations_test_30, events_test_30 = get_target(df_test_30)
    val = tt.tuplefy(x_val, y_val)

    # Structured survival arrays for the AUC metric below.
    (train_x, train_y), (val_x, val_y), (test_x, test_y), _ = df2array(data_df, df_train, df_val, df_test, df_test_30)

    #MODEL
    in_features = x_train.shape[1]

    callbacks = [tt.callbacks.EarlyStopping()]

    net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm, dropout, output_bias=output_bias)

    model = CoxPH(net, tt.optim.Adam)
    model.optimizer.set_lr(0.0001)

    # Odd sample counts get batch size 255 - presumably to avoid a
    # problematic final mini-batch; confirm intent.
    if x_train.shape[0] % 2:
      batch_size = 255
    else:
      batch_size = 256

    log = model.fit(x_train, y_train, batch_size, epochs, callbacks, val_data=val, val_batch_size=batch_size)

    model.compute_baseline_hazards()

    surv = model.predict_surv_df(x_test)
    ev = EvalSurv(surv, durations_test, events_test, censor_surv='km')
    c_index_at.append(ev.concordance_td())

    surv_30 = model.predict_surv_df(x_test_30)
    ev_30 = EvalSurv(surv_30, durations_test_30, events_test_30, censor_surv='km')
    c_index_30.append(ev_30.concordance_td())

    for time_x in [30, 60, 365]:
      va_auc, va_mean_auc = cumulative_dynamic_auc(train_y, test_y, model.predict(x_test).flatten(), time_x)

      time_aucs[time_x].append(va_auc[0])

    print("C-index_30:", c_index_30[i])
    print("C-index_AT:", c_index_at[i])

    print("time_auc_30", time_aucs[30][i])
    print("time_auc_60", time_aucs[60][i])
    print("time_auc_365", time_aucs[365][i])

  return c_index_at, c_index_30, time_aucs[30], time_aucs[60], time_aucs[365]
def test_uno_auc_whas500(uno_auc_whas500_data):
    """Uno AUC on WHAS500 matches the precomputed expected values."""
    y_train, y_test, estimate, times, expect_auc, expect_iauc = uno_auc_whas500_data

    actual_auc, actual_iauc = cumulative_dynamic_auc(y_train, y_test, estimate, times)
    assert_array_almost_equal(actual_auc, expect_auc)
    assert_almost_equal(actual_iauc, expect_iauc)
def test_uno_auc_censoring_failure(uno_auc_censoring_failure_data):
    """cumulative_dynamic_auc must raise ValueError for invalid censoring."""
    y_train, y_test, times, estimate, match = uno_auc_censoring_failure_data

    with pytest.raises(ValueError, match=match):
        cumulative_dynamic_auc(y_train, y_test, estimate, times)
Exemple #15
0
print(f"weighted brier score: {weighted_br_score: .4f}")

#%% Time-dependent Area under the ROC
# Build structured arrays (event indicator + survival time) as expected by
# sksurv metrics. Each name first holds the dtype, then is rebound to the
# array built with that dtype.
survival_train=np.dtype([('event',data_event_train.dtype),('surv_time',data_time_train.dtype)])
survival_train=np.empty(len(data_event_train),dtype=survival_train)
survival_train['event']=data_event_train
survival_train['surv_time']=data_time_train

survival_test=np.dtype([('event',data_event_test.dtype),('surv_time',data_time_test.dtype)])
survival_test=np.empty(len(data_event_test),dtype=survival_test)
survival_test['event']=data_event_test
survival_test['surv_time']=data_time_test

# Evaluation grid: every 75 units from the earliest test time to half the latest.
event_times = np.arange(np.min(data_time_test), np.max(data_time_test)/2, 75)

test_auc, test_mean_auc = cumulative_dynamic_auc(survival_train, survival_test, model_prediction, event_times)

print(f"Time-dependent Area under the ROC: {test_mean_auc: .4f}")

plt.plot(event_times, test_auc, marker="o")
plt.axhline(test_mean_auc, linestyle="--")
plt.xlabel("Days from Enrollment")
plt.ylabel("Time-dependent Area under the ROC")
plt.grid(True)
plt.savefig('typical_auc_batch.png',dpi = 600)

#%% log -partial likelihood
log_lik = log_partial_lik(model_prediction.reshape(-1,1), data_event_test.reshape(-1,1))
print(f"Log partial likelihood: {log_lik: .4f}")

#%% individual fairness measures
Exemple #16
0
def train_LSTMCox(data_df, r_splits):
  """Train an LSTM-Cox model on each split and collect metrics.

  # Arguments
      data_df: full dataset DataFrame; rows carry an "x0" column with
          per-sample sequence features.
      r_splits: list of splits; r_splits[i] holds the index sets handed to
          prepare_datasets (passed as r_splits[i][2], [1], [0] below).
  # Returns
      c_index_at, c_index_30, time_auc_30, time_auc_60, time_auc_365 lists,
      one entry per split.
  """
  epochs = 100
  verbose = True

  in_features = 768
  out_features = 1
  batch_norm = True
  dropout = 0.6
  output_bias = False

  c_index_at = []
  c_index_30 = []

  # One AUC list per evaluation horizon (days). A dict keyed by horizon
  # replaces the original eval() on dynamically built variable names.
  time_aucs = {30: [], 60: [], 365: []}

  for i in range(len(r_splits)):
    print("\nIteration %s"%(i))

    #DATA PREP
    df_train, df_val, df_test, df_test_30 = prepare_datasets(data_df, r_splits[i][2], r_splits[i][1], r_splits[i][0])

    x_train = np.array(df_train["x0"].tolist()).astype("float32")
    x_val = np.array(df_val["x0"].tolist()).astype("float32")
    x_test = np.array(df_test["x0"].tolist()).astype("float32")
    x_test_30 = np.array(df_test_30["x0"].tolist()).astype("float32")

    labtrans = CoxTime.label_transform()
    get_target = lambda df: (df['duration'].values, df['event'].values)
    y_train = labtrans.fit_transform(*get_target(df_train))
    y_val = labtrans.transform(*get_target(df_val))

    durations_test, events_test = get_target(df_test)
    durations_test_30, events_test_30 = get_target(df_test_30)
    val = tt.tuplefy(x_val, y_val)

    # Structured survival arrays for the AUC metric below.
    (train_x, train_y), (val_x, val_y), (test_x, test_y), _ = df2array(data_df, df_train, df_val, df_test, df_test_30)

    #MODEL
    callbacks = [tt.callbacks.EarlyStopping()]

    net = LSTMCox(768, 32, 1, 1)

    model = CoxPH(net, tt.optim.Adam)
    model.optimizer.set_lr(0.0001)

    # Odd sample counts get batch size 255 - presumably to avoid a
    # problematic final mini-batch; confirm intent.
    if x_train.shape[0] % 2:
      batch_size = 255
    else:
      batch_size = 256

    log = model.fit(x_train, y_train, batch_size, epochs, callbacks, val_data=val, val_batch_size=batch_size)

    model.compute_baseline_hazards()

    surv = model.predict_surv_df(x_test)
    ev = EvalSurv(surv, durations_test, events_test, censor_surv='km')
    c_index_at.append(ev.concordance_td())

    surv_30 = model.predict_surv_df(x_test_30)
    ev_30 = EvalSurv(surv_30, durations_test_30, events_test_30, censor_surv='km')
    c_index_30.append(ev_30.concordance_td())

    for time_x in [30, 60, 365]:
      va_auc, va_mean_auc = cumulative_dynamic_auc(train_y, test_y, model.predict(x_test).flatten(), time_x)

      time_aucs[time_x].append(va_auc[0])

    print("C-index_30:", c_index_30[i])
    print("C-index_AT:", c_index_at[i])

    print("time_auc_30", time_aucs[30][i])
    print("time_auc_60", time_aucs[60][i])
    print("time_auc_365", time_aucs[365][i])

  return c_index_at, c_index_30, time_aucs[30], time_aucs[60], time_aucs[365]
Exemple #17
0
# Cox PH model evaluated over N stratified shuffle splits for each feature
# set in tx. auc and meanAuc appear to be preallocated arrays - confirm.
cph = CoxPHSurvivalAnalysis()
sss = StratifiedShuffleSplit(n_splits=N, test_size=nTest)

for i, x in enumerate(tx):
    x = x.to_numpy()
    j = 0
    for train, test in sss.split(x, y['bio_rec_6']):
        xtrain, ytrain = x[train], y[train]
        xtest, ytest = x[test], y[test]
        cph.fit(xtrain, ytrain)
        # NOTE(review): predictions and AUC are computed on the TRAINING
        # fold (xtrain/ytrain); xtest/ytest are unused - verify intent.
        ypred = cph.predict(xtrain)
        # Three evaluation times strictly inside the observed delay range.
        time = np.linspace(
            np.min(ytrain['bio_rec_6_delay']) + 1,
            np.max(ytrain['bio_rec_6_delay']) - 1, 3)
        auc[i, :], meanAuc[j,
                           i] = cumulative_dynamic_auc(ytrain, ytrain, ypred,
                                                       time)
        j += 1

# Summarize mean AUC per feature set across splits.
meanAuc = pd.DataFrame(data=meanAuc,
                       columns=[
                           '4_feat.', "Prediction 1", 'Prediction 2',
                           'Prediction 3', 'Prediction 4'
                       ])
meanMeanAuc = meanAuc.mean(axis=0)
stdMeanAuc = meanAuc.std(axis=0)
#%%
N = 100
#x = data
#x = x.drop(columns = ['bio_rec_6', 'bio_rec_6_delay'])
tx = [
def output_bootstrap(model, n_iterations, df_train, data_train, y_train,
                     df_test, name):
    """ Compute the output of the model on the bootstrapped test set
    # Arguments
        model: neural network model trained with final parameters.
        n_iterations: number of bootstrap iterations
        df_train: training dataset
        data_train: two columns dataset with survival time and censoring status for training samples
        y_train: survival time
        df_test: test dataset
        name: name of the model
    # Returns
        results_final: mean, empirical 5th/95th percentile bounds, std and
        count of AUC and Uno C-index at 5 and 10 years over the iterations
    """
    if name == "CoxTime" or name == "Cox-CC":
        _ = model.compute_baseline_hazards()
    results_all = pd.DataFrame(columns=['auc5', 'auc10', 'unoc5', 'unoc10'])
    results_final = pd.DataFrame(
        columns=['mean', 'ci95_lo', 'ci95_hi', 'std', 'count'])

    for i in range(n_iterations):
        print(i)
        # Bootstrap resample of the test set (with replacement).
        test_boot = resample(df_test, n_samples=len(df_test), replace=True)
        x_test_boot = test_boot.drop(['surv_test', 'cen_test'], axis=1)
        duration_test_b, event_test_b = test_boot[
            'surv_test'].values, test_boot['cen_test'].values
        data_test_b = skSurv.from_arrays(event=event_test_b,
                                         time=duration_test_b)
        if name == "Cox-CC" or name == "CoxTime" or name == "DeepHit":
            surv = model.predict_surv_df(np.array(x_test_boot,
                                                  dtype='float32'))
        else:
            # Pseudo-observation models: replicate the covariates once per
            # distinct time point before predicting.
            n_picktime = int(y_train[['s']].apply(pd.Series.nunique))
            x_test_boot_all = pd.concat([x_test_boot] * n_picktime)
            time_test = pd.DataFrame(
                np.repeat(np.unique(y_train[['s']]), len(x_test_boot)))
            x_test_boot_all.reset_index(inplace=True, drop=True)
            x_test_boot_all = pd.concat([x_test_boot_all, time_test], axis=1)
            surv = make_predictions_pseudobs(model, y_train, x_test_boot_all,
                                             x_test_boot, name)

        # Survival probabilities at the earliest observed time, 5 and 10
        # years; the negated probability is used as a risk score below.
        # (The original also built an unused time_grid here.)
        prob_5_10 = pd.concat([
            determine_surv_prob(surv, t)
            for t in (duration_test_b.min(), 5, 10)
        ],
                              axis=1)
        auc5 = float(
            cumulative_dynamic_auc(data_train, data_test_b,
                                   -prob_5_10.iloc[:, 1], 5)[0])
        auc10 = float(
            cumulative_dynamic_auc(data_train, data_test_b,
                                   -prob_5_10.iloc[:, 2], 10)[0])
        unoc5 = float(
            concordance_index_ipcw(data_train, data_test_b,
                                   -prob_5_10.iloc[:, 1], 5)[0])
        unoc10 = float(
            concordance_index_ipcw(data_train, data_test_b,
                                   -prob_5_10.iloc[:, 2], 10)[0])
        results = pd.DataFrame({
            'auc5': [auc5],
            'auc10': [auc10],
            'unoc5': [unoc5],
            'unoc10': [unoc10]
        })
        results_all = results_all.append(results,
                                         ignore_index=True,
                                         sort=False)

    # Aggregate bootstrap statistics; the interval bounds are the empirical
    # 5th and 95th percentiles of the sorted scores.
    for column in results_all:
        stats = results_all[column].agg(['mean', 'count', 'std'])
        sorted_scores = np.sort(np.array(results_all[column]), axis=None)
        ci95_lo = sorted_scores[int(0.05 * len(sorted_scores))]
        ci95_hi = sorted_scores[int(0.95 * len(sorted_scores))]
        results_stat = pd.DataFrame({
            'mean': [stats['mean']],
            'ci95_lo': ci95_lo,
            'ci95_hi': [ci95_hi],
            'std': [stats['std']],
            'count': [stats['count']]
        })
        results_final = results_final.append(results_stat,
                                             ignore_index=False,
                                             sort=False)
    results_final.index = results_all.columns.tolist()
    return results_final
Exemple #19
0
def test_uno_auc(uno_auc_data):
    """Uno's cumulative/dynamic AUC matches the fixture's expected values."""
    y_train, y_test, estimate, times, expect_auc, expect_iauc = uno_auc_data

    actual_auc, actual_iauc = cumulative_dynamic_auc(y_train, y_test, estimate, times)
    assert_array_almost_equal(actual_auc, expect_auc)
    assert round(abs(actual_iauc - expect_iauc), 6) == 0