def test_from_array_event_value_wrong_5(surv_arrays):
    """An event indicator holding 0..n-1 must be rejected as non-binary."""
    original_event, time = surv_arrays
    # One distinct integer per sample, so the indicator cannot be binary.
    non_binary_event = numpy.arange(original_event.shape[0])
    expected_message = "event indicator must be binary"

    with pytest.raises(ValueError, match=expected_message):
        Surv.from_arrays(event=non_binary_event, time=time)
def test_from_array_event_value_wrong_3(surv_arrays):
    """Replacing every 0 in the event indicator with 3 must raise ValueError."""
    event, time = surv_arrays
    zero_entries = event == 0
    event[zero_entries] = 3  # edits the fixture array in place
    expected_message = "non-boolean event indicator must contain 0 and 1 only"

    with pytest.raises(ValueError, match=expected_message):
        Surv.from_arrays(event=event, time=time)
def test_from_array_event_value_wrong_4(surv_arrays):
    """Overwriting a single event entry with 3 must raise the binary-indicator error."""
    event, time = surv_arrays
    event[1] = 3  # edits the fixture array in place
    expected_message = "event indicator must be binary"

    with pytest.raises(ValueError, match=expected_message):
        Surv.from_arrays(event=event, time=time)
def output_simulations(surv, df_train, x_test, df_test, name):
    """
    Compute evaluation metrics on the test set at the median observed test time

    # Arguments
        surv: object forwarded to `determine_surv_prob`
            (presumably the predicted survival curves of the test set -- confirm).
        df_train: training dataset; columns 'status' (event) and 'yy' (time) are read
        x_test: not used by this function
        df_test: test dataset; columns 'status' (event) and 'yy' (time) are read
        name: not used by this function
    # Returns
        results_test: one-row DataFrame with columns 't_med', 'auc_med',
            'unoc' and 'cens_rate' (percentage of test rows with status 0,
            assuming 'status' is a 0/1 indicator)
    """
    y_train = skSurv.from_arrays(event=df_train['status'], time=df_train['yy'])
    y_test = skSurv.from_arrays(event=df_test['status'], time=df_test['yy'])

    status_test = df_test['status']
    cens_test = 100. - status_test.sum() * 100. / status_test.shape[0]

    # Percentiles 0 and 50 of the test times are computed; only the median is used.
    t_med = np.percentile(y_test['time'], np.linspace(0, 50, 2))[1]

    # The negated output of determine_surv_prob is passed as the estimate to both metrics.
    auc_result = cumulative_dynamic_auc(
        y_train, y_test, -determine_surv_prob(surv, t_med), t_med)
    auc_med = float(auc_result[0])

    ipcw_result = concordance_index_ipcw(
        y_train, y_test, -determine_surv_prob(surv, t_med), t_med)
    unoc = float(ipcw_result[0])

    return pd.DataFrame({
        't_med': t_med,
        'auc_med': [auc_med],
        'unoc': [unoc],
        'cens_rate': [cens_test]
    })
def output_sim_data(model, surv, X_train, df_train, X_test, df_test):
    """
    Compute the IPCW C-index at the median test time and the integrated Brier score

    # Arguments
        model: not used by this function
        surv: object forwarded to `determine_surv_prob` and `EvalSurv`
            (presumably the predicted survival curves of the test set -- confirm).
        X_train: not used by this function
        df_train: training dataset; columns 'status' (event) and 'yy' (time) are read
        X_test: not used by this function
        df_test: test dataset; columns 'status' (event) and 'yy' (time) are read
    # Returns
        res: one-row DataFrame with columns 'c_median' and 'ibs'
    """
    test_times = df_test['yy']
    # 100 evenly spaced points between the 10th and 90th percentile of the test times.
    time_grid = np.linspace(np.percentile(test_times, 10),
                            np.percentile(test_times, 90),
                            100)
    median_time = np.percentile(test_times, 50)

    y_train = skSurv.from_arrays(event=df_train['status'], time=df_train['yy'])
    y_test = skSurv.from_arrays(event=df_test['status'], time=df_test['yy'])

    risk_at_median = np.array(-determine_surv_prob(surv, median_time))
    c_med = concordance_index_ipcw(y_train, y_test, risk_at_median, median_time)[0]

    evaluator = EvalSurv(surv,
                         np.array(df_test['yy']),
                         np.array(df_test['status']),
                         censor_surv='km')
    ibs = evaluator.integrated_brier_score(time_grid)

    res = pd.DataFrame([c_med, ibs]).T
    res.columns = ['c_median', 'ibs']
    return res
def test_from_array_names_match(surv_arrays):
    """Using the same field name for event and time must raise ValueError."""
    event, time = surv_arrays
    shared_name = 'time_and_event'
    expected_message = "name_time must be different from name_event"

    with pytest.raises(ValueError, match=expected_message):
        Surv.from_arrays(event, time, name_event=shared_name, name_time=shared_name)
def uno_c_failure_data(request):
    """Yield ``(y_train, y_test, estimate, match)`` for the scenario in ``request.param``.

    ``match`` is a message fragment; presumably the consuming test uses it as
    the ``match=`` pattern of an expected error -- confirm against the caller.

    Raises
    ------
    AssertionError
        If ``request.param`` is not one of the known scenario names.
    """
    p = request.param

    if p == 'last_time_uncensored_1':
        y_train = Surv.from_arrays(
            time=(2, 4, 6, 8, 10, 11, 15, 19),
            event=(False, True, False, True, False, False, False, True))
        y_test = Surv.from_arrays(
            time=(1, 3, 5, 7, 12, 13, 20),
            event=(True, False, False, True, True, False, True))
        estimate = (5, 8, 13, 11, 9, 7, 4)
        match = "time must be smaller than largest " \
                "observed time point:"
    elif p == 'last_time_uncensored_2':
        y_train = Surv.from_arrays(
            time=(2, 4, 6, 8, 10, 11, 15, 19),
            event=(False, True, False, True, False, False, False, True))
        y_test = Surv.from_arrays(
            time=(1, 23, 5, 27, 12),
            event=(True, False, True, True, False))
        estimate = (5, 13, 11, 9, 4)
        match = "time must be smaller than largest " \
                "observed time point:"
    elif p == 'zero_prob_1':
        y_train = Surv.from_arrays(
            time=(2, 4, 6, 8, 10, 11, 15, 19),
            event=(False, True, False, True, False, False, False, False))
        y_test = Surv.from_arrays(
            time=(1, 3, 5, 7, 12, 13, 19),
            event=(True, False, False, True, True, False, True))
        estimate = (5, 8, 13, 11, 9, 7, 4)
        match = "censoring survival function is zero " \
                "at one or more time points"
    elif p == 'zero_prob_2':
        y_train = Surv.from_arrays(
            time=(2, 4, 6, 8, 10, 11, 15, 18),
            event=(False, True, False, True, False, False, False, False))
        y_test = Surv.from_arrays(
            time=(1, 3, 5, 7, 12, 13, 19),
            event=(True, False, False, True, True, False, True))
        estimate = (5, 8, 13, 11, 9, 7, 4)
        match = "censoring survival function is zero " \
                "at one or more time points"
    elif p == 'zero_prob_3':
        y_train = Surv.from_arrays(
            time=(2, 4, 6, 8, 10, 11, 15, 18),
            event=(False, True, False, True, False, False, False, False))
        y_test = Surv.from_arrays(
            time=(1, 3, 5, 19, 12, 13, 7),
            event=(True, False, False, True, True, False, True))
        estimate = (5, 8, 13, 11, 9, 7, 4)
        match = "censoring survival function is zero " \
                "at one or more time points"
    else:
        # An explicit raise survives `python -O`, unlike a bare `assert False`,
        # and reports which parameter was not recognised.
        raise AssertionError("unknown fixture parameter: {!r}".format(p))

    yield y_train, y_test, estimate, match
def test_from_array_shape_mismatch(surv_arrays):
    """Event and time arrays of unequal length must be rejected, whichever is shorter."""
    event, time = surv_arrays
    expected_message = "Found input variables with inconsistent numbers of samples"

    # First drop one entry from the event array, then one from the time array.
    mismatched_pairs = (
        (event[1:], time),
        (event, time[1:]),
    )
    for bad_event, bad_time in mismatched_pairs:
        with pytest.raises(ValueError, match=expected_message):
            Surv.from_arrays(bad_event, bad_time)
def test_from_dataframe_wrong_class(surv_data_frame):
    """Passing a dict or an ndarray instead of a DataFrame must raise TypeError."""
    frame = surv_data_frame

    # NOTE(review): the "exepected" spelling is kept verbatim; the pattern must
    # match whatever message the library raises -- confirm before changing it.
    cases = (
        (r"exepected pandas.DataFrame, but got <class 'dict'>", frame.to_dict),
        (r"exepected pandas.DataFrame, but got <class 'numpy.ndarray'>", lambda: frame.values),
    )
    for pattern, make_input in cases:
        with pytest.raises(TypeError, match=pattern):
            Surv.from_dataframe('event', 'time', make_input())
def test_uno_c_all_censored():
    """IPCW and plain censored concordance must agree for this training set.

    Note that every training sample is marked as an event (``True``) here.
    """
    train_time = (2, 4, 6, 8, 10, 11, 15, 19)
    y_train = Surv.from_arrays(time=train_time, event=(True,) * len(train_time))

    y_test = Surv.from_arrays(
        time=(1, 3, 5, 7, 12, 13, 20),
        event=(True, False, False, True, True, False, False))
    estimate = (5, 8, 13, 11, 9, 7, 4)

    uno_result = concordance_index_ipcw(y_train, y_test, estimate)
    harrell_result = concordance_index_censored(y_test['event'], y_test['time'], estimate)

    assert uno_result == harrell_result
def uno_auc_data_20():
    """Return ``(y_train, y_test, estimate)``: 20 training samples, 15 test samples."""
    train_time = [
        77.6, 57.6, 66.6, 67.0, 31.5, 5.5, 67.4, 43.7, 31.7, 71.9,
        81.1, 56.2, 88.1, 2.9, 62.0, 17.2, 88.0, 26.4, 93.5, 79.9,
    ]
    train_event = [1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]

    test_time = [
        10.88, 19.78, 40.92, 98.7, 70.19, 10.15, 28.95, 29.57,
        17.9, 63.78, 36.22, 83.14, 13.69, 99.51, 3.19,
    ]
    test_event = [1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1]

    # One estimate per test sample.
    estimate = [
        -1.019, -0.016, 0.132, 0.269, -0.777, -1.077, 0.894, -1.227,
        -0.417, 0.072, -1.275, -0.91, -0.825, -0.292, -0.045,
    ]

    y_train = Surv.from_arrays(time=train_time, event=train_event)
    y_test = Surv.from_arrays(time=test_time, event=test_event)
    return y_train, y_test, estimate
def generate_survival_data(n_samples, hazard_ratio, baseline_hazard, percentage_cens, rnd):
    """Generate censored survival data with a target fraction of censored samples.

    Event times come from `generate_marker`. Censoring times are drawn
    uniformly with an upper limit chosen so that the censored fraction is as
    close as possible to `percentage_cens` (a fraction, not a percentage).

    `rnd` is only forwarded to `generate_marker`; the censoring draws always
    use a fresh ``RandomState(0)``.

    Returns
    -------
    X_test, y_test : samples whose observed time is below the largest observed event time
    y : survival labels of all samples
    actual_c : third value returned by `generate_marker`, passed through unchanged
    """
    X, time_event, actual_c = generate_marker(n_samples, hazard_ratio, baseline_hazard, rnd)

    def _observe(upper_limit):
        # Re-seeded on every call, so the same limit always yields the same draws.
        censor_rng = np.random.RandomState(0)
        time_censor = censor_rng.uniform(high=upper_limit, size=n_samples)
        is_event = time_event < time_censor
        observed = np.where(is_event, time_event, time_censor)
        return is_event, observed

    def _squared_censoring_gap(upper_limit):
        is_event, _ = _observe(upper_limit)
        censored_fraction = 1.0 - is_event.sum() / is_event.shape[0]
        return (censored_fraction - percentage_cens)**2

    # Search for the upper limit that gives the desired amount of censoring.
    search_bounds = (0, time_event.max())
    res = opt.minimize_scalar(_squared_censoring_gap, method="bounded", bounds=search_bounds)

    # Compute the observed time at the optimum.
    event, time = _observe(res.x)

    # tau is the largest observed event time; only samples observed strictly
    # before it make up the returned test subset.
    tau = time[event].max()
    y = Surv.from_arrays(event=event, time=time)
    before_tau = time < tau

    return X[before_tau], y[before_tau], y, actual_c
def uno_auc_data_15():
    """Return ``(y, estimate)`` for a hand-written sample of 15 observations."""
    observed_time = [
        10.88, 19.78, 40.92, 98.7, 70.19, 10.15, 28.95, 29.57,
        17.9, 63.78, 36.22, 83.14, 13.69, 99.51, 3.19,
    ]
    observed_event = [1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1]

    # One estimate per observation.
    estimate = [
        -1.019, -0.016, 0.132, 0.269, -0.777, -1.077, 0.894, -1.227,
        -0.417, 0.072, -1.275, -0.91, -0.825, -0.292, -0.045,
    ]

    y = Surv.from_arrays(time=observed_time, event=observed_event)
    return y, estimate
def uno_auc_whas500_data(request, whas500_pred):
    """Yield ``(y_train, y_test, estimate, times, expected, iauc)``.

    The first 300 samples of ``whas500_pred`` form the training labels and
    the remainder the test labels and estimates. ``request.param`` selects
    whether the evaluation times are sorted and unique (``'whas500'``) or
    unordered with repeats (``'whas500_unordered_time'``); the expected
    values are the same in both cases.

    Raises
    ------
    AssertionError
        If ``request.param`` is not one of the two known names.
    """
    p = request.param
    event, time, estimate = whas500_pred

    y_train = Surv.from_arrays(event=event[:300], time=time[:300])
    y_test = Surv.from_arrays(event=event[300:], time=time[300:])
    estimate = estimate[300:]

    if p == 'whas500_unordered_time':
        times = (1000, 600, 1400, 200, 400, 1200, 800, 1000, 200)
    elif p == 'whas500':
        times = (200, 400, 600, 800, 1000, 1200, 1400)
    else:
        # An explicit raise survives `python -O`, unlike a bare `assert False`,
        # and reports which parameter was not recognised.
        raise AssertionError("unknown fixture parameter: {!r}".format(p))

    iauc = 0.8045058
    expected = numpy.array([0.7720669, 0.7765915, 0.7962623, 0.8759295,
                            0.8759295, 0.8759513, 0.9147647])

    yield y_train, y_test, estimate, times, expected, iauc
def test_simple():
    """Fit a pure-lasso Coxnet on five samples and compare the path with stored references."""
    y = Surv.from_arrays(
        [True, False, False, True, False],
        [7., 8., 11., 11., 23.],
        name_event="D",
        name_time="Y",
    )
    features = pandas.DataFrame({
        "F1": [1, 1, 1, 0, 0],
        "F2": [23, 43, 54, 75, 67],
        "F3": [120, 98, 78, 91, 79],
        "F4": [0.123, 0.541, 0.784, 0.846, 0.331],
    })

    coxnet = CoxnetSurvivalAnalysis(l1_ratio=1.0)
    coxnet.fit(features.values, y)

    expected_alphas = numpy.array([
        7.02666666666667, 6.40243696630484, 5.83366211207401,
        5.31541564828386, 4.84320877198972, 4.41295145312887,
        4.02091700863675, 3.66370982370111, 3.3382359405709,
        3.04167626017436, 2.77146212443153, 2.52525306776672,
        2.30091654511542, 2.09650946083909, 1.91026133856035,
        1.74055898614351, 1.5859325229961, 1.44504264866632,
        1.31666904246323, 1.19969979362274, 1.09312177046848,
        0.996011845149902, 0.907528897950459, 0.826906531910992,
        0.753446434665921, 0.686512329995589, 0.625524466706047,
        0.569954597101554, 0.519321401555745, 0.473186319551291,
        0.431149751078499, 0.392847595491192, 0.357948097841098,
        0.326148975375191, 0.297174799307102, 0.270774609184727,
        0.24671973919085, 0.22480183754923, 0.204831061881182,
        0.186634434881721, 0.170054346072885, 0.154947186657187,
        0.141182105646904, 0.128639876495421, 0.117211864413924,
        0.106799085428826, 0.0973113490299429, 0.0886664769834391,
        0.0807895915432809, 0.0736124668960205, 0.0670729382214382,
    ])
    n_expected = len(expected_alphas)

    # FIXME
    # Only the leading part of the fitted alpha path is compared.
    assert_array_almost_equal(expected_alphas, coxnet.alphas_[:n_expected])

    fitted_coef = pandas.DataFrame(coxnet.coef_[:, :n_expected], dtype=float)
    reference_coef = pandas.read_csv(SIMPLE_COEF_FILE, header=None, skiprows=1)
    assert_columns_almost_equal(fitted_coef, reference_coef)
def uno_auc_time_dependent_without_censoring_data(request):
    """Return ``(y, times, estimate, expected_auc, expected_iauc)`` for uncensored data.

    ``request.param`` selects time-dependent estimates with or without ties.
    The expected AUC at each time ``t`` is scikit-learn's ``roc_auc_score``
    of the labels ``time > t`` against that time's row of estimates. The
    returned estimate is negated and transposed to one row per sample and
    one column per evaluation time.

    Raises
    ------
    AssertionError
        If ``request.param`` is not one of the two known names.
    """
    from sklearn.metrics import roc_auc_score

    p = request.param

    # All ten samples are events, i.e. there is no censoring.
    y = Surv.from_arrays(
        time=[7, 9, 11, 12, 13, 15, 28, 39, 41, 76],
        event=[True, True, True, True, True, True, True, True, True, True])
    times = [10, 14, 40]

    # One row of estimates per evaluation time, one column per sample.
    if p == 'time_dependent_without_censoring':
        estimate = numpy.array([
            [1, 6, 18, 56, 32, 3, 99, 7, 67, 541],
            [6, 9, 11, 5, 3, 12, 56, 56.1, 81, 77],
            [13, 11, 12, 76, 55, 134, 70, 78, 75, 99],
        ])
    elif p == 'time_dependent_with_ties_without_censoring':
        estimate = numpy.array([
            [1, 6, 7, 56, 32, 3, 99, 7, 79, 17],
            [3, 6, 11, 5, 17, 12, 17, 56.1, 81, 77],
            [13, 11, 12, 17, 17, 134, 70, 78, 13, 99],
        ])
    else:
        # An explicit raise survives `python -O`, unlike a bare `assert False`,
        # and reports which parameter was not recognised.
        raise AssertionError("unknown fixture parameter: {!r}".format(p))

    expected_auc = numpy.array(
        [roc_auc_score(y["time"] > t, e) for t, e in zip(times, estimate)])

    # Fraction of samples with time > t is 0.8, 0.5 and 0.2 at t = 10, 14, 40.
    # The weights are the successive drops of that fraction, and 0.8 is the
    # total drop (1 - 0.2) used to normalise the weighted sum.
    km_delta = numpy.array([1 - 0.8, 0.8 - 0.5, 0.5 - 0.2])
    expected_iauc = numpy.sum(km_delta * expected_auc) / 0.8

    return y, times, -estimate.T, expected_auc, expected_iauc
def test_unknown_optimizer(self):
    """Fitting with an unrecognised ``optimizer`` value must raise ValueError."""
    features = numpy.zeros((100, 10))
    all_events = numpy.ones(100, dtype=bool)
    times = numpy.arange(1, 101, dtype=float)
    y = Surv.from_arrays(all_events, times)

    ssvm = FastSurvivalSVM(rank_ratio=0, optimizer='random stuff')

    with self.assertRaisesRegex(ValueError, "unknown optimizer: random stuff"):
        ssvm.fit(features, y)
def test_all_censored(self):
    """Fitting on labels without a single event must raise ValueError."""
    features = numpy.arange(80).reshape(10, 8)
    no_events = numpy.zeros(10, dtype=bool)
    times = [0, 1, 2, 1, 1, 0, 1, 2, 3, 1]
    y = Surv.from_arrays(no_events, times)

    rsvm = FastSurvivalSVM()

    with self.assertRaisesRegex(ValueError, "all samples are censored"):
        rsvm.fit(features, y)
def test_alpha_negative(self):
    """Fitting with ``alpha=-1`` must raise ValueError."""
    features = numpy.zeros((100, 10))
    all_events = numpy.ones(100, dtype=bool)
    times = numpy.arange(100, dtype=float)
    y = Surv.from_arrays(all_events, times)

    ssvm = FastSurvivalSVM(alpha=-1)

    with self.assertRaisesRegex(ValueError, "alpha must be positive"):
        ssvm.fit(features, y)
def rossi():
    """Load ``data/rossi.csv`` located next to this file.

    The label is built from column "arrest" (event) and column "week"
    (time); every other column is returned as the feature table.
    """
    csv_path = Path(__file__).parent / 'data' / 'rossi.csv'
    frame = pandas.read_csv(csv_path)

    label_columns = ["arrest", "week"]
    y = Surv.from_dataframe(label_columns[0], label_columns[1], frame)
    x = frame.drop(label_columns, axis=1)

    return DataSet(x=x, y=y)
def test_ranking_with_fit_intercept():
    """Combining ``rank_ratio=1.0`` with ``fit_intercept=True`` must raise on fit."""
    features = numpy.zeros((100, 10))
    all_events = numpy.ones(100, dtype=bool)
    times = numpy.arange(1, 101, dtype=float)
    y = Surv.from_arrays(all_events, times)

    ssvm = FastSurvivalSVM(rank_ratio=1.0, fit_intercept=True)
    expected_message = "fit_intercept=True is only meaningful if rank_ratio < 1.0"

    with pytest.raises(ValueError, match=expected_message):
        ssvm.fit(features, y)
def test_negative_time():
    """A negative observed time must be rejected when fitting."""
    features = numpy.arange(80).reshape(10, 8)
    event = [0, 1, 0, 1, 1, 0, 1, 0, 0, 1]
    time = [1, 1, -2, 1, 1, 6, 1, 2, 3, 1]  # the third entry is negative
    y = Surv.from_arrays(event, time)

    rsvm = FastSurvivalSVM(rank_ratio=0.5)
    expected_message = "observed time contains values smaller or equal to zero"

    with pytest.raises(ValueError, match=expected_message):
        rsvm.fit(features, y)
def toy_data():
    """Return a six-sample, two-feature data set with its survival labels.

    The event field of the label is named 'status'; the observed time of
    sample ``i`` (1-based) is ``i + 2**i``.
    """
    x = numpy.array([
        [1., 1.],
        [10.2, 15.],
        [20., 5.],
        [40, 30],
        [45, 21],
        [50, 36],
    ])

    event = [True, True, False, True, False, False]
    index = numpy.arange(1, 7)
    time = index + 2**index

    y = Surv.from_arrays(event, time, name_event='status')
    return x, y
def test_all_censored():
    """Fitting on labels without a single event must raise ValueError."""
    features = numpy.arange(80).reshape(10, 8)
    no_events = numpy.zeros(10, dtype=bool)
    times = [0, 1, 2, 1, 1, 0, 1, 2, 3, 1]
    y = Surv.from_arrays(no_events, times)

    rsvm = FastSurvivalSVM()
    expected_message = "all samples are censored"

    with pytest.raises(ValueError, match=expected_message):
        rsvm.fit(features, y)
def test_from_array_with_one_name_1(surv_arrays):
    """Overriding only ``name_event`` leaves the time field named 'time'."""
    event, time = surv_arrays

    record_dtype = [('death', bool), ('time', float)]
    expected = numpy.empty(100, dtype=record_dtype)
    expected['death'] = event.astype(bool)
    expected['time'] = time

    actual = Surv.from_arrays(event.astype(bool), time, name_event='death')

    assert_array_equal(actual, expected)
def test_from_array_with_one_name_2(surv_arrays):
    """Overriding only ``name_time`` leaves the event field named 'event'."""
    event, time = surv_arrays

    record_dtype = [('event', bool), ('survival_time', float)]
    expected = numpy.empty(100, dtype=record_dtype)
    expected['event'] = event.astype(bool)
    expected['survival_time'] = time

    actual = Surv.from_arrays(event.astype(bool), time, name_time='survival_time')

    assert_array_equal(actual, expected)
def test_from_array_float(surv_arrays):
    """A float-typed event indicator must produce the same record as its boolean form."""
    event, time = surv_arrays

    record_dtype = [('event', bool), ('time', float)]
    expected = numpy.empty(100, dtype=record_dtype)
    expected['event'] = event.astype(bool)
    expected['time'] = time

    actual = Surv.from_arrays(event.astype(float), time)

    assert_array_equal(actual, expected)
def test_from_dataframe_int(surv_data_frame):
    """Labels built from the fixture frame must hold a boolean event and a float time."""
    frame = surv_data_frame

    record_dtype = [('event', bool), ('time', float)]
    expected = numpy.empty(100, dtype=record_dtype)
    expected['event'] = frame['event'].astype(bool)
    expected['time'] = frame['time']

    actual = Surv.from_dataframe('event', 'time', frame)

    assert_array_equal(actual, expected)
def test_uno_c_not_1d(whas500_pred, dim):
    """A 2D risk estimate must be rejected by ``concordance_index_ipcw``."""
    event, time, risk = whas500_pred
    y = Surv.from_arrays(event, time)

    # Repeat the risk scores as `dim` identical columns.
    risk_column = risk[:, numpy.newaxis]
    risk_2d = numpy.tile(risk_column, (1, dim))
    expected_message = "Expected 1D array, got 2D array instead:"

    with pytest.raises(ValueError, match=expected_message):
        concordance_index_ipcw(y, y, risk_2d)
def test_from_dataframe_no_such_column(surv_data_frame):
    """An unknown event or time column name must raise KeyError."""
    data = surv_data_frame
    # Kept from the original: the event column is converted to bool in place.
    data['event'] = data['event'].astype(bool)

    # The same message is expected whichever of the two column names is missing.
    missing_label = r'the label \[unknown\] is not in the \[columns\]'

    with pytest.raises(KeyError, match=missing_label):
        Surv.from_dataframe('unknown', 'time', data)

    with pytest.raises(KeyError, match=missing_label):
        Surv.from_dataframe('event', 'unknown', data)