def test_aalen_additive_fit_with_censor(self):
    """Visual check of fitted versus true cumulative hazards, with censoring.

    Simulates ``n`` subjects with ``d`` covariates, draws the event flag
    ``E`` from ``Binomial(1, 0.99)``, fits an ``AalenAdditiveFitter`` and
    draws ``d + 1`` stacked subplots, each overlaying one true cumulative
    hazard column with the fitted one over the label range 0..15.

    There are no assertions; the figure is meant to be inspected by eye.
    The test is skipped when matplotlib is not installed.
    """
    pytest.importorskip("matplotlib")
    from matplotlib import pyplot as plt

    n = 2500
    d = 6
    timeline = np.linspace(0, 70, 10000)
    hz, coef, X = generate_hazard_rates(n, d, timeline)
    X.columns = coef.columns
    cumulative_hazards = pd.DataFrame(
        cumulative_integral(coef.values, timeline),
        index=timeline,
        columns=coef.columns,
    )
    X['T'] = generate_random_lifetimes(hz, timeline)
    X['E'] = np.random.binomial(1, 0.99, n)

    aaf = AalenAdditiveFitter()
    aaf.fit(X, 'T', 'E')

    for i in range(d + 1):
        ax = plt.subplot(d + 1, 1, i + 1)
        col = cumulative_hazards.columns[i]
        # The index is the float timeline, so this is a label slice; `.loc`
        # replaces the `.ix` indexer that pandas has removed.
        ax = cumulative_hazards[col].loc[:15].plot(legend=False, ax=ax)
        ax = aaf.plot(loc=slice(0, 15), ax=ax, columns=[col], legend=False)
    plt.show()
def test_aaf_panel_dataset_with_no_censorship(self):
    """Fitting panel data without an event column marks ids 1..9 as observed."""
    fitter = AalenAdditiveFitter()
    fitter.fit(load_panel_test(), id_col='id', duration_col='t')

    all_observed = pd.Series([True] * 9, index=range(1, 10))
    all_observed.index.name = 'id'

    assert_series_equal(fitter.event_observed, all_observed)
def test_predict_cumulative_hazard_inputs(self, data_pred1):
    """``predict_cumulative_hazard`` gives the same frame for DataFrame and ndarray input.

    Parameters
    ----------
    data_pred1 : fixture
        Dataset with duration column ``'t'`` and event column ``'E'``.
    """
    aaf = AalenAdditiveFitter()
    aaf.fit(data_pred1, duration_col='t', event_col='E')

    # `.ix` has been removed from pandas; `.loc` keeps the inclusive label
    # slice. Assumes data_pred1 has an integer index - TODO confirm.
    x = data_pred1.loc[:5].drop(['t', 'E'], axis=1)

    y_df = aaf.predict_cumulative_hazard(x)
    y_np = aaf.predict_cumulative_hazard(x.values)
    assert_frame_equal(y_df, y_np)
def test_using_a_custom_timeline_in_static_fitting(self, rossi):
    """A user-supplied timeline becomes the index of every fitted frame."""
    custom_times = np.arange(10)
    aaf = AalenAdditiveFitter()
    aaf.fit(rossi, event_col='arrest', duration_col='week', timeline=custom_times)

    # Same checks as before, in the same order: hazards, cumulative
    # hazards, variance, then the fitter's own timeline attribute.
    for attr_name in ('hazards_', 'cumulative_hazards_', 'variance_'):
        fitted_frame = getattr(aaf, attr_name)
        npt.assert_array_equal(fitted_frame.index.values, custom_times)
    npt.assert_array_equal(aaf.timeline, custom_times)
def test_large_dimensions_for_recursion_error(self):
    """Fitting 500 rows with 50 random covariates must complete without raising."""
    n_rows, n_covariates = 500, 50

    frame = pd.DataFrame(np.random.randn(n_rows, n_covariates))
    frame['T'] = np.random.exponential(size=n_rows)

    fitter = AalenAdditiveFitter()
    fitter.fit(frame, duration_col='T')
def test_aaf_panel_dataset(self):
    """Smoke test: fit on the panel dataset and call ``plot``.

    Skipped when matplotlib is not installed.
    """
    matplotlib = pytest.importorskip("matplotlib")
    from matplotlib import pyplot as plt

    fitter = AalenAdditiveFitter()
    panel = load_panel_test()
    fitter.fit(panel, id_col='id', duration_col='t', event_col='E')
    fitter.plot()
def test_using_a_custom_timeline_in_varying_fitting(self):
    """A user-supplied timeline is honoured when fitting with an ``id_col``."""
    custom_times = np.arange(10)
    panel = load_panel_test()

    aaf = AalenAdditiveFitter()
    aaf.fit(panel, id_col='id', duration_col='t', timeline=custom_times)

    # Same checks as before, in the same order: hazards, cumulative
    # hazards, variance, then the fitter's own timeline attribute.
    for attr_name in ('hazards_', 'cumulative_hazards_', 'variance_'):
        fitted_frame = getattr(aaf, attr_name)
        npt.assert_array_equal(fitted_frame.index.values, custom_times)
    npt.assert_array_equal(aaf.timeline, custom_times)
def test_fit_methods_require_duration_col(self):
    """Both fitters raise ``TypeError`` when ``fit`` is called with only the data."""
    dataset = load_regression_dataset()

    for fitter in (AalenAdditiveFitter(), CoxPHFitter()):
        with pytest.raises(TypeError):
            fitter.fit(dataset)
def test_predict_methods_in_regression_return_same_types(self):
    """Each shared ``predict_*`` method returns the same type from both fitters."""
    dataset = load_regression_dataset()

    aaf = AalenAdditiveFitter()
    cph = CoxPHFitter()
    aaf.fit(dataset, duration_col='T', event_col='E')
    cph.fit(dataset, duration_col='T', event_col='E')

    shared_methods = (
        'predict_percentile',
        'predict_median',
        'predict_expectation',
        'predict_survival_function',
        'predict_cumulative_hazard',
    )
    for method_name in shared_methods:
        aaf_result = getattr(aaf, method_name)(dataset)
        cph_result = getattr(cph, method_name)(dataset)
        assert isinstance(aaf_result, type(cph_result))
def test_prediction_methods_respect_index(self, data_pred2):
    """Predictions keep the index (and row order) of the frame they were given.

    Parameters
    ----------
    data_pred2 : fixture
        Dataset with covariates ``'x1'``/``'x2'``, duration column ``'t'``
        and event column ``'E'``.
    """
    # `.ix` has been removed from pandas. `.loc[:3]` is the inclusive label
    # slice that yields the four rows 0..3 expected below.
    x = data_pred2[['x1', 'x2']].loc[:3].sort_index(ascending=False)
    expected_index = pd.Index(np.array([3, 2, 1, 0]))

    cph = CoxPHFitter()
    cph.fit(data_pred2, duration_col='t', event_col='E')
    npt.assert_array_equal(cph.predict_partial_hazard(x).index, expected_index)
    npt.assert_array_equal(cph.predict_percentile(x).index, expected_index)
    npt.assert_array_equal(cph.predict_expectation(x).index, expected_index)

    aaf = AalenAdditiveFitter()
    aaf.fit(data_pred2, duration_col='t', event_col='E')
    npt.assert_array_equal(aaf.predict_percentile(x).index, expected_index)
    npt.assert_array_equal(aaf.predict_expectation(x).index, expected_index)
def test_fit_methods_can_accept_optional_event_col_param(self):
    """``event_col`` is optional: when omitted, every row counts as observed."""
    dataset = load_regression_dataset()

    for fitter_cls in (AalenAdditiveFitter, CoxPHFitter):
        fitter = fitter_cls()

        # With the event column supplied, event_observed mirrors it.
        fitter.fit(dataset, 'T', event_col='E')
        assert_series_equal(
            fitter.event_observed.sort_index(),
            dataset['E'].astype(bool),
            check_names=False,
        )

        # Without it, event_observed is all ones.
        fitter.fit(dataset, 'T')
        npt.assert_array_equal(fitter.event_observed.values, np.ones(dataset.shape[0]))
def test_aalen_additive_median_predictions_split_data(self):
    """Predicted medians should exceed the actual lifetime about half the time.

    The share of predictions above the true lifetime must be within 0.05
    of 0.5.
    """
    # Generate some hazard rates and a survival data set.
    n, d = 2500, 5
    timeline = np.linspace(0, 70, 5000)
    hz, coef, X = generate_hazard_rates(n, d, timeline)
    T = generate_random_lifetimes(hz, timeline)
    X['T'] = T

    # Fit it to Aalen's model.
    aaf = AalenAdditiveFitter()
    aaf.fit(X, 'T')

    # Predict from columns 0..5 only.
    # NOTE(review): 6 == d + 1, presumably the d covariates plus a constant
    # column - confirm against generate_hazard_rates.
    covariate_columns = list(range(6))
    T_pred = aaf.predict_median(X[covariate_columns])

    share_above_actual = (T_pred.values > T).mean()
    assert abs(share_above_actual - 0.5) < 0.05
def test_nn_cumulative_hazard_will_set_cum_hazards_to_0(self, rossi):
    """``nn_cumulative_hazard=True`` removes negative predicted cumulative hazards."""
    # Flag off: some predicted cumulative hazards are negative.
    unconstrained = AalenAdditiveFitter(nn_cumulative_hazard=False)
    unconstrained.fit(rossi, event_col='arrest', duration_col='week')
    predicted = unconstrained.predict_cumulative_hazard(rossi)
    negative_share = (predicted < 0).stack().mean()
    assert negative_share > 0

    # Flag on: none are.
    constrained = AalenAdditiveFitter(nn_cumulative_hazard=True)
    constrained.fit(rossi, event_col='arrest', duration_col='week')
    predicted = constrained.predict_cumulative_hazard(rossi)
    negative_share = (predicted < 0).stack().mean()
    assert negative_share == 0
def test_aalen_additive_smoothed_plot(self, block):
    """Visual check of the smoothed hazards plot; makes no assertions.

    Parameters
    ----------
    block : passed straight through to ``self.plt.show(block=...)``.
    """
    n, d = 2500, 3
    timeline = np.linspace(0, 150, 5000)
    hz, coef, X = generate_hazard_rates(n, d, timeline)

    jitter_scale = 0.1
    X['T'] = generate_random_lifetimes(hz, timeline) + jitter_scale * np.random.uniform(size=(n, 1))
    X['E'] = np.random.binomial(1, 0.8, size=n)

    # Fit the aaf, no intercept as it is already built into X, X[2] is ones.
    aaf = AalenAdditiveFitter(coef_penalizer=0.1, fit_intercept=False)
    aaf.fit(X, 'T', 'E')

    # Plot everything except the last 500 rows of the smoothed hazards.
    smoothed = aaf.smoothed_hazards_(1)
    last_row = aaf.cumulative_hazards_.shape[0] - 500
    ax = smoothed.iloc[0:last_row].plot()
    ax.set_xlabel("time")
    ax.set_title('test_aalen_additive_smoothed_plot')
    self.plt.show(block=block)
def test_aalen_additive_plot(self):
    """Visual check of ``AalenAdditiveFitter.plot`` on simulated data; no assertions."""
    n, d = 2500, 3
    timeline = np.linspace(0, 70, 10000)
    hz, coef, X = generate_hazard_rates(n, d, timeline)

    X['T'] = generate_random_lifetimes(hz, timeline)
    # Success probability 1., so every event flag is 1.
    X['E'] = np.random.binomial(1, 1., size=n)

    # Fit the aaf, no intercept as it is already built into X, X[2] is ones.
    aaf = AalenAdditiveFitter(coef_penalizer=0.1, fit_intercept=False)
    aaf.fit(X, 'T', 'E')

    # Leave the last 100 rows of the cumulative hazards out of the plot.
    last_row = aaf.cumulative_hazards_.shape[0] - 100
    ax = aaf.plot(iloc=slice(0, last_row))
    ax.set_xlabel("time")
    ax.set_title('test_aalen_additive_plot')
    self.plt.show()
def test_aalen_additive_fit_no_censor(self, block):
    """Visual check of fitted versus true cumulative hazards, without censoring.

    Draws ``d + 1`` stacked subplots, each overlaying one true cumulative
    hazard column with the fitted one over the label range 0..15. Makes no
    assertions.

    Parameters
    ----------
    block : passed straight through to ``self.plt.show(block=...)``.
    """
    n, d = 2500, 6
    timeline = np.linspace(0, 70, 10000)
    hz, coef, X = generate_hazard_rates(n, d, timeline)
    X.columns = coef.columns

    true_values = cumulative_integral(coef.values, timeline)
    cumulative_hazards = pd.DataFrame(true_values, index=timeline, columns=coef.columns)

    X['T'] = generate_random_lifetimes(hz, timeline)
    # Success probability 1, so every event flag is 1.
    X['E'] = np.random.binomial(1, 1, n)

    aaf = AalenAdditiveFitter()
    aaf.fit(X, 'T', 'E')

    n_panels = d + 1
    for panel in range(n_panels):
        ax = self.plt.subplot(n_panels, 1, panel + 1)
        col = cumulative_hazards.columns[panel]
        ax = cumulative_hazards[col].loc[:15].plot(legend=False, ax=ax)
        ax = aaf.plot(loc=slice(0, 15), ax=ax, columns=[col], legend=False)

    self.plt.title("test_aalen_additive_fit_no_censor")
    self.plt.show(block=block)
def test_aaf_panel_dataset(self, block):
    """Visual check: fit on the panel dataset and show the plot; no assertions.

    Parameters
    ----------
    block : passed straight through to ``self.plt.show(block=...)``.
    """
    panel = load_panel_test()

    fitter = AalenAdditiveFitter()
    fitter.fit(panel, id_col='id', duration_col='t', event_col='E')
    fitter.plot()

    self.plt.title("test_aaf_panel_dataset")
    self.plt.show(block=block)
def test_duration_vector_can_be_normalized(self):
    """Standardising the duration column must not change the fitted hazards."""
    df = load_kidney_transplant()
    durations = df['time']

    normalized_df = df.copy()
    normalized_df['time'] = (normalized_df['time'] - durations.mean()) / durations.std()

    for fitter in [CoxPHFitter(), AalenAdditiveFitter()]:
        # The index is dropped because aaf will have a different "time" index.
        raw_fit = fitter.fit(df, duration_col='time', event_col='death')
        hazards = raw_fit.hazards_.reset_index(drop=True)

        normalized_fit = fitter.fit(normalized_df, duration_col='time', event_col='death')
        hazards_norm = normalized_fit.hazards_.reset_index(drop=True)

        assert_frame_equal(hazards, hazards_norm)
def test_aalen_additive_fit_no_censor(self, block):
    """Visual check of fitted versus true cumulative hazards, without censoring.

    Draws ``d + 1`` stacked subplots, each overlaying one true cumulative
    hazard column with the fitted one over the label range 0..15. Makes no
    assertions.

    Parameters
    ----------
    block : passed straight through to ``self.plt.show(block=...)``.
    """
    n, d = 2500, 6
    timeline = np.linspace(0, 70, 10000)
    hz, coef, X = generate_hazard_rates(n, d, timeline)
    X.columns = coef.columns

    integrated = cumulative_integral(coef.values, timeline)
    cumulative_hazards = pd.DataFrame(integrated, index=timeline, columns=coef.columns)

    X["T"] = generate_random_lifetimes(hz, timeline)
    # Success probability 1, so every event flag is 1.
    X["E"] = np.random.binomial(1, 1, n)

    aaf = AalenAdditiveFitter()
    aaf.fit(X, "T", "E")

    rows = d + 1
    for position in range(rows):
        ax = self.plt.subplot(rows, 1, position + 1)
        col = cumulative_hazards.columns[position]
        ax = cumulative_hazards[col].loc[:15].plot(legend=False, ax=ax)
        ax = aaf.plot(loc=slice(0, 15), ax=ax, columns=[col], legend=False)

    self.plt.title("test_aalen_additive_fit_no_censor")
    self.plt.show(block=block)
def test_penalizer_reduces_norm_of_hazards(self, rossi):
    """Penalised fitting must not increase the norm of the cumulative hazards."""
    from numpy.linalg import norm

    unpenalized = AalenAdditiveFitter(coef_penalizer=0., smoothing_penalizer=0.)
    assert unpenalized.coef_penalizer == unpenalized.smoothing_penalizer == 0.0
    unpenalized.fit(rossi, event_col='arrest', duration_col='week')

    penalized = AalenAdditiveFitter(coef_penalizer=10., smoothing_penalizer=10.)
    penalized.fit(rossi, event_col='arrest', duration_col='week')

    penalized_norm = norm(penalized.cumulative_hazards_)
    unpenalized_norm = norm(unpenalized.cumulative_hazards_)
    assert penalized_norm <= unpenalized_norm
def test_input_column_order_is_equal_to_output_hazards_order(self, rossi):
    """Cumulative hazard columns come out in the covariate order of the input."""
    expected = ['fin', 'age', 'race', 'wexp', 'mar', 'paro', 'prio']

    # Default fitter: the 'baseline' column is dropped before comparing.
    default_fitter = AalenAdditiveFitter()
    default_fitter.fit(rossi, event_col='arrest', duration_col='week')
    fitted_columns = default_fitter.cumulative_hazards_.columns.drop('baseline')
    assert list(fitted_columns) == expected

    # fit_intercept=False: the columns are compared as they are.
    no_intercept_fitter = AalenAdditiveFitter(fit_intercept=False)
    no_intercept_fitter.fit(rossi, event_col='arrest', duration_col='week')
    assert list(no_intercept_fitter.cumulative_hazards_.columns) == expected
def test_crossval_for_aalen_add(self, data_pred2, data_pred1):
    """Mean 3-fold cross-validation score must exceed 0.90 on both datasets.

    For each dataset the cross-validation is repeated 20 times; the mean of
    the per-repeat mean scores is compared against the threshold.

    Parameters
    ----------
    data_pred2, data_pred1 : fixtures
        Datasets with duration column ``'t'`` and event column ``'E'``.
    """
    aaf = AalenAdditiveFitter()
    expected = 0.90
    msg = "Expected min-mean c-index {:.2f} < {:.2f}"

    for data_pred in [data_pred1, data_pred2]:
        mean_scores = []
        for _ in range(20):
            scores = k_fold_cross_validation(aaf, data_pred, duration_col='t', event_col='E', k=3)
            mean_scores.append(np.mean(scores))

        # Report the value that is actually being compared. The previous
        # message used `scores.mean()`, i.e. only the last repeat's folds,
        # and fails with AttributeError if `scores` is a plain list.
        overall_mean = np.mean(mean_scores)
        assert overall_mean > expected, msg.format(expected, overall_mean)
def test_predict_cumulative_hazard_inputs(self, data_pred1):
    """``predict_cumulative_hazard`` gives the same frame for DataFrame and ndarray input.

    Parameters
    ----------
    data_pred1 : fixture
        Dataset with duration column ``'t'`` and event column ``'E'``.
    """
    aaf = AalenAdditiveFitter()
    aaf.fit(
        data_pred1,
        duration_col='t',
        event_col='E',
    )

    # `.ix` has been removed from pandas; `.loc` keeps the inclusive label
    # slice. Assumes data_pred1 has an integer index - TODO confirm.
    x = data_pred1.loc[:5].drop(['t', 'E'], axis=1)

    y_df = aaf.predict_cumulative_hazard(x)
    y_np = aaf.predict_cumulative_hazard(x.values)
    assert_frame_equal(y_df, y_np)
def test_prediction_methods_respect_index(self, data_pred2):
    """Predictions keep the index (and row order) of the frame they were given.

    Parameters
    ----------
    data_pred2 : fixture
        Dataset with covariates ``'x1'``/``'x2'``, duration column ``'t'``
        and event column ``'E'``.
    """
    # `.ix` has been removed from pandas. `.loc[:3]` is the inclusive label
    # slice that yields the four rows 0..3 expected below.
    x = data_pred2[['x1', 'x2']].loc[:3].sort_index(ascending=False)
    expected_index = pd.Index(np.array([3, 2, 1, 0]))

    cph = CoxPHFitter()
    cph.fit(data_pred2, duration_col='t', event_col='E')
    npt.assert_array_equal(cph.predict_partial_hazard(x).index, expected_index)
    npt.assert_array_equal(cph.predict_percentile(x).index, expected_index)
    npt.assert_array_equal(cph.predict(x).index, expected_index)
    npt.assert_array_equal(cph.predict_expectation(x).index, expected_index)

    aaf = AalenAdditiveFitter()
    aaf.fit(data_pred2, duration_col='t', event_col='E')
    npt.assert_array_equal(aaf.predict_percentile(x).index, expected_index)
    npt.assert_array_equal(aaf.predict(x).index, expected_index)
    npt.assert_array_equal(aaf.predict_expectation(x).index, expected_index)
# NOTE(review): fragment of a larger Python 2 script (`print v`, `dict.iteritems()`) that has been
# collapsed onto one physical line; everything after the first `#` below is therefore inert
# comment text right now.
# Reading the text: for every (k, v) in sales_dict with v > 0 it appends v zeros to data_events and
# v copies of int(k.split('/')[1]) to extra_var; if data_events is empty it appends [0]*19 to
# all_data and `continue`s; zeros in data_events are then replaced by 60 and C flags the entries
# below 60; AAF(penalizer=0.5) is created; extra_vars_df gets float indicator columns "1".."9"
# for extra_var == 1..9.
# The original nesting (which statements sit inside the loop, and which loop the `continue`
# belongs to) cannot be recovered from this view - restore the line breaks from the source
# script before relying on this code.
for k,v in sales_dict.iteritems(): #investigate why some negative leftovers on certain valid dates , more repairs than sales ??? print v if v>0: data_events = np.append(data_events,np.zeros(v)) extra_var = np.append(extra_var,np.array([int(k.split('/')[1])]*v)) t=[] if len(data_events)==0: all_data.append([0]*19) continue data_events[data_events==0] = 60 C= data_events <60 aaf = AAF(penalizer=0.5) #extra_vars_df["winter"] = (extra_var == 1.0).astype(float) #extra_vars_df["spring"] = (extra_var == 2.0).astype(float) #extra_vars_df["summer"] = (extra_var == 3.0).astype(float) #extra_vars_df["fall"] = (extra_var == 4.0).astype(float) extra_vars_df[str(1)] = (extra_var == 1).astype(float) extra_vars_df[str(2)] = (extra_var == 2).astype(float) extra_vars_df[str(3)] = (extra_var == 3).astype(float) extra_vars_df[str(4)] = (extra_var == 4).astype(float) extra_vars_df[str(5)] = (extra_var == 5).astype(float) extra_vars_df[str(6)] = (extra_var == 6).astype(float) extra_vars_df[str(7)] = (extra_var == 7).astype(float) extra_vars_df[str(8)] = (extra_var == 8).astype(float) extra_vars_df[str(9)] = (extra_var == 9).astype(float)
def regression_models(self):
    """Return a plain Cox model, an Aalen additive model and a stratified Cox model."""
    plain_cox = CoxPHFitter()
    aalen_additive = AalenAdditiveFitter()
    stratified_cox = CoxPHFitter(strata=['race', 'paro', 'mar', 'wexp'])
    return [plain_cox, aalen_additive, stratified_cox]
def test_dataframe_input_with_nonstandard_index(self):
    """Fitting a frame indexed by strings ('a', 'b', 'c') must not raise."""
    rows = [
        (16, True, True),
        (1, True, True),
        (4, False, True),
    ]
    frame = pd.DataFrame(
        rows,
        columns=['duration', 'done_feeding', 'white'],
        index=['a', 'b', 'c'],
    )

    fitter = AalenAdditiveFitter()
    fitter.fit(frame, duration_col='duration', event_col='done_feeding')
def test_swapping_order_of_columns_in_a_df_is_okay(self, rossi):
    """Median predictions do not depend on the column order of the input frame.

    Checked for the default fitter and for ``fit_intercept=False``.
    """
    misorder = ['age', 'race', 'wexp', 'mar', 'paro', 'prio', 'fin']

    for position, aaf in enumerate((AalenAdditiveFitter(), AalenAdditiveFitter(fit_intercept=False))):
        aaf.fit(rossi, event_col='arrest', duration_col='week')

        if position == 0:
            # The two reference orderings are derived once, after the first fit.
            natural_order = rossi.columns.drop(['week', 'arrest'])
            deleted_order = rossi.columns.difference(['week', 'arrest'])

        for shuffled in (misorder, deleted_order):
            assert_frame_equal(
                aaf.predict_median(rossi[natural_order]),
                aaf.predict_median(rossi[shuffled]),
            )
# We will use Aalen's additive model to estimate the survival times in the
# testing set, using a model built from a training set, with 100 fold
# sub-sampling of the data.

# In[33]:

# The design matrix and the outcome columns do not depend on the split, so
# they are built once rather than on every iteration.
X = patsy.dmatrix('age + grade + stage -1', clinical, return_type='dataframe')
X['T'] = clinical['OS_OS']
X['C'] = clinical['OS_vital_status']

# Go through each training/testing Monte Carlo sample and train/predict
predictions = []
for i in range(trainLabels.shape[1]):
    # NOTE(review): `.ix` has been removed from pandas. Whether `.loc` or
    # `.iloc` is the right replacement depends on what trainLabels and
    # testLabels hold (labels vs. positions) - confirm before porting.
    trainX = X.ix[trainLabels[i], :].reset_index()
    testX = X.ix[testLabels[i], :].reset_index()

    # Build model and train
    aaf = AalenAdditiveFitter(penalizer=1., fit_intercept=True)
    aaf.fit(trainX.drop(['index'], axis=1), duration_col='T', event_col='C', show_progress=False)

    # Predict on testing data; the original row labels are restored as the
    # index and non-finite medians are replaced by 0.
    median = aaf.predict_median(testX.drop(['T', 'C', 'index'], axis=1))
    median.index = testX['index']
    predictions.append(median.replace([np.inf, -np.inf, np.nan], 0))

# ###Saving Results to Synapse and ask Synapse to evaluate our predictions
# To document what we have done we will start by storing this code in Synapse as a file Entity.

# In[34]:

codeEntity = synapseclient.File('tcga_survival_analysis.py', parentId='syn1720423')
codeEntity = syn.store(codeEntity)
# -*- coding: utf-8 -*-
# aalen additive
# Timing script: fits AalenAdditiveFitter on five stacked copies of the rossi
# dataset and prints the wall-clock fit time followed by the fitter's score_.

if __name__ == "__main__":
    import pandas as pd
    import time

    from lifelines.estimation import AalenAdditiveFitter
    from lifelines.datasets import load_rossi

    rossi = load_rossi()
    df = pd.concat([rossi] * 5).reset_index(drop=True)
    print("Size: ", df.shape)

    aaf = AalenAdditiveFitter()

    started_at = time.time()
    aaf.fit(df, duration_col="week", event_col="arrest")
    elapsed = time.time() - started_at

    print("--- %s seconds ---" % elapsed)
    print(aaf.score_)
# NOTE(review): notebook-export fragment collapsed onto one physical line. Because the line
# starts with `#`, all of it is currently a comment, and the text is cut off inside the final
# `synapseclient.File(` call, so it cannot be restored to runnable code from this view alone.
# Reading the text: for each i in range(trainLabels.shape[1]) it builds a patsy design matrix
# ('age + grade + stage -1') from `clinical`, adds 'T' (OS_OS) and 'C' (OS_vital_status), selects
# train/test rows with `.ix` (removed from pandas), fits
# AalenAdditiveFitter(penalizer=1., fit_intercept=True) on the training rows, and appends the
# predicted test medians (inf/-inf/NaN replaced by 0) to `predictions`.
# In[33]: #Go through each training testing monteCarlo sampling and train/predict predictions = [] for i in range(trainLabels.shape[1]): X = patsy.dmatrix('age + grade + stage -1', clinical, return_type='dataframe') X['T'] = clinical['OS_OS'] X['C'] = clinical['OS_vital_status'] trainX = X.ix[trainLabels[i], :].reset_index() testX = X.ix[testLabels[i], :].reset_index() #Build model and train aaf = AalenAdditiveFitter(penalizer=1., fit_intercept=True) aaf.fit(trainX.drop(['index'], axis=1), duration_col='T', event_col='C', show_progress=False) #Predict on testing data median = aaf.predict_median(testX.drop(['T', 'C', 'index'], axis=1)) median.index = testX['index'] predictions.append(median.replace([np.inf, -np.inf, np.nan], 0)) # ###Saving Results to Synapse and ask Synapse to evaluate our predictions # To document what we have done we will start by storing this code in Synapse as a file Entity. # In[34]: codeEntity = synapseclient.File('tcga_survival_analysis.py',
def test_swapping_order_of_columns_in_a_df_is_okay(self):
    """Median predictions do not depend on the column order of the input frame.

    Checked for the default fitter and for ``fit_intercept=False``.
    """
    rossi = load_rossi()

    misorder = ['age', 'race', 'wexp', 'mar', 'paro', 'prio', 'fin']
    natural_order = rossi.columns.drop(['week', 'arrest'])
    # `Index - list` no longer performs a set difference in pandas; use the
    # explicit method instead.
    deleted_order = rossi.columns.difference(['week', 'arrest'])

    for aaf in (AalenAdditiveFitter(), AalenAdditiveFitter(fit_intercept=False)):
        # 'week' is the duration and 'arrest' the event indicator; the two
        # keyword arguments were previously swapped.
        aaf.fit(rossi, event_col='arrest', duration_col='week')

        for shuffled in (misorder, deleted_order):
            assert_frame_equal(
                aaf.predict_median(rossi[natural_order]),
                aaf.predict_median(rossi[shuffled]),
            )