def test_aalen_additive_fit_with_censor(self, block):
    """Visual test: fitted cumulative hazards against the simulated ones.

    Simulates hazards, covariates and lifetimes, draws the event indicator
    from Binomial(1, 0.99), fits an ``AalenAdditiveFitter`` and then, with
    one subplot per column, overlays the fitted curve on the output of
    ``cumulative_integral`` for the simulated coefficients (up to time 15).

    ``block`` is forwarded to ``self.plt.show``.
    """
    n_subjects = 2500
    n_covariates = 6
    times = np.linspace(0, 70, 10000)

    hz, coef, X = generate_hazard_rates(n_subjects, n_covariates, times)
    X.columns = coef.columns
    reference = pd.DataFrame(
        cumulative_integral(coef.values, times),
        index=times,
        columns=coef.columns,
    )

    lifetimes = generate_random_lifetimes(hz, times)
    # Replace infinite lifetimes with a finite duration before fitting.
    lifetimes[np.isinf(lifetimes)] = 10
    X["T"] = lifetimes
    X["E"] = np.random.binomial(1, 0.99, n_subjects)

    fitter = AalenAdditiveFitter()
    fitter.fit(X, "T", "E")

    n_panels = n_covariates + 1
    for panel in range(1, n_panels + 1):
        axis = self.plt.subplot(n_panels, 1, panel)
        name = reference.columns[panel - 1]
        axis = reference[name].loc[:15].plot(ax=axis)
        axis = fitter.plot(loc=slice(0, 15), ax=axis, columns=[name])

    self.plt.title("test_aalen_additive_fit_with_censor")
    self.plt.show(block=block)
def Aalen_model(df, l2=0.01, coeff_pen=0.1, smooth_pen=0.1):
    """Fit an Aalen additive regression model to ``df`` and return the fitter.

    The regression model is

        hazard(t) = b_0(t) + b_1(t)*x_1 + ... + b_N(t)*x_N

    i.e., the hazard rate is a linear function of the covariates.

    Parameters
    ----------
    df : pandas DataFrame
        The duration column must be called "Total_years". A column called
        "censored" (True/False or 1/0) is handed to the fitter as
        ``event_col``.
        NOTE(review): lifelines treats ``event_col`` as "event observed";
        confirm the "censored" column is encoded that way.
    l2 : float
        Accepted but not used by this function.
    coeff_pen : float
        L2 penalizer on the size of the coefficients during regression.
        Improves the stability of the estimates and controls for high
        correlation between covariates, e.g. it shrinks the absolute value
        of c_{i,t}. Recommended, even if a small value.
    smooth_pen : float
        L2 penalizer on the difference between coefficients adjacent in
        time, e.g. it shrinks the absolute value of c_{i,t} - c_{i,t+1}.

    Fixed, non-adjustable settings
    ------------------------------
    fit_intercept=False
        Add a column of ones to ``df`` to model the baseline hazard.
    nn_cumulative_hazard=True
        Forces negative hazard values to zero.

    Returns
    -------
    The ``AalenAdditiveFitter`` instance fitted to ``df``.
    """
    fitter_options = dict(
        fit_intercept=False,
        coef_penalizer=coeff_pen,
        smoothing_penalizer=smooth_pen,
        nn_cumulative_hazard=True,
    )
    model = AalenAdditiveFitter(**fitter_options)
    model.fit(df, 'Total_years', event_col='censored')
    return model
def test_aaf_panel_dataset(self, block):
    """Visual test: fit the panel test dataset and show the fitter's plot.

    ``block`` is forwarded to ``self.plt.show``.
    """
    frame = load_panel_test()
    fitter = AalenAdditiveFitter()
    fitter.fit(frame, id_col="id", duration_col="t", event_col="E")
    fitter.plot()

    self.plt.title("test_aaf_panel_dataset")
    self.plt.show(block=block)
def fit(self, X, y, **fit_params):
    """Fit an ``AalenAdditiveFitter`` on ``X`` combined with the targets in ``y``.

    A copy of ``X`` receives the duration column (and the event column,
    when ``self.event_col`` is set) looked up by name in ``y``. The
    underlying fitter is built from ``self.get_params()`` and stored on
    ``self.estimator``.

    Extra keyword arguments are forwarded to the fitter's ``fit``.
    Returns ``self``.
    """
    training_frame = X.copy()
    training_frame[self.duration_column] = y[self.duration_column]
    if self.event_col is not None:
        training_frame[self.event_col] = y[self.event_col]

    model = AalenAdditiveFitter(**self.get_params())
    model.fit(
        training_frame,
        duration_col=self.duration_column,
        event_col=self.event_col,
        timeline=self.timeline,
        id_col=self.id_col,
        **fit_params
    )

    self.estimator = model
    return self
def fit(self, X, y, **fit_params):
    """Train the wrapped Aalen additive model.

    The covariates in ``X`` are copied, then the duration column and, if
    ``self.event_col`` is not None, the event column are taken from ``y``
    by name and added to the copy. A new ``AalenAdditiveFitter`` is
    created from ``self.get_params()``, fitted on that frame and kept as
    ``self.estimator``.

    ``fit_params`` are passed through to the fitter's ``fit``.
    Returns ``self``.
    """
    frame = X.copy()
    frame[self.duration_column] = y[self.duration_column]
    if self.event_col is not None:
        frame[self.event_col] = y[self.event_col]

    constructor_kwargs = self.get_params()
    fitter = AalenAdditiveFitter(**constructor_kwargs)
    fitter.fit(
        frame,
        duration_col=self.duration_column,
        event_col=self.event_col,
        timeline=self.timeline,
        id_col=self.id_col,
        **fit_params
    )

    self.estimator = fitter
    return self
def fit(self, X, y, **fit_params):
    """Fit an ``AalenAdditiveFitter`` configured from this object's attributes.

    A copy of ``X`` is extended with the duration column (and the event
    column, when ``self.event_col`` is set) read by name from ``y``. The
    fitter is constructed from ``fit_intercept``, ``alpha``,
    ``coef_penalizer`` and ``smoothing_penalizer`` on ``self`` and stored
    on ``self.estimator`` after fitting.

    ``fit_params`` are forwarded to the fitter's ``fit``. Returns ``self``.
    """
    frame = X.copy()
    frame[self.duration_column] = y[self.duration_column]
    if self.event_col is not None:
        frame[self.event_col] = y[self.event_col]

    option_names = (
        'fit_intercept',
        'alpha',
        'coef_penalizer',
        'smoothing_penalizer',
    )
    options = {name: getattr(self, name) for name in option_names}
    fitter = AalenAdditiveFitter(**options)
    fitter.fit(
        frame,
        duration_col=self.duration_column,
        event_col=self.event_col,
        timeline=self.timeline,
        id_col=self.id_col,
        **fit_params
    )

    self.estimator = fitter
    return self
def run_survival_curve(self, df):
    """Used for testing only.

    Builds a design matrix from ``df``, fits an ``AalenAdditiveFitter``
    with 'SRV_TIME_MON' as duration and 'CENSORED' as the event column,
    then plots the predicted survival function and prints the predicted
    expectation for one hard-coded subject.
    """
    modelspec = (
        'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + '
        'PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
    )
    design = pt.dmatrix(modelspec, df, return_type='dataframe')
    design = design.join(df[['SRV_TIME_MON', 'CENSORED']])

    fitter = AalenAdditiveFitter()
    fitter.fit(design, 'SRV_TIME_MON', 'CENSORED')

    # INSERT VALUES TO TEST HERE
    subject = np.array([[1., 1961., 52., 0, 0., 2., 1., 0., 4., 2.]])

    survival = fitter.predict_survival_function(subject)
    survival.plot()
    plt.show()

    expectation = fitter.predict_expectation(subject)
    print(expectation)
def prepare_model(self):
    """Load the data, build the design matrix and return a fitted model.

    The data comes from ``self.load_and_clean_data()``; the second value
    it returns is not used here. The fitter is trained with
    'SRV_TIME_MON' as duration and 'CENSORED' as the event column.

    Returns the fitted ``AalenAdditiveFitter``.
    """
    # Get the cleaned data (second returned value is ignored).
    frame, _ = self.load_and_clean_data()

    # Fields that enter the model.
    modelspec = (
        'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + '
        'PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
    )
    design = pt.dmatrix(modelspec, frame, return_type='dataframe')
    design = design.join(frame[['SRV_TIME_MON', 'CENSORED']])

    # Create and fit the model.
    model = AalenAdditiveFitter()
    if self.verbose:
        print('Creating Aalen Additive Model')
    model.fit(design, 'SRV_TIME_MON', 'CENSORED')
    return model
def prepare_model(self):
    """Return an ``AalenAdditiveFitter`` fitted on the cleaned dataset.

    ``self.load_and_clean_data()`` supplies the frame (its second return
    value is unused). A design matrix is built from the model formula,
    joined with the 'SRV_TIME_MON' and 'CENSORED' columns, and passed to
    the fitter as duration and event column respectively.
    """
    # Load and clean the data; only the frame is needed.
    cleaned, _unused = self.load_and_clean_data()

    # Model formula listing the covariates.
    modelspec = (
        'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + '
        'PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
    )
    covariates = pt.dmatrix(modelspec, cleaned, return_type='dataframe')
    targets = cleaned[['SRV_TIME_MON', 'CENSORED']]
    training_frame = covariates.join(targets)

    # Fit the model.
    fitter = AalenAdditiveFitter()
    if self.verbose:
        print('Creating Aalen Additive Model')
    fitter.fit(training_frame, 'SRV_TIME_MON', 'CENSORED')
    return fitter
def test_aalen_additive_smoothed_plot(self, block):
    """Visual test of the smoothed hazards produced by the fitter.

    Simulates hazards and lifetimes (with a small uniform jitter added),
    draws the event indicator from Binomial(1, 0.8), fits a penalized
    ``AalenAdditiveFitter`` without intercept and plots
    ``smoothed_hazards_(1)`` with the last 500 rows left out.

    ``block`` is forwarded to ``self.plt.show``.
    """
    n_subjects = 2500
    n_covariates = 3
    times = np.linspace(0, 150, 5000)

    hz, coef, X = generate_hazard_rates(n_subjects, n_covariates, times)
    base_lifetimes = generate_random_lifetimes(hz, times)
    jitter = 0.1 * np.random.uniform(size=(n_subjects, 1))
    observed = np.random.binomial(1, 0.8, size=n_subjects)
    X["T"] = base_lifetimes + jitter
    X["E"] = observed

    # No intercept is fitted: per the original note it is already built
    # into X (X[2] is ones).
    fitter = AalenAdditiveFitter(coef_penalizer=0.1, fit_intercept=False)
    fitter.fit(X, "T", "E")

    smoothed = fitter.smoothed_hazards_(1)
    stop = fitter.cumulative_hazards_.shape[0] - 500
    axis = smoothed.iloc[:stop].plot()
    axis.set_xlabel("time")
    axis.set_title("test_aalen_additive_smoothed_plot")
    self.plt.show(block=block)
def run_survival_curve(self, df):
    """Used for testing only.

    Fits an ``AalenAdditiveFitter`` on a design matrix built from ``df``
    ('SRV_TIME_MON' as duration, 'CENSORED' as event column), plots the
    predicted survival function for one hard-coded row of covariates and
    prints the predicted expectation for it.
    """
    modelspec = (
        'YR_BRTH + AGE_DX + RADIATN + HISTREC + ERSTATUS + '
        'PRSTATUS + BEHANAL + HST_STGA + NUMPRIMS + RACE'
    )
    covariates = pt.dmatrix(modelspec, df, return_type='dataframe')
    training_frame = covariates.join(df[['SRV_TIME_MON', 'CENSORED']])

    model = AalenAdditiveFitter()
    model.fit(training_frame, 'SRV_TIME_MON', 'CENSORED')

    # INSERT VALUES TO TEST HERE
    probe = np.array([[1., 1961., 52., 0, 0., 2., 1., 0., 4., 2.]])

    curve = model.predict_survival_function(probe)
    curve.plot()
    plt.show()

    expected = model.predict_expectation(probe)
    print(expected)
def fit(self, R, y=None, Thetas=None):
    """Fit the data-fusion factorisation and, when ``y`` is given, the survival model.

    Parameters
    ----------
    R : dict
        Relations between types; passed to ``DFMF.fit``.
    y : array-like, dimensions (n x 1) or (n x 2), optional
        The first column is the time to predict. The second column is
        optional and is the event we are predicting. When ``y`` is None
        only the factorisation is fitted and ``self.regression_`` is False.
    Thetas : dict, optional
        Passed to ``DFMF.fit`` unchanged. A fresh empty dict is used when
        omitted.

    Returns
    -------
    self
    """
    # A ``dict()`` default is created once at definition time and shared by
    # every call; build a new one per call instead.
    if Thetas is None:
        Thetas = {}

    self.random_state = check_random_state(self.random_state)

    if self.verbose:
        print("Fitting data fusion procedure")
    DFMF.fit(self, R, Thetas)

    # NOTE(review): the statement nesting below was reconstructed from a
    # whitespace-collapsed source; confirm the placement of
    # ``self.outputs_`` and ``self._fit_Kaplan_Meier()``.
    if y is not None:
        y = self._check_y(y)
    self.outputs_ = y

    # Latent factors of the predictive relationship become the covariates.
    t1, t2 = self.predictive_relationship
    factors = ["Factor " + str(i) for i in range(1, self.ranks[t2] + 1)]
    x = self.G_[t1, t1].dot(self.S_[t1, t2][0])
    X = pd.DataFrame(x, columns=factors)

    if y is not None:
        self.regression_ = True
        if self.verbose:
            print("Fitting Aalen additive model")
        if y.shape[1] == 2:
            # Two columns: durations and event indicator.
            X['T'] = y[:, 0]
            X['E'] = y[:, 1]
            AalenAdditiveFitter.fit(self, X, 'T', event_col='E')
        else:
            # Durations only.
            X['T'] = y
            AalenAdditiveFitter.fit(self, X, 'T')
        self._fit_Kaplan_Meier()
    else:
        self.regression_ = False
    return self
def test_aalen_additive_plot(self, block):
    """Visual test of the fitted cumulative hazards.

    Simulates hazards and lifetimes, marks every row with event indicator
    drawn from Binomial(1, 1.0), fits a penalized ``AalenAdditiveFitter``
    without intercept and plots the result with the last 100 rows of the
    cumulative hazards left out.

    ``block`` is forwarded to ``self.plt.show``.
    """
    n_subjects = 2500
    n_covariates = 3
    times = np.linspace(0, 70, 10000)

    hz, coef, X = generate_hazard_rates(n_subjects, n_covariates, times)
    lifetimes = generate_random_lifetimes(hz, times)
    # Replace infinite lifetimes with a finite duration before fitting.
    lifetimes[np.isinf(lifetimes)] = 10
    observed = np.random.binomial(1, 1.0, size=n_subjects)
    X["T"] = lifetimes
    X["E"] = observed

    # No intercept is fitted: per the original note it is already built
    # into X (X[2] is ones).
    fitter = AalenAdditiveFitter(coef_penalizer=0.1, fit_intercept=False)
    fitter.fit(X, "T", "E")

    stop = fitter.cumulative_hazards_.shape[0] - 100
    axis = fitter.plot(iloc=slice(0, stop))
    axis.set_xlabel("time")
    axis.set_title("test_aalen_additive_plot")
    self.plt.show(block=block)
def Aalen_model(df, l2=0.01, coeff_pen=0.1, smooth_pen=0.1):
    """Create an Aalen additive fitter, fit it to ``df`` and return it.

    The fitted regression model is

        hazard(t) = b_0(t) + b_1(t)*x_1 + ... + b_N(t)*x_N

    i.e., the hazard rate is a linear function of the covariates.

    Parameters
    ----------
    df : pandas DataFrame
        The duration column must be called "Total_years". A column called
        "censored" (True/False or 1/0) is passed to the fitter as
        ``event_col``.
        NOTE(review): lifelines treats ``event_col`` as "event observed";
        confirm the "censored" column is encoded that way.
    l2 : float
        Accepted but not used by this function.
    coeff_pen : float
        L2 penalizer on the size of the coefficients during regression.
        Improves the stability of the estimates and controls for high
        correlation between covariates, e.g. it shrinks the absolute value
        of c_{i,t}. Recommended, even if a small value.
    smooth_pen : float
        L2 penalizer on the difference between coefficients adjacent in
        time, e.g. it shrinks the absolute value of c_{i,t} - c_{i,t+1}.

    Fixed, non-adjustable settings
    ------------------------------
    fit_intercept=False
        Add a column of ones to ``df`` to model the baseline hazard.
    nn_cumulative_hazard=True
        Forces negative hazard values to zero.

    Returns
    -------
    The ``AalenAdditiveFitter`` instance fitted to ``df``.
    """
    settings = {
        'fit_intercept': False,
        'coef_penalizer': coeff_pen,
        'smoothing_penalizer': smooth_pen,
        'nn_cumulative_hazard': True,
    }
    fitter = AalenAdditiveFitter(**settings)
    fitter.fit(df, 'Total_years', event_col='censored')
    return fitter
# -*- coding: utf-8 -*-
# aalen additive: times one AalenAdditiveFitter fit on an enlarged dataset
if __name__ == "__main__":
    import time

    import pandas as pd
    from lifelines import AalenAdditiveFitter
    from lifelines.datasets import load_rossi

    # Stack five copies of the dataset so the fit has more rows to process.
    rossi = load_rossi()
    enlarged = pd.concat([rossi for _ in range(5)]).reset_index(drop=True)
    print("Size: ", enlarged.shape)

    fitter = AalenAdditiveFitter()
    started = time.time()
    fitter.fit(enlarged, duration_col="week", event_col="arrest")
    elapsed = time.time() - started
    print("--- %s seconds ---" % elapsed)
    print(fitter.score_)
# Drop rows whose duration is zero before fitting.
df = df[df['Duration'] != 0]

# Covariates for the regression; get_dummies one-hot encodes the
# categorical columns.
df2 = df.loc[:, [
    'DISTRIBUTION CHANNEL', 'GENDER', 'SMOKER STATUS', 'PremiumPattern',
    'BENEFITS TYPE', 'BROKER COMM'
]]
#df2 = df.loc[:, ['GENDER', 'SMOKER STATUS', 'PremiumPattern']]
#df2 = df.loc[:, ['SMOKER STATUS', 'GENDER']]
df2 = pd.get_dummies(df2)

#T = df['Duration']
# NOTE(review): the assignment of T above is commented out, so T must
# already be defined earlier in the script; confirm, otherwise the
# `df2['T'] = T` line below raises NameError.
# Event indicator: True exactly where LapseIndicator equals 1.
E = df['LapseIndicator'] == 1
df2['E'] = E
df2['T'] = T

aaf = AalenAdditiveFitter()
aaf.fit(df2, 'T', event_col='E', show_progress=True)

# Use a context manager so the pickle file is flushed and closed even if
# dumping fails.
with open('Smoker_Gender_All.pkl', 'wb') as model_file:
    pickle.dump(aaf, model_file)
aaf.plot()

#cph = CoxPHFitter()
#cph.fit(df2, duration_col='T', event_col='E', show_progress=True, strata=['SMOKER STATUS_No','SMOKER STATUS_Yes',
#                                                                          'GENDER_F', 'GENDER_M'])
#pickle.dump(cph, open('Smoker_Gender_CPF.pkl', 'wb'))
#cph.plot()
coef exp(coef) se(coef) z p lower 0.95 upper 0.95 var1 0.2213 1.2477 0.0743 2.9796 0.0029 0.0757 0.3669 ** var2 0.0509 1.0522 0.0829 0.6139 0.5393 -0.1116 0.2134 var3 0.2186 1.2443 0.0758 2.8836 0.0039 0.0700 0.3672 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Concordance = 0.580 """ cph.plot() # Using Aalen's Additive model aaf = AalenAdditiveFitter(fit_intercept=False) aaf.fit(regression_dataset, duration_col='T', event_col='E') aaf.plot() X = regression_dataset.drop(['E', 'T'], axis=1) aaf.predict_survival_function( X.iloc[10:12]).plot() # get the unique survival functions of two subjects scores = k_fold_cross_validation(cph, regression_dataset, duration_col='T', event_col='E', k=10) print(scores) print(np.mean(scores)) print(np.std(scores))
naf = NelsonAalenFitter()
naf.fit(T, event_observed=E)
# Instead of a survival_function_ being exposed, a cumulative_hazard_ is.

# Survival Regression
from lifelines.datasets import load_regression_dataset
regression_dataset = load_regression_dataset()
regression_dataset.head()

from lifelines import AalenAdditiveFitter, CoxPHFitter

# Using Cox Proportional Hazards model
cf = CoxPHFitter()
cf.fit(regression_dataset, 'T', event_col='E')
cf.print_summary()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, 'T', event_col='E')

# Covariates only. `columns - [...]` (Index set difference via `-`) and
# `.ix` are no longer available in current pandas; `drop` also keeps the
# column order used during fitting.
x = regression_dataset.drop(['E', 'T'], axis=1)
# Get the unique survival functions of two subjects (rows 10 and 11).
aaf.predict_survival_function(x.iloc[10:12]).plot()
aaf.plot()
from lifelines.datasets import generate_regression_dataset
regression_dataset = generate_regression_dataset()

from lifelines import AalenAdditiveFitter, CoxPHFitter

# Cox proportional hazards model.
cf = CoxPHFitter()
cf.fit(regression_dataset, duration_col='T', event_col='E')

# Aalen additive model, fitted without an intercept.
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')

# Covariates only. `columns - [...]` (Index set difference via `-`) and
# `.ix` are no longer available in current pandas; `drop` also keeps the
# column order used during fitting.
x = regression_dataset.drop(['E', 'T'], axis=1)
# `.loc` keeps the inclusive label slice that `.ix[10:12]` performed on an
# integer-labelled index (labels 10, 11 and 12).
aaf.predict_survival_function(x.loc[10:12]).plot()
aaf.plot()
def aalen_aditive(in_df):
    """Fit an Aalen additive model on ``in_df`` and return the fitted model.

    Parameters
    ----------
    in_df : pandas DataFrame
        Must contain the duration column 'LivingDays' and the event column
        'Dead', and no missing values.

    Returns
    -------
    The fitted ``AalenAdditiveFitter`` (fitted with ``fit_intercept=False``).

    Raises
    ------
    AssertionError
        If ``in_df`` contains any missing value.
    """
    # Raised explicitly instead of via `assert`, so the check still runs
    # under `python -O`; the exception type is kept for existing callers.
    if in_df.isnull().values.any():
        raise AssertionError("in_df must not contain missing values")

    aaf = AalenAdditiveFitter(fit_intercept=False)
    aaf.fit(in_df, 'LivingDays', event_col='Dead')
    # Hand the fitted model back; previously it was discarded.
    return aaf
from lifelines import AalenAdditiveFitter, CoxPHFitter

# Using Cox Proportional Hazards model
cf = CoxPHFitter()
cf.fit(regression_dataset, 'T', event_col='E')
cf.print_summary()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, 'T', event_col='E')

# Covariates only. `columns - [...]` (Index set difference via `-`) and
# `.ix` are no longer available in current pandas; `drop` also keeps the
# column order used during fitting.
x = regression_dataset.drop(['E', 'T'], axis=1)
# Get the unique survival functions of two subjects (rows 10 and 11).
aaf.predict_survival_function(x.iloc[10:12]).plot()
aaf.plot()
plt.xlim(0, 80) plt.tight_layout() plt.savefig( '/home/raed/Dropbox/INSE - 6320/Final Project/Cumulative_Hazard_for_each_State.pdf' ) plt.show() #Survival Regression using the following covariates : Couple Race, Income Range, State and Marriage Date X = patsy.dmatrix( 'State + Couple_Race + Household_Income_Range + Husband_Education + Husband_Race + Marriage_Date -1', data, return_type='dataframe') aaf = AalenAdditiveFitter(coef_penalizer=1.0, fit_intercept=True) X['T'] = data['Duration'] X['E'] = data['Divorce'] aaf.fit(X, 'T', event_col='E') aaf.cumulative_hazards_.head() sns.set() aaf.plot(columns=[ 'State[Alabama]', 'baseline', 'Couple_Race[T.Same-Race]', 'Household_Income_Range[T.42,830$ - 44,765$]' ], ix=slice(1, 15)) plt.savefig( '/home/raed/Dropbox/INSE - 6320/Final Project/Survival_Regression_for_Alabamae.pdf' ) plt.show() aaf.plot(columns=[ 'State[Mississippi]', 'baseline', 'Couple_Race[T.Same-Race]',
def kfoldcv(data, k=5, m=10, penalizer=0.5, timeinterval=np.linspace(1,20,20), duration_col='ndays_act', event_col='observed'):
    """
    Trains an AalenAdditiveFitter on ``data`` and cross-validates it (k-fold).
    Based on the lifelines library for survival analysis in Python.

    NOTE(review): the statement nesting shown here was reconstructed from a
    whitespace-collapsed source; confirm the loop bodies and the position of
    the final print against the original file.

    data: Pandas dataframe.
    k: number of folds
    m: number of time units to be included in the cross validation
    penalizer: argument of class AalenAdditiveFitter (lifelines library)
    timeinterval: passed as ``timeline`` to AalenAdditiveFitter().fit.
        Time points that are fitted.
    duration_col: last column of data. It contains the lifetime of each case.
    event_col: second-to-last column of data. It contains the censorships.
        So far this function only works without censorships, that is, all
        death events must be observed. Therefore, it must be a column of ones.

    Prints: Average relative error of the predicted probabilities.
    Nothing is returned; ``scores`` is filled but not handed back.
    """
    aaf = AalenAdditiveFitter(penalizer=penalizer)
    n, d = data.shape  # d is not used below
    data = data.copy()
    # Shuffle the rows, then sort them by the event column.
    # NOTE(review): DataFrame.sort is not available in current pandas
    # (sort_values replaced it) — confirm the targeted pandas version.
    data = data.reindex(np.random.permutation(data.index)).sort(event_col)
    scores = []
    # Fold labels 1..k repeated and truncated to one label per row.
    assignments = np.array((n // k + 1) * list(range(1, k + 1)))
    assignments = assignments[:n]
    # NOTE(review): `Index - list` as a set difference and `.ix` (used
    # below) are not available in current pandas — confirm the targeted version.
    testing_columns = data.columns - [duration_col, event_col]
    for i in range(1, k + 1):
        ix = assignments == i
        training_data = data.ix[~ix]
        testing_data = data.ix[ix]
        # T_actual, E_actual and X_testing are assigned but not read again
        # in this function.
        T_actual = testing_data[duration_col].values
        E_actual = testing_data[event_col].values
        X_testing = testing_data[testing_columns]
        aaf.fit(training_data, duration_col=duration_col, event_col=event_col, timeline=timeinterval)
        # Per-fold accumulators.
        used_ind = []
        prec_sum = 0
        rel_sum = 0
        rel_error_list = []  # appended to below, not read again
        df = testing_data
        # ndays must be the last column, and observed the second-to-last
        for j,row in df.iterrows():
            # Handle each group of rows with identical covariates once.
            if not j in used_ind:
                # All test rows whose values (all columns but the last two)
                # equal those of row j.
                a = df[np.all(df.ix[:,0:-2].values==df.ix[j,:-2].values, axis=1)]
                list_ = a.index.tolist()
                used_ind += list_
                # Relative frequency of each duration within the group.
                # NOTE(review): uses the literal column `ndays_act`, not
                # `duration_col`.
                actual_rate_series = a.ndays_act.value_counts() / a.shape[0]
                mini = min(actual_rate_series.shape[0], m)
                actual_rate = np.array(actual_rate_series)[:mini]
                pred_df = aaf.predict_survival_function(a.iloc[0,:-2][None,:])
                pred_array = np.array(pred_df)
                # Differences of consecutive predicted survival values,
                # starting from 1 - S[0].
                pred_rate = np.zeros(mini)
                pred_rate[0] = 1 - pred_array[0]
                for alpha in range(1, mini):
                    pred_rate[alpha] = pred_array[alpha-1] - pred_array[alpha]
                # Relative error, normalised by the larger of the two rates.
                # NOTE(review): no guard for maxi == 0.
                maxi = np.maximum(pred_rate, actual_rate)
                rel_error = np.abs(pred_rate - actual_rate) / maxi
                rel_error_list.append(rel_error)
                # Share of time units with relative error <= 0.15.
                # NOTE(review): int / int; under Python 2 this is floor
                # division unless `division` is imported from __future__
                # elsewhere in the file — confirm.
                succes_rate = len(rel_error[rel_error <= 0.15]) / mini
                # Weight both measures by the group size.
                prec_sum += succes_rate * len(a)
                rel_sum += np.sum(rel_error) / mini * len(a)
        precision = prec_sum / df.shape[0]
        relative = rel_sum / df.shape[0]
        scores.append(precision)
    # Python 2 print statement. NOTE(review): as laid out here only the last
    # fold's `relative` is printed — confirm against the original nesting.
    print "Average relative error: ", relative