# Partition the data on the Has_Children flag (1 vs. 0); both subsets are
# fitted at the end of this section.
has_children = cph_data.loc[cph_data['Has_Children'] == 1]
does_not_have_children = cph_data.loc[cph_data['Has_Children'] == 0]

# One Kaplan-Meier fit per education subset. The Educaton_* frames are defined
# outside this section (presumably splits by years of education, going by the
# legend labels below).
kmf0 = KaplanMeierFitter().fit(
    Educaton_less12['Duration'],
    event_observed=Educaton_less12['Divorce'],
)
kmf1 = KaplanMeierFitter().fit(
    Educaton_12['Duration'],
    event_observed=Educaton_12['Divorce'],
)
kmf2 = KaplanMeierFitter().fit(
    Educaton_16plus['Duration'],
    event_observed=Educaton_16plus['Divorce'],
)

# Draw the three log-log curves on a shared axes, label them in plotting
# order, then write the figure to disk before showing it.
fig, axes = plt.subplots()
for education_fit in (kmf0, kmf1, kmf2):
    education_fit.plot_loglogs(ax=axes)
axes.legend(['Less Than 12', '12 years', '16 years and above'])
plt.savefig(
    '/home/raed/Dropbox/INSE - 6320/Final Project/CPH_Pproportional_Hazards_Assumption.pdf'
)
plt.show()

# Kaplan-Meier fits for the with-children / without-children subsets.
kmf3 = KaplanMeierFitter().fit(
    has_children['Duration'],
    event_observed=has_children['Divorce'],
)
kmf4 = KaplanMeierFitter().fit(
    does_not_have_children['Duration'],
    event_observed=does_not_have_children['Divorce'],
)
# Plot the log-log curves: log(-log(survival curve)) vs log(time), one curve
# per level of a covariate. If the curves are parallel (and hence do not cross
# each other), then it's likely the variable satisfies the proportional
# hazards assumption. If the curves do cross, the variable likely has to be
# "stratified" instead.

# Third positional argument of utility.train_test_splitter. A value of
# int(df_cox.shape[0] * 0.1) used to be computed here and was immediately
# overwritten, so only the value that actually takes effect is kept.
ts = 4

# Only the first (train, test, trial_i) triple from the splitter is used.
train, test, trial_i = next(
    iter(utility.train_test_splitter(df_cox, 5, ts, key='ORIG_DTE'))
)

T, E = train['AGE'], train['did_dflt']

# ORIG_CHN
# NOTE(review): plt.subplot() is called with no arguments before each group of
# curves; depending on the matplotlib version this may return the same axes
# each time, so the groups could end up overlaid on one plot -- confirm that
# this is intended.
ax = plt.subplot()
for chn in train.ORIG_CHN.unique():
    # Build the mask from `train`, the frame T and E are taken from, so the
    # boolean index is guaranteed to line up with them (same approach as the
    # PURPOSE loop below).
    is_chn = (train.ORIG_CHN == chn)
    kmf.fit(T[is_chn], event_observed=E[is_chn], label=chn)
    kmf.plot_loglogs(ax=ax)

# PURPOSE
ax = plt.subplot()
for purpose in train.PURPOSE.unique():
    is_pur = (train.PURPOSE == purpose)
    kmf.fit(T[is_pur], event_observed=E[is_pur], label=purpose)
    kmf.plot_loglogs(ax=ax)

# Fit the CPH model through the project's models.PHR wrapper.
model = models.PHR(train, test, formula, 'did_dflt', 'LOAN_ID')
model.fit_model()

# 2. Compare the baseline of the CPH model with the Kaplan-Meier fit on the
# whole training set to see the significance of the covariates.
ax = plt.subplot()
kmf.fit(train['AGE'], event_observed=train['did_dflt'])
# One panel per entry of `countries`: log-log survival curve of the rows whose
# 'continent' equals that entry against the curve of all remaining rows.
fig = plt.figure(figsize=(10, 12))
plt.subplots_adjust(hspace=0.4)

continent_column = data_not_empty['continent']
for i, continent in enumerate(countries):
    continent_data = data_not_empty[continent_column == continent]
    other_data = data_not_empty[continent_column != continent]

    kmf_continent = KaplanMeierFitter().fit(
        continent_data['duration'],
        event_observed=continent_data['observed'],
    )
    kmf_other = KaplanMeierFitter().fit(
        other_data['duration'],
        event_observed=other_data['observed'],
    )

    # Panels are laid out on a fixed 3 x 2 grid; subplot positions are 1-based.
    ax = fig.add_subplot(3, 2, i + 1)
    ax.set_title(f'{continent} vs other')
    kmf_continent.plot_loglogs(ax=ax, label=continent)
    kmf_other.plot_loglogs(ax=ax, label='other')

fig.show()

# Same kind of comparison for rows of `merged` with branch 'F' versus 'O'.
branch_column = merged['branch']
data_financial = merged[branch_column == 'F']
data_other = merged[branch_column == 'O']

kmf_financial = KaplanMeierFitter().fit(
    data_financial['duration'],
    event_observed=data_financial['observed'],
)
kmf_other = KaplanMeierFitter().fit(
    data_other['duration'],
    event_observed=data_other['observed'],
)

fig, axes = plt.subplots()
kmf_financial.plot_loglogs(ax=axes)
from lifelines import CoxPHFitter

cph = CoxPHFitter()
# Select the columns passed to the model: the covariates plus the duration
# ('stop') and event ('event') columns.
cph_bladder_df = bladder[['rx', 'number', 'size', 'enum', 'stop', 'event']]
cph.fit(cph_bladder_df, duration_col='stop', event_col='event')
# Output the coefficients. print_summary() writes the table itself, so it is
# called directly; wrapping it in print() only added a stray "None" line.
cph.print_summary()

# Log-log survival curves for the two 'rx' groups, drawn on a shared axes.
rx1 = bladder.loc[bladder['rx'] == 1]
rx2 = bladder.loc[bladder['rx'] == 2]
kmf1 = KaplanMeierFitter()
kmf1.fit(rx1['stop'], event_observed=rx1['event'])
kmf2 = KaplanMeierFitter()
kmf2.fit(rx2['stop'], event_observed=rx2['event'])

fig, axes = plt.subplots()
kmf1.plot_loglogs(ax=axes)
kmf2.plot_loglogs(ax=axes)
axes.legend(['rx1', 'rx2'])
plt.show()

# Refit on the same columns with 'rx' as a stratification variable.
cph_start = CoxPHFitter()
cph_start.fit(cph_bladder_df, duration_col='stop', event_col='event', strata=['rx'])
cph_start.print_summary()