def fun(epsilon):
    """Print the mean relative error of Kaplan-Meier fits on noised histograms.

    For 100 trials the histogram ``his`` is perturbed with
    ``laplace_mechanism`` using scale ``sqrt(2) / epsilon``, negative counts
    are clamped to zero, and a synthetic sample is rebuilt by spreading each
    bin's count evenly over the bin.  The first ``bins0`` bins are given
    event indicator 0 and the following ``bins1`` bins event indicator 1.  A
    Kaplan-Meier curve fitted on the synthetic sample is evaluated on the
    timeline of the module-level fitter ``kmf`` and compared against
    ``true_value[:, 0]`` with a relative L2 norm.

    Relies on the module-level names ``his``, ``bins0``, ``bins1``,
    ``bin_edges0``, ``bin_edges1``, ``kmf``, ``true_value`` and
    ``laplace_mechanism``.

    Parameters:
        epsilon: divisor of ``sqrt(2)`` in the noise scale (presumably the
            privacy budget -- confirm against ``laplace_mechanism``).

    Output:
        Prints ``(epsilon, average relative error)``.  Returns nothing.
    """
    noise_scale = np.sqrt(2.0) / epsilon
    reference = true_value[:, 0]
    reference_norm = np.linalg.norm(reference)

    errors = []
    for _ in range(100):
        noisy = laplace_mechanism(his, noise_scale)
        # np.linspace / np.zeros / np.ones require integer sizes; truncating
        # matches what older NumPy releases did implicitly with float sizes.
        counts = [int(max(0.0, d)) for d in noisy]

        # Collect the per-bin pieces and concatenate once instead of growing
        # the arrays with repeated np.append calls.
        time_parts = [np.asarray([])]
        event_parts = [np.asarray([])]
        for i in range(bins0):
            time_parts.append(
                np.linspace(bin_edges0[i], bin_edges0[i + 1], counts[i]))
            event_parts.append(np.zeros(counts[i]))
        for i in range(bins1):
            count = counts[bins0 + i]
            time_parts.append(
                np.linspace(bin_edges1[i], bin_edges1[i + 1], count))
            event_parts.append(np.ones(count))
        ntime = np.concatenate(time_parts)
        nevent = np.concatenate(event_parts)

        kmf1 = KaplanMeierFitter()
        kmf1.fit(ntime, event_observed=nevent)
        out = kmf1.predict(kmf.timeline)

        errors.append(np.linalg.norm(out - reference) / reference_norm)

    avg = np.average(errors)
    # Parenthesised single argument: prints identically on Python 2 and 3.
    print("(%f, %f)" % (epsilon, avg))
def kaplan_meier(out, t, ttype):
    """Fit and return a Kaplan-Meier estimator labelled by group and size.

    Parameters:
        out: event indicators, passed to the fitter as ``event_observed``.
        t: durations, one per entry of ``out``.
        ttype: integer group id, formatted into the label as ``Rand<ttype>``.

    Returns:
        The fitted ``KaplanMeierFitter``; its label is
        ``"Rand<ttype>; <len(out)> obs."``.
    """
    label = "Rand%d; %d obs." % (ttype, len(out))
    fitter = KaplanMeierFitter()
    fitter.fit(t, event_observed=out, label=label)
    return fitter
def plot_Kaplan_Meier_feature(donor_dataset):
    '''Plot one pair of Kaplan-Meier curves per feature column.

    For every column other than 'Total_years', 'censored' and 'Baseline',
    the donors are split into those whose value is above the feature mean
    and the rest.  The mean is computed over the rows where 'censored' == 0.
    Both groups are fitted and drawn on the same 5x5 inch figure.

    Parameters:
        donor_dataset: Pandas dataframe which contains at least the columns
            'Total_years' and 'censored'.  'Total_years' is used as the
            duration and 'censored' is passed to the fitter as the
            event-observed indicator.  A 'Baseline' column, if present, is
            not plotted.

    Output:
        Kaplan-Meier plot(s). This function does not return anything.
    '''
    T = donor_dataset['Total_years']
    C = donor_dataset['censored']
    # NOTE(review): C is handed to lifelines as ``event_observed``.  If
    # 'censored' == True really means "still active", the indicator is
    # inverted -- confirm how the column is encoded upstream.

    # 'Baseline' is optional, so filter instead of list.remove(), which
    # raised ValueError when the column was absent.
    excluded = ('Total_years', 'censored', 'Baseline')
    features = [col for col in donor_dataset.columns if col not in excluded]

    # Rows used for the per-feature mean; invariant across features.
    uncensored = donor_dataset[donor_dataset['censored'] == 0]

    for feature in features:
        Above_mean = donor_dataset[feature] > uncensored[feature].mean()

        fig = plt.figure(figsize=(5, 5))
        ax = fig.add_subplot(111)

        kmf = KaplanMeierFitter()
        kmf.fit(T[Above_mean], C[Above_mean], label=feature + ': Yes or > mean')
        kmf.plot(ax=ax, linewidth=2)
        kmf.fit(T[~Above_mean], C[~Above_mean], label=feature + ': No or < mean')
        kmf.plot(ax=ax, linewidth=2)

        ax.set_xlabel('Years', size=10)
        ax.set_ylabel('Surviving donor population', size=10)
        ax.set_xlim(0, 40)
        ax.set_ylim(0, 1)
        ax.grid()
        ax.legend(loc='upper right', fontsize=10)
        plt.show()
def survival_analysis(dataframe, grouping, years=5):
    """Plot Kaplan-Meier curves per group with follow-up capped at ``years``.

    Rows with a missing value in ``grouping``, '_OS' or '_EVENT' are dropped.
    Rows whose '_OS' exceeds ``years * 365`` get their survival time set to
    that limit and their event indicator set to 0.  One curve is drawn per
    distinct value of ``grouping``, in sorted order, on a shared axes.

    Parameters:
        dataframe: pandas DataFrame holding '_OS' (duration), '_EVENT'
            (event indicator) and the ``grouping`` column.  It is not
            modified.
        grouping: name of the column that defines the groups.
        years: follow-up limit; multiplied by 365 to give the cut-off in the
            units of '_OS' (presumably days -- confirm with the data source).

    Output:
        Shows the plot.  Returns nothing.
    """
    # Work on an explicit copy: the previous chained assignments
    # (df2['survival'][mask] = ...) wrote through a possibly temporary
    # object and do nothing under pandas copy-on-write.
    df2 = dataframe.dropna(subset=[grouping, '_OS', '_EVENT']).copy()

    maxtime = years * 365
    within_limit = df2['_OS'] <= maxtime
    df2['survival'] = df2['_OS'].where(within_limit, maxtime)
    df2['event'] = df2['_EVENT'].where(within_limit, 0)

    grouped_data = df2.groupby(grouping)
    unique_groups = sorted(grouped_data.groups.keys())

    # A single fitter is refitted for each group and drawn on the same axes.
    kmf = KaplanMeierFitter()
    ax = plt.subplot(111)
    for group in unique_groups:
        data = grouped_data.get_group(group)
        kmf.fit(data['survival'], data['event'], label=group)
        kmf.plot(ax=ax, show_censors=True)
    plt.show()
def __KM_analysis(self, duration_table, expressed_array, unexpressed_array, freq_set):
    """Run a log-rank test between two sample groups and plot if significant.

    Parameters:
        duration_table: iterable of rows ``(sample_id, duration, event)``.
            The first row is skipped.  Rows whose duration or event is the
            string "NA" are ignored.
        expressed_array: sample ids forming the "Satisfying" group.
        unexpressed_array: sample ids forming the "None-Satisfying" group.
            A sample listed in both collections is counted here.
        freq_set: value shown in the plot title.

    Returns:
        The p-value of the log-rank test.  When it is below 0.0006 both
        Kaplan-Meier curves are also drawn and shown.
    """
    # Sets give constant-time membership tests; the check runs once per row.
    expressed_ids = set(expressed_array)
    unexpressed_ids = set(unexpressed_array)

    expressed_T, expressed_C = [], []
    unexpressed_T, unexpressed_C = [], []
    for idx, row in enumerate(duration_table):
        if idx == 0:
            continue
        # The unexpressed group takes precedence, as in the original chain.
        if row[0] in unexpressed_ids:
            durations, events = unexpressed_T, unexpressed_C
        elif row[0] in expressed_ids:
            durations, events = expressed_T, expressed_C
        else:
            continue
        if row[1] == "NA" or row[2] == "NA":
            continue
        durations.append(float(row[1]))
        events.append(int(row[2]))

    results = logrank_test(expressed_T, unexpressed_T, expressed_C, unexpressed_C, alpha=.95)

    if results.p_value < .0006:
        ax = plt.subplot(111)
        kmf = KaplanMeierFitter()
        kmf.fit(expressed_T, event_observed=expressed_C, label="Satisfying")
        kmf.plot(ax=ax, ci_force_lines=False)
        kmf.fit(unexpressed_T, event_observed=unexpressed_C, label="None-Satisfying")
        kmf.plot(ax=ax, ci_force_lines=False)
        plt.ylim(0, 1)
        plt.title("Lifespans (" + str(freq_set) + ")")
        plt.show()
    return results.p_value
def plot_survival_curves(rec_t, rec_e, antirec_t, antirec_e, experiment_name='', output_file=None):
    """Plot two Kaplan-Meier curves and annotate the log-rank p-value.

    Parameters:
        rec_t, rec_e: durations and event indicators of the
            "Recommendation" group.
        antirec_t, antirec_e: durations and event indicators of the
            "Anti-Recommendation" group.
        experiment_name: prefix prepended to both legend labels.
        output_file: when truthy, the figure is saved to this path after
            ``tight_layout``.

    Output:
        Draws on a new 12x3 inch figure and prints the log-rank summary.
        Returns nothing.
    """
    # Set-up plots
    plt.figure(figsize=(12, 3))
    ax = plt.subplot(111)

    # Fit survival curves; one fitter is reused for both groups.
    kmf = KaplanMeierFitter()
    kmf.fit(rec_t, event_observed=rec_e,
            label=' '.join([experiment_name, "Recommendation"]))
    kmf.plot(ax=ax, linestyle="-")
    kmf.fit(antirec_t, event_observed=antirec_e,
            label=' '.join([experiment_name, "Anti-Recommendation"]))
    kmf.plot(ax=ax, linestyle="--")

    # Format graph
    plt.ylim(0, 1)
    ax.set_xlabel('Timeline (months)', fontsize='large')
    ax.set_ylabel('Percentage of Population Alive', fontsize='large')

    # Calculate p-value
    results = logrank_test(rec_t, antirec_t, rec_e, antirec_e, alpha=.95)
    results.print_summary()

    # Place the annotation at one ninth of the longest observed duration.
    xloc = max(np.max(rec_t), np.max(antirec_t)) / 9
    if results.p_value < 1e-5:
        # Raw string: "\m" is not a valid escape in a normal string literal.
        ax.text(xloc, .2, r'$p < 1\mathrm{e}{-5}$', fontsize=20)
    else:
        ax.text(xloc, .2, '$p=%f$' % results.p_value, fontsize=20)
    plt.legend(loc='best', prop={'size': 15})

    if output_file:
        plt.tight_layout()
        pylab.savefig(output_file)
def kmplot(df_high, df_low):
    """Compare a "high" and a "low" group with Kaplan-Meier and log-rank.

    Parameters:
        df_high, df_low: objects exposing ``duration`` and ``event``
            attributes and supporting ``len()``.

    Returns:
        ``(p_value, hm5, hm10, lm5, lm10)``: the log-rank p-value followed by
        the predicted survival of the high group at times 60 and 120 and of
        the low group at times 60 and 120.  If fitting raises ``ValueError``
        the placeholder ``("NA", "0", "0", "0", "0")`` is returned instead.
    """
    high_fitter = KaplanMeierFitter()
    low_fitter = KaplanMeierFitter()
    try:
        high_fitter.fit(durations=df_high.duration,
                        event_observed=df_high.event,
                        label='High: n = %d' % len(df_high))
        low_fitter.fit(durations=df_low.duration,
                       event_observed=df_low.event,
                       label="Low: n = %d" % len(df_low))
    except ValueError:
        return ("NA", "0", "0", "0", "0")

    test = logrank_test(df_high.duration, df_low.duration,
                        event_observed_A=df_high.event,
                        event_observed_B=df_low.event)

    # Evaluation order matches the result layout: high@60, high@120,
    # low@60, low@120.
    predictions = tuple(fitter.predict(horizon)
                        for fitter in (high_fitter, low_fitter)
                        for horizon in (60, 120))
    return (test.p_value,) + predictions
def surAnalysis(storeId):
    """Save the Kaplan-Meier curve of one store as an image.

    Reads every document with the given ``store_id`` from the module-level
    ``survival`` collection, divides each ``duration`` by 86400 and fits a
    Kaplan-Meier estimator on the durations and ``observed`` flags.  Nothing
    is written when the store has no documents.

    Parameters:
        storeId: store identifier; also used as the image file name, so it
            must be a string.

    Output:
        Writes the figure below ``F:\\workshop\\lbs_lyf\\static\\images\\``
        and closes it.  Returns nothing.
    """
    duration = []
    observed = []
    for elem in survival.find({'store_id': storeId}):
        # presumably seconds -> days; TODO confirm the stored unit
        duration.append(elem['duration'] / 86400)
        observed.append(elem['observed'])

    if not duration:
        return

    kmf = KaplanMeierFitter()
    kmf.fit(array(duration), array(observed))
    ax = kmf.plot()
    figure = ax.get_figure()
    # Backslashes are escaped explicitly; the resulting path is unchanged,
    # but "\w", "\l", "\s" and "\i" are no longer invalid escape sequences.
    figure.savefig('F:\\workshop\\lbs_lyf\\static\\images\\' + storeId)
    plt.close(figure)
def generate_plot():  # Perhaps `regenerate_plot`?
    """Refit the Kaplan-Meier curve for the current widget selection.

    Starts from a copy of the module-level ``df``, applies the category,
    size, age and sex constraints read from the selection widgets, and pushes
    the fitted survival function into ``renderer``'s data source.  ``status``
    reports how many cases matched.  If no case matches, only the status text
    is updated.
    """
    selected = df.copy()

    # Drop every category whose checkbox is not active.
    for index in range(len(categories)):
        if index in category_select.active:
            continue
        selected = selected[selected.category != category_select.labels[index]]

    # Inclusive range filters, applied in the same order as before.
    selected = selected[selected['size'] >= min_size_select.value]
    selected = selected[selected['size'] <= max_size_select.value]
    selected = selected[selected.age >= min_age_select.value]
    selected = selected[selected.age <= max_age_select.value]

    active_sexes = sex_select.active
    if 0 not in active_sexes:  # Male
        selected = selected[selected.sex != 1]
    if 1 not in active_sexes:  # Female
        selected = selected[selected.sex != 2]

    if len(selected) == 0:  # Bad constraints
        status.text = 'No cases found. Try different constraints.'
        return

    # The event of interest is "did not survive".
    died = [not survived for survived in selected.survived]

    fitted = KaplanMeierFitter().fit(selected.days, event_observed=died,
                                     label='prob_of_surv')
    curve = fitted.survival_function_
    renderer.data_source.data.update(x=curve.index, y=curve.prob_of_surv)

    # bounds='auto' doesn't work?
    start, end = 0, max(selected.days)
    plot.x_range.update(start=start, end=end, bounds=(start, end))

    status.text = '{} cases found.'.format(len(selected))
def plot_Kaplan_Meier_overall(donor_dataset):
    '''Plot a single Kaplan-Meier curve over all donors.

    Parameters:
        donor_dataset: Pandas dataframe which contains at least the columns
            'Total_years' and 'censored'.  'Total_years' is used as the
            duration and 'censored' is passed to the fitter as the
            event-observed indicator.

    Output:
        A Kaplan-Meier plot labelled 'Overall' on a 5x5 inch figure.
        This function does not return anything.
    '''
    durations = donor_dataset['Total_years']
    indicator = donor_dataset['censored']

    # Fit one estimator on the whole data set.
    overall = KaplanMeierFitter()
    overall.fit(durations, indicator, label='Overall')

    # Draw it on a dedicated figure and style the axes.
    figure = plt.figure(figsize=(5, 5))
    axes = figure.add_subplot(111)
    overall.plot(ax=axes)

    axes.set_xlabel('Years', size=20)
    axes.set_ylabel('Surviving donor population', size=20)
    axes.set_xlim(0, 40)
    axes.set_ylim(0, 1)
    axes.grid()
    axes.legend(loc='best', fontsize=20)

    plt.show()
def get_sa(request):
    """Render ``sa_test.html`` with Kaplan-Meier and Nelson-Aalen plot paths.

    The two images live under ``<parent of this file's directory>/images``.
    When either of them is missing, both are regenerated from the Waltons
    example data set.

    Parameters:
        request: the incoming request, forwarded to ``render``.

    Returns:
        The response produced by ``render`` with the context keys ``kmf``
        and ``naf`` holding the image paths.
    """
    dirname = os.path.dirname(os.path.dirname(__file__)).replace('\\', '/')
    kmffile = '/images/test1.jpg'
    naffile = '/images/test2.jpg'
    context = {'kmf': kmffile, 'naf': naffile}

    # Regenerate when EITHER image is missing; the previous "and" left a
    # single missing image permanently absent.
    images_present = (os.path.exists(dirname + kmffile)
                      and os.path.exists(dirname + naffile))
    if not images_present:
        df = load_waltons()
        T = df['T']  # an array of durations
        E = df['E']  # boolean or binary array: was the 'death' observed?

        kmf = KaplanMeierFitter(alpha=0.95)
        kmf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='KM_estimate', alpha=None, left_censorship=False,
                ci_labels=None)
        naf = NelsonAalenFitter(alpha=0.95, nelson_aalen_smoothing=True)
        naf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='NA_estimate', alpha=None, ci_labels=None)

        # Each estimate gets its own figure.  Previously both were drawn on
        # the same implicit axes, so the second image also contained the
        # Kaplan-Meier curve.  Figures are closed so they do not accumulate.
        for fitter, target in ((kmf, kmffile), (naf, naffile)):
            fig = plt.figure()
            fitter.plot(ax=fig.add_subplot(111))
            fig.savefig(dirname + target)
            plt.close(fig)

    return render(request=request, template_name='sa_test.html', context=context)
def survival(time, status, pGroups=None):
    """Plot Kaplan-Meier curves, optionally split into groups.

    Parameters:
        time: indexable of durations; entries equal to "" are skipped.
        status: indexable of event indicators; entries equal to "" are
            skipped.
        pGroups: optional sequence of ``(label, color, indices)`` entries.
            When omitted, a single red curve is drawn from index 2 onwards
            (the first two entries are skipped).  When given, one curve per
            non-empty group is drawn and the multivariate log-rank p-value is
            added as a legend title in the lower left corner.

    Returns:
        The matplotlib axes that were drawn on, or ``None`` when ``pGroups``
        is given but no group contains a usable sample.
    """
    def usable(indices):
        # Keep only samples with both a duration and a status.
        return [i for i in indices if time[i] != "" and status[i] != ""]

    kmf = KaplanMeierFitter()
    if pGroups is None:
        order = usable(range(2, len(time)))
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        kmf.fit(t, s)
        return kmf.plot(color='red')

    ax = None
    # groups[i] holds the index of the last group that contains sample i;
    # "" marks a sample that belongs to no plotted group.
    groups = ["" for _ in time]
    for k, spec in enumerate(pGroups):
        order = usable(spec[2])
        if len(order) <= 0:
            continue
        for i in order:
            groups[i] = k
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        kmf.fit(t, s, label=spec[0])
        if ax is None:
            ax = kmf.plot(color=spec[1], ci_show=False, show_censors=True)
        else:
            ax = kmf.plot(ax=ax, color=spec[1], ci_show=False, show_censors=True)

    order = [i for i, g in enumerate(groups) if g != ""]
    if len(order) > 0:
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        g = [int(groups[i]) for i in order]
        from lifelines.statistics import multivariate_logrank_test
        from matplotlib.legend import Legend
        res = multivariate_logrank_test(t, g, s)
        # An entry-less legend is used purely to display the p-value.
        leg = Legend(ax, [], [], title="p = %.2g" % res.p_value,
                     loc='lower left', frameon=False)
        ax.add_artist(leg)
    return ax
import pandas as pd
from lifelines import KaplanMeierFitter
from matplotlib import pyplot as plt

# Toy data set: lifetime in days of each lead and whether it ended in a sale.
time_day_life = [150, 130, 300, 100, 80, 60, 270, 150, 82, 50]
tag_sale = [1, 0, 0, 1, 0, 1, 0, 1, 0, 1]
df = pd.DataFrame({'time_day_life': time_day_life, 'tag_sale': tag_sale})
# sort_values returns a new frame; the result was previously discarded.
df = df.sort_values('time_day_life', ascending=True)

print('Descriptive')
print(df.time_day_life.mean())
print('')
print(df.groupby('tag_sale').agg('mean'))
# The mean lifetime of sold and unsold leads is not comparable with the
# overall average, which motivates the survival estimate below.

# Kaplan-Meier estimate
kmf = KaplanMeierFitter()
kmf.fit(durations=df.time_day_life, event_observed=df.tag_sale)
kmf.survival_function_

# Plot survival analysis
kmf.plot(label='Kaplan-Meier', figsize=(12, 12), show_censors=True, at_risk_counts=True)
plt.xlabel('tiempo de vida inmueble en dias', size=15)
plt.ylabel('Sobrevida - $P(T>t)$', size=15)
def qq_plot(model, **plot_kwargs):
    """
    Produces a quantile-quantile plot of the empirical CDF against
    the fitted parametric CDF. Large deviances away from the line y=x
    can invalidate a model (though we expect some natural deviance in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    plot_kwargs:
        kwargs for the plot; an ``ax`` entry, if present, is the axes drawn on.

    Returns
    --------
    ax: axis object

    Raises
    -------
    NotImplementedError
        If the model was fitted with interval censoring.

    Examples
    ---------
    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)
    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    set_kwargs_ax(plot_kwargs)
    ax = plot_kwargs.pop("ax")

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    # Non-parametric reference fit, using the same censoring as the model.
    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, label=COL_EMP)
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, label=COL_EMP)
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError()

    empirical_cdf = kmf.cumulative_density_
    q = np.unique(empirical_cdf.values[:, 0])

    quantiles = qth_survival_times(q, empirical_cdf, cdf=True)
    quantiles[COL_THEO] = dist_object.ppf(q)
    # Infinite and zero quantiles are dropped before plotting.
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    empirical = quantiles[COL_EMP]
    upper, lower = empirical.max(), empirical.min()

    quantiles.plot.scatter(COL_THEO, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)
    # Reference line y = x over the empirical range.
    ax.plot([lower, upper], [lower, upper], c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)
    return ax
def CoxRegressionModel(stimes, N, ap_s, ap_s2, ap_s3, ap_s5, ebb_, eap_, ebs, aps_0, mu, stime, Np):
    """Fit a Cox model on the pooled data and overlay it on a KM curve.

    A survival time above 62831 is treated as censored (event 0); every other
    time is an observed event (event 1).  A Cox proportional-hazards model
    with formula ``aps + I(aps**3)`` is fitted on the pooled data frame.  Its
    partial effect at ``aps = round(aps_0, 3)`` and its baseline survival are
    drawn on a log-x axes together with the Kaplan-Meier curve returned by
    ``PlottingLL.PlottingLL``.

    Parameters (array sizes as stated in the original author's notes):
        stimes: pooled survival times (1050 values).
        N: expected length of ``stimes``.  No longer used for allocation; the
            indicator is sized from ``stimes`` itself.
        ap_s, ap_s2, ap_s3, ap_s5, ebb_, eap_: covariate arrays, stored as
            the columns ``aps``, ``aps2``, ``aps3``, ``aps5``, ``eb`` and
            ``eap``.
        ebs, aps_0, mu, stime, Np: forwarded to ``PlottingLL.PlottingLL``;
            ``ebs``, ``aps_0`` and ``mu`` also appear in the title.

    Output:
        Draws on a new figure.  Returns nothing.
    """
    # ---- event observation ------------------------------------------------
    censoring_time = 62831
    # "not greater than" (rather than "<=") keeps NaN times flagged as
    # events, exactly as the former element-wise loop did.
    E = (~(np.asarray(stimes) > censoring_time)).astype(int)

    # ---- data frame ---------------------------------------------------------
    df = pd.DataFrame(data={
        'T': stimes,
        'E': E,
        'aps': ap_s,
        'aps2': ap_s2,
        'aps3': ap_s3,
        'aps5': ap_s5,
        'eap': eap_,
        'eb': ebb_,
    })

    # ---- plots --------------------------------------------------------------
    fig, axes = plt.subplots()
    axes.set_xscale('log')
    axes.set_ylabel("S(t)")

    KT, KE, Kdf = PlottingLL.PlottingLL(ebs, aps_0, mu, stime, Np)
    kmf = KaplanMeierFitter().fit(KT, KE, label='KaplanMeierFitter')
    kmf.plot_survival_function(ax=axes)

    cph = CoxPHFitter()
    cph.fit(df, duration_col='T', event_col='E', formula="aps + I(aps**3)")
    # plot_partial_effects_on_outcome requires the covariate(s) and value(s)
    # to vary; calling it without them raises TypeError.
    cph.plot_partial_effects_on_outcome(covariates=['aps'],
                                        values=[round(aps_0, 3)],
                                        plot_baseline=False,
                                        ax=axes,
                                        cmap="coolwarm")
    # The colour index used to come from the leaked loop variable of the
    # indicator loop (its final value, len(stimes) - 1); it is computed
    # explicitly here so the colour is unchanged.
    baseline_color = f"C{len(E) - 1}"
    cph.baseline_survival_.plot(ax=axes, ls=":", color=baseline_color)

    plt.title('Formula(aps vs. aps3 (1050 values)): (eb,ap,mu)={}'.format(
        (round(ebs, 3), round(aps_0, 3), mu)), fontsize=12)
import pandas as pd
from lifelines import KaplanMeierFitter
from lifelines.utils import datetimes_to_durations
from matplotlib import pyplot as plt

# Load the durations, rescale them by 86400 (presumably seconds -> days;
# confirm against the CSV) and keep only rows below 2500.
data = pd.read_csv('durations.csv')
data['Duration'] = data['Duration'] / (60 * 60 * 24)
data = data[data['Duration'] < 2500]

company1 = (data['Company'] == 'sulake')
company2 = (data['Company'] == 'paypal')
company3 = (data['Company'] == 'alibaba')

# One curve per company on shared axes.  Each fit gets an explicit label;
# without it all three curves share the default legend entry and cannot be
# told apart.
kmf = KaplanMeierFitter()
ax = None
for name, mask in (('sulake', company1), ('paypal', company2), ('alibaba', company3)):
    kmf.fit(data[mask]['Duration'], data[mask]['Observed'], label=name)
    ax = kmf.plot() if ax is None else kmf.plot(ax=ax)
plt.show()
# Collect duration, event flag and gender for every user document.
duration = []
observed = []
group = []
for elem in after_users.find():
    # presumably seconds -> days; TODO confirm the stored unit
    duration.append(elem['duration'] / 86400)
    observed.append(elem['observed'])
    group.append(elem['gender'])

dura_obj = array(duration)
obs_obj = array(observed)
group_obj = array(group)
# Two DataFrame(...) objects used to be built and immediately discarded here;
# they had no effect and were removed.

# Gender codes as used by the masks below: 1 = male, 2 = female, 0 = other.
male = group_obj == 1
female = group_obj == 2
other = group_obj == 0

# One fitter, refitted per subset, all drawn on the same axes.
kmf = KaplanMeierFitter()
kmf.fit(dura_obj[male], obs_obj[male], label='male')
ax = kmf.plot()
kmf.fit(dura_obj[female], obs_obj[female], label='female')
kmf.plot(ax=ax)
kmf.fit(dura_obj, obs_obj, label='both')
kmf.plot(ax=ax)

ax.get_figure().savefig('maleAndFemale_both_17day')
def __init__(self, db, male=False, female=False, other=False, both=True):
    """Load all users from ``db`` and save their Kaplan-Meier curves.

    Parameters:
        db: collection-like object whose ``find()`` yields documents with the
            keys ``duration``, ``observed`` and ``gender``.
        male, female, other: when exactly ``True``, also draw the curve of
            the users with gender code 1, 2 or 0 respectively.
        both: stored on the instance.
            NOTE(review): the pooled 'both' curve is drawn regardless of this
            flag -- confirm whether ``both=False`` should suppress it.

    Output:
        Saves the figure as ``maleAndFemale`` in the working directory.
    """
    self.db = db
    self.male = male
    self.female = female
    self.other = other
    self.both = both

    duration = []
    observed = []
    group = []
    for elem in self.db.find():
        # presumably seconds -> days; TODO confirm the stored unit
        duration.append(elem['duration'] / 86400)
        observed.append(elem['observed'])
        group.append(elem['gender'])

    dura_obj = array(duration)
    obs_obj = array(observed)
    group_obj = array(group)
    # Two DataFrame(...) objects used to be built and discarded here; they
    # had no effect and were removed.

    kmf = KaplanMeierFitter()
    kmf.fit(dura_obj, obs_obj, label='both')
    ax = kmf.plot()

    # (flag, gender code, legend label), drawn in the original order.
    optional_curves = (
        (self.male, 1, 'male'),
        (self.female, 2, 'female'),
        (self.other, 0, 'other'),
    )
    for enabled, code, label in optional_curves:
        if enabled is True:
            mask = group_obj == code
            kmf.fit(dura_obj[mask], obs_obj[mask], label=label)
            kmf.plot(ax=ax)

    ax.get_figure().savefig('maleAndFemale')
def data_fit(self):
    """Build subscription durations from the event log and plot their KM curve.

    For every user with at least one ``subscribe`` event, the duration runs
    from the user's earliest event to either the user's latest event (when an
    ``unsubscribe`` event exists, observed = 1) or the current time
    (observed = 0).  Durations are divided by 86400 before fitting.

    Output:
        Prints the number of distinct users and saves the pooled curve as
        ``maleAndFemale``.  Returns nothing.
    """
    self.hyd_events.create_index('FromUserName')
    self.hyd_events.create_index('Event')
    self.hyd_users.create_index('openid')

    user_list = list(set(
        elem['FromUserName']
        for elem in self.hyd_events.find({'Event': 'subscribe'})))
    # Parenthesised single argument: prints identically on Python 2 and 3.
    print(len(user_list))
    now_time = time.time()

    duration = []
    observed = []
    group = []
    for user in user_list:
        # Timestamps of THIS user only.  The list used to be created once
        # outside the loop, so min/max were taken over the events of all
        # users processed so far.
        time_block = [item['CreateTime']
                      for item in self.hyd_events.find({'FromUserName': user})]
        sub_time = int(min(time_block))

        unsubscribed = self.hyd_events.find_one(
            {'$and': [{'FromUserName': user}, {'Event': 'unsubscribe'}]})
        if unsubscribed is None:
            unsub_time = int(now_time)
            was_observed = 0
        else:
            unsub_time = int(max(time_block))
            was_observed = 1

        # If the user record is missing, find_one returns None and the lookup
        # raises TypeError: gender 0 then means "unknown".
        profile = self.hyd_users.find_one({'openid': user})
        try:
            gender = profile['sex']
        except TypeError:
            gender = 0

        duration.append(abs(unsub_time - sub_time) / 86400)
        observed.append(was_observed)
        # NOTE(review): gender is collected but not used by the plot below.
        group.append(gender)

    dura_obj = array(duration)
    obs_obj = array(observed)

    kmf = KaplanMeierFitter()
    kmf.fit(dura_obj, obs_obj, label='both')
    ax = kmf.plot()
    ax.get_figure().savefig('maleAndFemale')
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Produces a quantile-quantile plot of the empirical CDF against
    the fitted parametric CDF. Large deviances away from the line y=x
    can invalidate a model (though we expect some natural deviance in the tails).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax:
        The axes to draw on; defaults to the current axes.
    plot_kwargs:
        kwargs for the plot.

    Returns
    --------
    ax:
        The axes which was used.

    Examples
    ---------
    .. code:: python

        from lifelines import *
        from lifelines.plotting import qq_plot
        from lifelines.datasets import load_rossi
        df = load_rossi()
        wf = WeibullFitter().fit(df['week'], df['arrest'])
        qq_plot(wf)

    Notes
    ------
    The interval censoring case uses the mean between the upper and lower bounds.
    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    # Non-parametric reference fit with the same censoring as the model.
    if CensoringType.is_left_censoring(model):
        kmf = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed,
            label=COL_EMP, weights=model.weights, entry=model.entry)
        sf = kmf.survival_function_[COL_EMP]
        cdf = kmf.cumulative_density_[COL_EMP]
    elif CensoringType.is_right_censoring(model):
        kmf = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed,
            label=COL_EMP, weights=model.weights, entry=model.entry)
        sf = kmf.survival_function_[COL_EMP]
        cdf = kmf.cumulative_density_[COL_EMP]
    elif CensoringType.is_interval_censoring(model):
        kmf = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound, model.upper_bound,
            label=COL_EMP, weights=model.weights, entry=model.entry)
        # Row-wise mean of the survival columns; CDF from the "_lower" column.
        sf = kmf.survival_function_.mean(1)
        cdf = kmf.cumulative_density_[COL_EMP + "_lower"]

    q = np.unique(cdf.values)

    quantiles = qth_survival_times(1 - q, sf)
    quantiles[COL_THEO] = dist_object.ppf(q)
    # Infinite and zero quantiles are dropped before plotting.
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    empirical = quantiles[COL_EMP]
    upper, lower = empirical.max(), empirical.min()

    quantiles.plot.scatter(COL_THEO, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)
    # Reference line y = x over the empirical range.
    ax.plot([lower, upper], [lower, upper], c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)
    return ax
for row in df['vital_status']: if row not in ['Alive', 'Dead']: vital_status.append(None) else: vital_status.append(row) df['vital_status'] = vital_status df['SARS'] = df['SARS'].dropna() df = df[pd.notnull(df['duration'])] df = df[pd.notnull(df['SARS'])] df = df[pd.notnull(df['vital_status'])] lst = df['SARS'].tolist() q1 = np.percentile(lst, 33.33) q2 = np.percentile(lst, 66.66) df1 = df[df['SARS'] <= q1] df2 = df[(df['SARS'] > q1) & (df['SARS'] <= q2)] df3 = df[df['SARS'] > q2] plot_km(df, ax, '', file, "q1") ax.get_figure().savefig(result_dir + file + '_kmplot(samples=' + str(len(df.index)) + ').png') if __name__ == '__main__': for (dirpaths, dirnames, filenames) in os.walk(src_dir): for file in filenames: kmf = KaplanMeierFitter() ax = plt.subplot(111) print(file) df = pd.read_table(src_dir + file, sep=',', header=1) process_df(df, file, ax) plt.clf()
def test_kmf_with_inverted_axis(self, block, kmf):
    """Two exponential samples drawn on one axes with an inverted y axis.

    The ``kmf`` argument is not used; fresh fitters are created for both
    samples.  ``block`` is forwarded to ``plt.show``.
    """
    # First sample: default scale, drawn with at-risk counts.
    sample = np.random.exponential(size=100)
    first = KaplanMeierFitter()
    first.fit(sample, label="t2")
    ax = first.plot(invert_y_axis=True, at_risk_counts=True)

    # Second sample: scale 3, drawn on the same axes.
    sample = np.random.exponential(3, size=100)
    second = KaplanMeierFitter()
    second.fit(sample, label="t1")
    second.plot(invert_y_axis=True, ax=ax, ci_force_lines=False)

    self.plt.title("test_kmf_with_inverted_axis")
    self.plt.show(block=block)
# Kaplan-Meier walk-through on the Waltons example data set.
from lifelines import KaplanMeierFitter
from lifelines.datasets import load_waltons

# Load the data frame and show its first rows.
df = load_waltons()
print(df.head())

# Durations and event indicators as separate series.
T, E = df['T'], df['E']

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)  # more succinctly: kmf.fit(T, E)

kmf.survival_function_
kmf.median_
kmf.plot()

# Multiple groups: refit on everything that is not 'miR-137'.
groups = df['group']
ix = (groups == 'miR-137')
kmf.fit(T[~ix], E[~ix], label='control')
# Griffin Calme
# Group 15, week 8 activity
# Kaplan Meier survival curve
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()

# DataFrame.from_csv was removed from pandas; this is the documented
# equivalent call (first column as index, dates parsed).
df = pd.read_csv('wk8gp15KapMeier.csv', index_col=0, parse_dates=True)
print(df)

groups = df['Group']
ix = (groups == 2)

T = df['SERIAL TIME (years)']
E = df['STATUS']

# Group 1 is every row whose 'Group' is not 2.
kmf.fit(T[~ix], E[~ix], label='1')
ax = kmf.plot()

kmf.fit(T[ix], E[ix], label='2')
kmf.plot(ax=ax, ci_force_lines=False)
plt.show()
# Parenthesised single argument: prints identically on Python 2 and 3.
print("[***] K-M Estimator")

EPS_LIST = [0.05, 0.1, 0.2, 0.4, 0.8, 1.6]
bins0 = config.BIN0
bins1 = config.BIN1

df = pd.read_stata("wichert.dta")
# Times are normalised by the largest time.  zip() is materialised because
# len() is taken below and Python 3 returns an iterator.
data_ = list(zip(df.time / max(df.time), df.event.astype(int)))
# Keep only records whose normalised time is at least config.GAMMA.
data = [(a, b) for (a, b) in data_ if a >= config.GAMMA]
print("[*] Remove #%d outliers" % (len(data_) - len(data)))
N = len(df)  # number of data points

# Reference Kaplan-Meier fit on the retained records.
kmf = KaplanMeierFitter()
(T, E) = zip(*data)
kmf.fit(T, event_observed=E)

true_value = kmf.survival_function_.values
def multivariate_logrank_test(
        event_durations, groups, event_observed=None, t_0=-1, weightings=None, **kwargs) -> StatisticalResult:  # pylint: disable=too-many-locals
    r"""
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
    be equal when n=2):

    .. math::
        \begin{align}
        & H_0: h_1(t) = h_2(t) = h_3(t) = ... = h_n(t) \\
        & H_A: \text{there exist at least one group that differs from the other.}
        \end{align}

    Parameters
    ----------
    event_durations: iterable
        a (n,) list-like representing the (possibly partial) durations of all individuals
    groups: iterable
        a (n,) list-like of unique group labels for each individual.
    event_observed: iterable, optional
        a (n,) list-like of event_observed events: 1 if observed death, 0 if censored. Defaults to all observed.
    t_0: float, optional (default=-1)
        the period under observation, -1 for all time.
    weightings: str, optional
        apply a weighted logrank test: options are "wilcoxon" for Wilcoxon (also known as Breslow), "tarone-ware"
        for Tarone-Ware, "peto" for Peto test and "fleming-harrington" for Fleming-Harrington test.
        These are useful for testing for early or late differences in the survival curve. For the Fleming-Harrington
        test, keyword arguments p and q must also be provided with non-negative values.

        Weightings are applied at the ith ordered failure time, :math:`t_{i}`, according to:
            Wilcoxon: :math:`n_i`
            Tarone-Ware: :math:`\sqrt{n_i}`
            Peto: :math:`\bar{S}(t_i)`
            Fleming-Harrington: :math:`\hat{S}(t_i)^p \times (1 - \hat{S}(t_i))^q`

            where :math:`n_i` is the number at risk just prior to time :math:`t_{i}`, :math:`\bar{S}(t_i)` is
            Peto-Peto's modified survival estimate and :math:`\hat{S}(t_i)` is the left-continuous
            Kaplan-Meier survival estimate at time :math:`t_{i}`.
    kwargs:
        add keywords and meta-data to the experiment summary.

    Returns
    -------
    StatisticalResult
        a StatisticalResult object with properties ``p_value``, ``summary``, ``test_statistic``, ``print_summary``

    Raises
    ------
    ValueError
        if ``weightings`` is not one of the supported names, or if it is
        "fleming-harrington" and ``p`` or ``q`` is missing or negative.

    Examples
    --------
    .. code:: python

        df = pd.DataFrame({
            'durations': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
            'events': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
            'groups': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        })
        result = multivariate_logrank_test(df['durations'], df['groups'], df['events'])
        result.test_statistic
        result.p_value
        result.print_summary()

        # numpy example
        G = [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        T = [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7]
        E = [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]
        result = multivariate_logrank_test(T, G, E)
        result.test_statistic

    See Also
    --------
    pairwise_logrank_test
    logrank_test
    """
    kwargs.setdefault("test_name", "multivariate_logrank_test")

    event_durations, groups = np.asarray(event_durations), np.asarray(groups)
    if event_observed is None:
        # No indicator given: every duration is treated as an observed event.
        event_observed = np.ones((event_durations.shape[0], 1))
    else:
        event_observed = np.asarray(event_observed)

    n = np.max(event_durations.shape)
    assert n == np.max(event_durations.shape) == np.max(
        event_observed.shape), "inputs must be of the same length."
    # Flatten all three inputs to 1-D Series of length n.
    groups, event_durations, event_observed = map(
        lambda x: pd.Series(np.asarray(x).reshape(n)),
        [groups, event_durations, event_observed])

    unique_groups, rm, obs, _ = group_survival_table_from_events(
        groups, event_durations, event_observed, limit=t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    n_ij = rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0)
    d_i = obs.sum(1)
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    ev_i = n_ij.mul(d_i / n_i, axis="index")

    # compute weightings for log-rank alternatives
    if weightings is None:
        w_i = np.ones(d_i.shape[0])
    elif weightings == "wilcoxon":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Wilcoxon")
        w_i = n_i
    elif weightings == "tarone-ware":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Tarone-Ware")
        w_i = np.sqrt(n_i)
    elif weightings == "peto":
        kwargs["test_name"] = kwargs["test_name"].replace("logrank", "Peto")
        w_i = np.cumprod(1.0 - (ev_i.sum(1)) / (n_i + 1))  # Peto-Peto's modified survival estimates.
    elif weightings == "fleming-harrington":
        # p and q are read from kwargs and must both be present and >= 0.
        if "p" in kwargs:
            p = kwargs["p"]
            if p < 0:
                raise ValueError("p must be non-negative.")
        else:
            raise ValueError(
                "Must provide keyword argument p for Flemington-Harrington test statistic"
            )
        if "q" in kwargs:
            q = kwargs["q"]
            if q < 0:
                raise ValueError("q must be non-negative.")
        else:
            raise ValueError(
                "Must provide keyword argument q for Flemington-Harrington test statistic"
            )
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Flemington-Harrington")
        kmf = KaplanMeierFitter().fit(event_durations,
                                      event_observed=event_observed)
        s = kmf.survival_function_.to_numpy().flatten(
        )[:-1]  # Left-continuous Kaplan-Meier survival estimate.
        w_i = np.power(s, p) * np.power(1.0 - s, q)
    else:
        raise ValueError("Invalid value for weightings.")

    # apply weights to observed and expected
    N_j = obs.mul(w_i, axis=0).sum(0).values
    ev = ev_i.mul(w_i, axis=0).sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum(
    )) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    factor = (((n_i - d_i) / (n_i - 1)).replace([np.inf, np.nan], 1)) * d_i / n_i**2
    # Adds one extra column "_" (the total at risk) to n_ij, so V below has
    # one more row/column than there are groups.
    n_ij["_"] = n_i.values
    V_ = (n_ij.mul(w_i, axis=0)).mul(np.sqrt(factor), axis="index").fillna(0)  # weighted V_
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    V[ix, ix] = V[ix, ix] - V[-1, ix]
    # Drop the helper row/column that came from the "_" column.
    V = V[:-1, :-1]

    # take the first n-1 groups
    # (the second [:-1, :-1] slice drops the last group so that V matches
    # Z_j.iloc[:-1])
    U = Z_j.iloc[:-1] @ np.linalg.pinv(
        V[:-1, :-1]) @ Z_j.iloc[:-1]  # Z.T*inv(V)*Z

    # compute the p-values and tests
    p_value = _chisq_test_p_value(U, n_groups - 1)
    return StatisticalResult(p_value,
                             U,
                             t_0=t_0,
                             null_distribution="chi squared",
                             degrees_of_freedom=n_groups - 1,
                             **kwargs)
## Load the churn data set.
fileName = 'Telco-Customer-Churn.csv'
input_df = pd.read_csv(fileName)

## Encode Churn as 1 for 'Yes' (the event) and 0 for anything else (censored).
input_df['Churn'] = input_df['Churn'].eq('Yes').astype(int)

## Durations and event indicators.
T, E = input_df['tenure'], input_df['Churn']

kmf = KaplanMeierFitter()

## Two cohorts are compared:
#   1. users who have not subscribed to Streaming TV
#   2. users who have subscribed to Streaming TV
groups = input_df['StreamingTV']
i1 = (groups == 'No')   ## boolean mask selecting the 1st cohort
i2 = (groups == 'Yes')  ## boolean mask selecting the 2nd cohort

## Fit and draw the 1st cohort.
kmf.fit(T[i1], E[i1], label='Not Subscribed StreamingTV')
a1 = kmf.plot()

## Fit the 2nd cohort.
kmf.fit(T[i2], E[i2], label='Subscribed StreamingTV')
gene_pos.append(int(i.split()[1])) num_cols = len(exp_matrix) mid = num_cols//2 for j in range(0, len(gene_pos)): pos = gene_pos[j] gene_exp = [exp_matrix[i][pos] for i in range(0,num_cols)] comb = [[gene_exp[i], tol[i]] for i in range(0, num_cols)] comb = sorted(comb, key=lambda x: x[0]) low = comb[:mid] high = comb[mid:] #l_exp = [low[i][0] for i in range(0,mid)] l_tol = [low[i][1] for i in range(0, len(low))] l_out = [True]*len(l_tol) #h_exp = [high[i][0] for i in range(0,mid)] h_tol = [high[i][1] for i in range(0, len(high))] h_out = [True]*len(h_tol) kp = KaplanMeierFitter() label_low = 'Lower 50% (n = ' + str(len(low)) + ')' label_high = 'Upper 50% (n = ' + str(len(high)) + ')' len_high = len(high) kp.fit(l_tol, l_out, label=label_low) ax = kp.plot() kp.fit(h_tol, h_out, label=label_high) graph = kp.plot(ax=ax) plt.title("Survival Function of %s" %gene_name[j]) graph.get_figure().savefig("%s_survival.png" %gene_name[j])
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

df = pd.read_csv('joined.csv.bz2', sep=',', compression='bz2', low_memory=False)

# Turn the textual 'term' column into integers by stripping the characters
# of ' months' from both ends of each value.
df['term'] = df['term'].map(lambda raw_term: int(raw_term.strip(' months')))

# Duration column 'T': first missed payment expressed as a fraction of the
# term; fully paid loans are assigned the value 1.
df['T'] = df['firstMissed'].div(df['term'])
fully_paid = df['loan_status'] == 'Fully Paid'
df.loc[fully_paid, 'T'] = 1

# The event indicator is the logical negation of the 'censored' column.
T = df['T']
E = ~df['censored']

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)  # more succinctly: kmf.fit(T, E)

# Bare attribute accesses: they only display something in an interactive session.
kmf.survival_function_
kmf.median_

kmf.plot()
plt.show()
def _calibration_curve_ipcw(out, e, t, a, group, eval_time, typ,
                            ret_bins=True, strat='quantile', n_bins=10):
  """Returns the IPCW-adjusted calibration curve for one protected group.

  The scores of the individuals in `group` are split into `n_bins` bins. For
  each bin the mean predicted score is compared with the observed fraction
  of individuals with `t > eval_time`, divided by the Kaplan-Meier estimate
  of the censoring distribution at `eval_time` (inverse probability of
  censoring weighting).

  Args:
    out: risk scores P(T>t) issued by a trained survival analysis model.
    e: a numpy vector of indicators specifying if an event (1) or censoring
      (0) occurred.
    t: a numpy vector of times at which the events or censoring occurred.
    a: a numpy vector of protected attributes.
    group: value of `a` selecting the demographic to evaluate calibration for.
    eval_time: float/int time at which calibration is evaluated. Must be the
      same time at which the risk scores were issued.
    typ: 'IPCWpop' fits the censoring estimator on the whole population; any
      other value fits it on the members of `group` only.
    ret_bins: whether the bin edges are included in the returned tuple.
    strat: how the bin edges are computed. One of:
      "quantile": edges at equally spaced quantiles of the scores.
      "uniform": equally wide bins between the min and max score.
    n_bins: int number of bins.

  Returns:
    (prob_true, prob_pred, outbins, ece) if `ret_bins` else
    (prob_true, prob_pred, ece). `prob_true` and `prob_pred` are lists with
    one entry per bin (NaN for an empty bin); `ece` is the expected
    calibration error, i.e. the per-bin |prob_pred - prob_true| weighted by
    the fraction of group members falling in the bin.

  Raises:
    ValueError: if `strat` is not "quantile" or "uniform".
  """
  if strat not in ('quantile', 'uniform'):
    raise ValueError("strat must be 'quantile' or 'uniform', got %r" % (strat,))

  in_group = (a == group)

  # Fitting on the flipped indicator (1 - e) estimates the censoring
  # distribution rather than the event distribution.
  if typ == 'IPCWpop':
    kmf = KaplanMeierFitter().fit(t, 1 - e)
  else:
    kmf = KaplanMeierFitter().fit(t[in_group], 1 - e[in_group])

  t = t[in_group]
  out = out[in_group]
  y = t > eval_time

  if strat == 'quantile':
    quantiles = [(1. / n_bins) * i for i in range(n_bins + 1)]
    outbins = np.quantile(out, quantiles)
  else:
    binlen = (out.max() - out.min()) / n_bins
    outbins = [out.min() + i * binlen for i in range(n_bins + 1)]

  # The IPCW denominator does not depend on the bin, so compute it once.
  censoring_survival = kmf.predict(eval_time)

  prob_true = []
  prob_pred = []
  ece = 0

  for n_bin in range(n_bins):
    binmin = outbins[n_bin]
    binmax = outbins[n_bin + 1]

    # Both edges are inclusive, so a score sitting exactly on an edge is
    # counted in the two neighbouring bins.
    scorebin = (out >= binmin) & (out <= binmax)
    n_in_bin = int(scorebin.sum())

    if n_in_bin == 0:
      # An empty bin has no defined means; record NaN and leave the ECE
      # untouched instead of turning it into NaN.
      prob_true.append(np.nan)
      prob_pred.append(np.nan)
      continue

    # Weight by bin occupancy. len(scorebin) would be the mask length,
    # which always equals len(out) and gives every bin weight 1.
    weight = float(n_in_bin) / len(out)

    y_ = y[scorebin] / censoring_survival
    prob_true.append(y_.mean())
    prob_pred.append(out[scorebin].mean())

    gap = abs(prob_pred[-1] - prob_true[-1])
    ece += weight * gap

  if ret_bins:
    return prob_true, prob_pred, outbins, ece
  return prob_true, prob_pred, ece
def plot_survival(unique_groups, grouped_data, analysis_type, censors, ci, showplot, stat_results, time='Months'):
    """Plot one Kaplan-Meier curve per group and append a percentile table to a text file.

    For every group a curve is drawn on a shared axes and one row is appended
    to 'Kaplan_<analysis_type>.txt' holding the result of
    qth_survival_times for q = 0.95, 0.90, ..., 0.00. The figure is saved as
    'Kaplan_<analysis_type>.png' and '.eps', then closed.

    Parameters:
        unique_groups: sequence of group keys; also used as legend labels.
        grouped_data: object with get_group(key) returning a frame that has
            'survival' and 'event' columns (presumably a pandas groupby).
        analysis_type: used in the output file names; the value 'survival'
            selects the 'Overall Survival' y-axis label, anything else
            'Relapse-Free Survival'.
        censors: passed to kmf.plot as show_censors.
        ci: passed to kmf.plot as ci_show.
        showplot: when equal to True the figure is shown before being closed.
        stat_results: object with a p_value attribute; only read when there
            are exactly two groups.
        time: 'months' or 'years' (case-insensitive) divides the 'survival'
            column by 365/12 or 365; any other value leaves it unchanged.

    Returns:
        None.
    """
    kmf = KaplanMeierFitter()
    fig, ax = plt.subplots()
    n_in_groups = []
    percentiles = range(95, -1, -5)
    time_unit = time.lower()

    # The context manager guarantees the report file is closed even when a
    # fit or plot call raises.
    with open('Kaplan_%s.txt' % (analysis_type), 'a') as f:
        f.write("\nPercent %s\n" % analysis_type)
        headers = "Group\t" + "".join(str(x) + "%\t" for x in percentiles)
        f.write("%s\n" % headers)

        for group in unique_groups:
            data = grouped_data.get_group(group)
            n_in_groups.append(len(data))

            # Float literals keep these true divisions under Python 2 as well.
            if time_unit == 'months':
                survival_time = data['survival'] / (365.0 / 12)
            elif time_unit == 'years':
                survival_time = data['survival'] / 365.0
            else:
                survival_time = data['survival']

            kmf.fit(survival_time, data['event'], label=group)

            row = "".join(
                str(qth_survival_times(x / 100.0, kmf.survival_function_)) + "\t"
                for x in percentiles)
            f.write("%s\t" % group)
            f.write(row)
            f.write("\n")

            kmf.plot(ax=ax, show_censors=censors, ci_show=ci, linewidth=2.5)

    # Make the graph pretty!
    labels = dict(horizontalalignment='center', verticalalignment='center',
                  fontname='Arial', fontsize=28)
    ax.grid(False)
    ax.set_ylim(0, 1.05)
    for side in ('left', 'right', 'top', 'bottom'):
        ax.spines[side].set_linewidth(2.5)
    ax.yaxis.set_tick_params(width=2.5)
    ax.xaxis.set_tick_params(width=2.5)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    # plt.title('%s' % (analysis_type), labels, y = 1.05)
    plt.xlabel('%s Post-Diagnosis' % time, labels, labelpad=20)
    if analysis_type == 'survival':
        plt.ylabel('Overall Survival', labels, labelpad=20)
    else:
        plt.ylabel('Relapse-Free Survival', labels, labelpad=20)
    plt.xticks(fontname='Arial', fontsize=24)
    plt.yticks(fontname='Arial', fontsize=24)
    ax.tick_params(axis='y', pad=10)
    ax.tick_params(axis='x', pad=10)

    # Rewrite every legend entry as '<group> n=<size>'.
    legend = ax.legend(frameon=False, loc=3)
    for counter, label in enumerate(legend.get_texts()):
        label.set_fontsize(20)
        label.set_text('%s n=%d' % (unique_groups[counter], n_in_groups[counter]))

    # The p-value annotation is only drawn for a two-group comparison.
    if len(unique_groups) == 2:
        plt.text(0.95, 0.05, 'p = %.2g' % (stat_results.p_value),
                 fontname='Arial', fontsize=20, ha='right',
                 transform=ax.transAxes)

    plt.tight_layout()
    fig.savefig('Kaplan_%s.png' % analysis_type, transparent=True)
    fig.savefig('Kaplan_%s.eps' % analysis_type, transparent=True)
    if showplot == True:
        plt.show()
    plt.close(fig)
# Split the clinical table by raw latitude score: exactly 0, exactly 1, and
# 2 or more.
lat_0 = clin.loc[clin['latitude_raw'] == 0]
lat_1 = clin.loc[clin['latitude_raw'] == 1]
lat_23 = clin.loc[clin['latitude_raw'] >= 2]

# =============================================================================
# Prepare plots
# =============================================================================

fig1, ax = plt.subplots(nrows=1, figsize=(2.5, 2.5))

# =============================================================================
# Plot the latitude 0 group on kmf1
# =============================================================================

kmf1 = KaplanMeierFitter()
color = 'grey'

# Legend label carries the group size.
defective_patient_number = str(len(lat_0))
defective_label = 'Lat. 0 (n=%s)' % defective_patient_number

# Durations rounded to 3 decimals; status cast to 32-bit integers.
T = lat_0['Time_to_CRPC'].round(3)
C = lat_0['crpc_status'].astype(np.int32)

kmf1.fit(T, event_observed=C, label=defective_label)
kmf1.plot(ax=ax, show_censors=True, ci_show=False, color=color, lw=1)
# lat_0_median=kmf1.median

# =============================================================================
# Estimator for the next group
# =============================================================================

kmf2 = KaplanMeierFitter()
# Notebook-style walkthrough: fit a Kaplan-Meier estimator on a five-point toy
# data set, print the results, then start setting up a second example.
# NOTE(review): this excerpt uses Python 2 print statements and the IPython
# `%pylab` magic, so it is not runnable as a plain Python 3 module.
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import scipy
import lifelines
# NOTE(review): `figsize` is not imported anywhere in this excerpt; presumably
# it comes from an earlier `%pylab` session — confirm.
figsize(12.5,5)
# Print arrays with two decimals and without scientific notation.
np.set_printoptions(precision=2, suppress=True)
from lifelines import KaplanMeierFitter
# Toy data: five durations with a matching boolean flag per observation
# (passed below as event_observed).
survival_times = np.array([0.,3.,4.5, 10., 1.])
events = np.array([False, True, True, False, True])
kmf = KaplanMeierFitter()
kmf.fit(survival_times, event_observed=events)
print kmf.survival_function_
print kmf.median_
kmf.plot()
## example 2
import matplotlib.pylab as plt
%pylab
figsize(12.5,6)
from lifelines.plotting import plot_lifetimes
from numpy.random import uniform, exponential
N = 25
# Loading the the survival un-employment data Patient_data = pd.read_csv("C:/Users/hp/Desktop/survival assi/Patient.csv") Patient_data.head() Patient_data.describe() Patient_data["Followup"].describe() # Followup is referring to time T = Patient_data.Followup # Importing the KaplanMeierFitter model to fit the survival analysis from lifelines import KaplanMeierFitter # Initiating the KaplanMeierFitter model kmf = KaplanMeierFitter() # Fitting KaplanMeierFitter model on Time and Events for death kmf.fit(T, event_observed=Patient_data.Eventtype) # Time-line estimations plot kmf.plot() Patient_data.PatientID.value_counts() # Applying KaplanMeierFitter model on Time and Events kmf.fit( T[Patient_data.PatientID == [ 'Joe', 'Jess', 'Ann', 'Mary', 'Frank', 'Steven', 'Andy', 'Elizabeth', 'Joe', 'Kate' ]], Patient_data.Eventtype[Patient_data.PatientID == [ 'Joe', 'Jess', 'Ann', 'Mary', 'Frank', 'Steven', 'Andy', 'Elizabeth',
def _km_flags(pa, names):
    """Map each named on/off setting in `pa` to a bool ('off' and '.off' mean False)."""
    return {arg: pa[arg] not in ("off", ".off") for arg in names}


def _km_table(km):
    """Join survival function, confidence interval and event table of a fitted estimator, adding a 'time' column."""
    table = pd.merge(km.survival_function_, km.confidence_interval_,
                     how='left', left_index=True, right_index=True)
    table = pd.merge(table, km.event_table,
                     how='left', left_index=True, right_index=True)
    table['time'] = table.index.tolist()
    return table.reset_index(drop=True)


def _style_km_axes(pl, pa, pa_):
    """Apply the spine, tick, grid, limit and title settings from `pa`/`pa_` to the axes `pl`."""
    pl.spines['right'].set_visible(pa_["right_axis"])
    pl.spines['top'].set_visible(pa_["upper_axis"])
    pl.spines['left'].set_visible(pa_["left_axis"])
    pl.spines['bottom'].set_visible(pa_["lower_axis"])

    pl.spines['right'].set_linewidth(pa["axis_line_width"])
    pl.spines['left'].set_linewidth(pa["axis_line_width"])
    pl.spines['top'].set_linewidth(pa["axis_line_width"])
    pl.spines['bottom'].set_linewidth(pa["axis_line_width"])

    pl.tick_params(axis="both",
                   direction=pa["ticks_direction_value"],
                   length=float(pa["ticks_length"]))
    pl.tick_params(axis='x', which='both',
                   bottom=pa_["tick_lower_axis"], top=pa_["tick_upper_axis"],
                   labelbottom=pa_["lower_axis"],
                   labelrotation=float(pa["xticks_rotation"]),
                   labelsize=float(pa["xticks_fontsize"]))
    pl.tick_params(axis='y', which='both',
                   left=pa_["tick_left_axis"], right=pa_["tick_right_axis"],
                   labelleft=pa_["left_axis"],
                   labelrotation=float(pa["yticks_rotation"]),
                   labelsize=float(pa["yticks_fontsize"]))

    if str(pa["grid_value"]) != "None":
        pl.grid(True, which='both', axis=pa["grid_value"],
                color=pa_["grid_color_write"],
                linewidth=float(pa["grid_linewidth"]))

    # Axis limits are only applied when both ends are given.
    if str(pa["x_lower_limit"]) != "" and str(pa["x_upper_limit"]) != "":
        pl.set_xlim(float(pa["x_lower_limit"]), float(pa["x_upper_limit"]))
    if str(pa["y_lower_limit"]) != "" and str(pa["y_upper_limit"]) != "":
        pl.set_ylim(float(pa["y_lower_limit"]), float(pa["y_upper_limit"]))

    pl.set_title(pa["title"], fontdict={'fontsize': float(pa['titles'])})
    pl.set_xlabel(pa["xlabel"], fontdict={'fontsize': float(pa['xlabels'])})
    pl.set_ylabel(pa["ylabel"], fontdict={'fontsize': float(pa['ylabels'])})


def make_figure(df, pa):
    """Fit and plot Kaplan-Meier curves as configured by the settings dict `pa`.

    Parameters:
        df: input DataFrame; it is copied, not modified.
        pa: dict of settings. pa["xvals"] and pa["yvals"] name the duration
            and event columns; pa["groups_value"] is either the string
            "None" or the name of a grouping column; the remaining keys
            hold styling options.

    Returns:
        Without grouping: (df, pl), where df is the Kaplan-Meier table
        (time, event-table counts, estimate and 0.95 confidence bounds) and
        pl is the axes returned by the estimator's plot().
        With grouping: (df, pl, cph_coeff, cph_stats), where df holds the
        per-group tables merged on 'time', cph_coeff is the Cox model
        summary and cph_stats a two-column table of model statistics.
    """
    df_ls = df.copy()

    durations = df_ls[pa["xvals"]]
    event_observed = df_ls[pa["yvals"]]

    km = KaplanMeierFitter()  # one estimator, refitted for each curve
    pl = None

    # Created before any plotting; presumably km.plot() draws on the current
    # figure, so this call fixes the figure size — confirm.
    fig = plt.figure(frameon=False,
                     figsize=(float(pa["fig_width"]), float(pa["fig_height"])))

    if str(pa["groups_value"]) == "None":
        km.fit(durations, event_observed, label='Kaplan Meier Estimate')

        df = _km_table(km)
        df = df[[
            "time", "at_risk", "removed", "observed", "censored", "entrance",
            "Kaplan Meier Estimate", "Kaplan Meier Estimate_lower_0.95",
            "Kaplan Meier Estimate_upper_0.95"
        ]]

        pa_ = _km_flags(pa, [
            "Conf_Interval", "show_censors", "ci_legend", "ci_force_lines",
            "left_axis", "right_axis", "upper_axis", "lower_axis",
            "tick_left_axis", "tick_right_axis", "tick_upper_axis",
            "tick_lower_axis"
        ])

        # A non-empty "*_write"/"*_text" entry overrides the corresponding
        # selected value.
        pa_["marker_fc"] = pa["markerc_write"] if str(pa["markerc_write"]) != "" else pa["markerc"]
        pa_["marker_ec"] = pa["edgecolor_write"] if str(pa["edgecolor_write"]) != "" else pa["edgecolor"]
        pa_["grid_color_write"] = pa["grid_color_text"] if str(pa["grid_color_text"]) != "" else pa["grid_color_value"]

        pl = km.plot(show_censors=pa_["show_censors"],
                     censor_styles={"marker": marker_dict[pa["censor_marker_value"]],
                                    "markersize": float(pa["censor_marker_size_val"]),
                                    "markeredgecolor": pa_["marker_ec"],
                                    "markerfacecolor": pa_["marker_fc"],
                                    "alpha": float(pa["marker_alpha"])},
                     ci_alpha=float(pa["ci_alpha"]),
                     ci_force_lines=pa_["ci_force_lines"],
                     ci_show=pa_["Conf_Interval"],
                     ci_legend=pa_["ci_legend"],
                     linestyle=pa["linestyle_value"],
                     linewidth=float(pa["linewidth"]),
                     color=pa["line_color_value"])

        _style_km_axes(pl, pa, pa_)

        return df, pl

    # ---- grouped analysis ---------------------------------------------------
    group_col = str(pa["groups_value"])

    # Expand the count-style input into one row per individual. The rows are
    # collected in a list and turned into a frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    # NOTE(review): a row with a positive event count never contributes its
    # censored count (if/elif) — confirm this is intended.
    long_rows = []
    for row in range(0, len(df_ls)):
        if int(df_ls.loc[row, pa["yvals"]]) >= 1:
            dead = int(df_ls.loc[row, pa["yvals"]])
            for _ in range(0, dead):
                long_rows.append({
                    'day': int(df_ls.loc[row, pa["xvals"]]),
                    'status': 1,
                    group_col: str(df_ls.loc[row, pa["groups_value"]])
                })
        elif int(df_ls.loc[row, pa["censors_val"]]) >= 1:
            censored = int(df_ls.loc[row, pa["censors_val"]])
            for _ in range(0, censored):
                long_rows.append({
                    'day': int(df_ls.loc[row, pa["xvals"]]),
                    'status': 0,
                    group_col: str(df_ls.loc[row, pa["groups_value"]])
                })
    df_long = pd.DataFrame(long_rows, columns=['day', 'status', group_col])

    df_dummy = pd.get_dummies(df_long, drop_first=True,
                              columns=[pa["groups_value"]])

    # NOTE(review): the two samples handed to logrank_test are the rows with
    # status 1 and the rows with status 0, not the groups — confirm.
    results = logrank_test(
        df_dummy.loc[df_dummy['status'] == 1, 'day'].tolist(),
        df_dummy.loc[df_dummy['status'] == 0, 'day'].tolist(),
        df_dummy.loc[df_dummy['status'] == 1, 'status'].tolist(),
        df_dummy.loc[df_dummy['status'] == 0, 'status'].tolist(),
        alpha=.99)

    cph = CoxPHFitter()
    cph.fit(df_dummy, duration_col='day', event_col='status')

    cph_coeff = cph.summary
    cph_coeff = cph_coeff.reset_index()

    df_info = {}
    df_info['model'] = 'lifelines.CoxPHFitter'
    df_info['duration col'] = cph.duration_col
    df_info['event col'] = cph.event_col
    df_info['baseline estimation'] = 'breslow'
    df_info['number of observations'] = cph._n_examples
    df_info['number of events observed'] = len(
        df_dummy.loc[df_dummy['status'] == 1, ])
    df_info['partial log-likelihood'] = cph.log_likelihood_
    df_info['Concordance'] = cph.concordance_index_
    df_info['Partial AIC'] = cph.AIC_partial_
    df_info['log-likelihood ratio test'] = cph.log_likelihood_ratio_test(
    ).test_statistic
    df_info[
        'P.value(log-likelihood ratio test)'] = cph.log_likelihood_ratio_test(
        ).p_value
    df_info['log rank test'] = results.test_statistic
    df_info['P.value(log rank test)'] = results.p_value

    cph_stats = pd.DataFrame(df_info.items())
    cph_stats = cph_stats.rename(columns={0: 'Statistic', 1: 'Value'})

    tmp = []
    for cond in pa["list_of_groups"]:
        df_tmp = df_ls.loc[df_ls[pa["groups_value"]] == cond]
        km.fit(df_tmp[pa["xvals"]], df_tmp[pa["yvals"]], label=cond)

        group_table = _km_table(km)
        group_table = group_table.rename(
            columns={
                "at_risk": cond + "_at_risk",
                "removed": cond + "_removed",
                "observed": cond + "_observed",
                "censored": cond + "_censored",
                "entrance": cond + "_entrance",
                cond: cond + "_KMestimate"
            })
        group_table = group_table[[
            "time", cond + "_at_risk", cond + "_removed", cond + "_observed",
            cond + "_censored", cond + "_entrance", cond + "_KMestimate",
            cond + "_lower_0.95", cond + "_upper_0.95"
        ]]
        tmp.append(group_table)

        # Per-group styling: first settings entry whose name matches.
        PA_ = [g for g in pa["groups_settings"] if g["name"] == cond][0]

        linecolor = PA_["linecolor_write"] if str(PA_["linecolor_write"]) != "" else PA_["line_color_value"]
        linestyle = PA_["linestyle_write"] if str(PA_["linestyle_write"]) != "" else PA_["linestyle_value"]
        markerColor = PA_["markerc_write"] if str(PA_["markerc_write"]) != "" else PA_["markerc"]
        edgeColor = PA_["edgecolor_write"] if str(PA_["edgecolor_write"]) != "" else PA_["edgecolor"]

        showCensors = PA_["show_censors"] not in ["off", ".off"]
        ConfidenceInterval = PA_["Conf_Interval"] not in ["off", ".off"]
        CI_legend = PA_["ci_legend"] not in ["off", ".off"]
        CI_lines = PA_["ci_force_lines"] not in ["off", ".off"]

        linewidth = PA_["linewidth_write"]
        edgeLineWidth = PA_["edge_linewidth"]
        markerSize = PA_["censor_marker_size_val"]
        markerAlpha = PA_["marker_alpha"]
        CI_alpha = PA_["ci_alpha"]
        markerVal = PA_["censor_marker_value"]

        pl = km.plot(show_censors=showCensors,
                     censor_styles={"marker": marker_dict[markerVal],
                                    "markersize": float(markerSize),
                                    "markeredgecolor": edgeColor,
                                    "markerfacecolor": markerColor,
                                    "alpha": float(markerAlpha),
                                    "mew": float(edgeLineWidth)},
                     ci_alpha=float(CI_alpha),
                     ci_force_lines=CI_lines,
                     ci_show=ConfidenceInterval,
                     ci_legend=CI_legend,
                     linestyle=linestyle,
                     linewidth=float(linewidth),
                     color=linecolor)

    # Merge the per-group tables once all of them exist; with no groups the
    # input frame is returned unchanged.
    if tmp:
        df = reduce(lambda df1, df2: pd.merge(df1, df2, on='time'), tmp)

    # The axes styling is the same for every group, so applying it once after
    # the last curve leaves the figure in the same final state.
    if pl is not None:
        pa_ = _km_flags(pa, [
            "left_axis", "right_axis", "upper_axis", "lower_axis",
            "tick_left_axis", "tick_right_axis", "tick_upper_axis",
            "tick_lower_axis"
        ])
        pa_["grid_color_write"] = pa["grid_color_text"] if str(pa["grid_color_text"]) != "" else pa["grid_color_value"]
        _style_km_axes(pl, pa, pa_)

    return df, pl, cph_coeff, cph_stats
# Attach a lab number to every row and order the table by it
training_time['lab_number'] = training_time.lab.map(institution_map()[0])
training_time = training_time.sort_values('lab_number')

# %% PLOT

# Palette: one grey entry per distinct lab followed by a single yellow entry
use_palette = [[0.6, 0.6, 0.6]] * len(np.unique(training_time['lab'])) + [[1, 1, 0.2]]
lab_colors = group_colors()

# One cumulative-density step curve per lab, all on the same axes
f, ax1 = plt.subplots(1, 1, figsize=(FIGURE_WIDTH/3, FIGURE_HEIGHT))
kmf = KaplanMeierFitter()
for i, lab in enumerate(np.unique(training_time['lab_number'])):
    lab_rows = training_time[training_time['lab_number'] == lab]
    kmf.fit(lab_rows['sessions'].values, event_observed=lab_rows['trained'])
    lab_density = kmf.cumulative_density_
    ax1.step(lab_density.index.values, lab_density.values, color=lab_colors[i])

# Refit on all rows together; the corresponding curve is currently disabled
kmf.fit(training_time['sessions'].values, event_observed=training_time['trained'])
# ax1.step(kmf.cumulative_density_.index.values, kmf.cumulative_density_.values, color='black')

ax1.set(ylabel='Cumulative probability of\nreaching trained criterion',
        xlabel='Training day',
        xlim=[0, 60],
        ylim=[0, 1.02])
ax1.set_title('All labs: %d mice'%training_time['nickname'].nunique())

# kmf.fit(training_time['sessions'].values, event_observed=training_time['trained'])
# kmf.plot_cumulative_density(ax=ax2)
# ax2.set(ylabel='Cumulative probability of\nreaching trained criterion', xlabel='Training day',
#         title='All labs', xlim=[0, 60], ylim=[0, 1.02])
#batsmen_data = data
#data.to_csv('data.csv', sep=',')
#print(batsmen_data)

#----------------------------------(i) Player's Country vs Career Length -------------------------------------------------------
data = pd.read_csv("data.csv")

# Every row gets censor = 1, i.e. every career is passed to the estimator as
# an observed event. Plain column and .loc access replace the `.ix`
# indexer, which was removed from pandas in 1.0.
data['censor'] = 1

duration = data['span']
observed = data['censor']

# Estimator over all rows.
kmf = KaplanMeierFitter()
kmf.fit(duration, observed, label='kmf_mean')
#kmf.plot()
#plt.show()

###INDIA kmf
india_data = data.loc[data['country'] == 'INDIA']
india_duration = india_data['span']
india_observed = india_data['censor']

kmfind = KaplanMeierFitter()
kmfind.fit(india_duration, india_observed, label="india")

###similarly for other countries
kmfpak = KaplanMeierFitter()
def Compute_Pvalue(df, df2, end_date, start_date=0):
    """Collect per-group median survival and a log-rank p-value for each measure.

    For every key in the module-level `measures` table except 30, the bio
    data is filtered with `filter_data`, joined with the module-level
    `surviv_data` on 'patient_key', and a Kaplan-Meier estimator is fitted
    for groups '1' and '2'. When the two groups together hold at least 20
    rows, a multivariate log-rank test is run and both the group statistics
    and the test result are added to the returned frames.

    Parameters:
        df: bio data, passed to `filter_data`.
        df2: survival data. NOTE(review): not referenced in the body — the
            merge uses the module-level `surviv_data`; confirm which of the
            two is intended.
        end_date: forwarded to `filter_data` and written to the 'end_date'
            column of the log-rank frame.
        start_date: not referenced in the body; the 'start_date' column is
            always written as 0.

    Returns:
        (df_stats_final, df_logrank_final): DataFrames with the columns
        group/median/nombre/measure and pvalue/measure/start_date/end_date.

    NOTE(review): the nesting of the `if NbP >= 20` block and of the two
    accumulation statements was reconstructed from a whitespace-collapsed
    source — confirm against the original file.
    """
    measures_list = measures['measure_key'].tolist()
    df_stats = pd.DataFrame()
    df_logrank = pd.DataFrame()
    df_stats_final = pd.DataFrame()
    df_logrank_final = pd.DataFrame()

    for m in measures_list:
        # Row counters restart for every measure, so df_stats / df_logrank
        # act as scratch frames whose rows are overwritten each time.
        number = 0
        numberp = 0
        print(m)
        if m == 30:
            continue

        d = filter_data(df, m, 0, end_date, 2, ['1', '2'])
        #d['patient_key']=d['patient_key'].astype(int)
        measure_name = measures[measures['measure_key'] == m]['measure_name']
        d['measure_name'] = measure_name.values[0]
        if d.empty:
            continue

        #filename='C:/Users/akaic/bio/data_bio_measure_'+str(measure_name.values[0])+'.csv'
        # Join patients with their survival data
        data = surviv_data.merge(d, how='inner', on='patient_key')

        groups = ['1', '2']
        NbP = 0  # number of rows over both groups
        try:
            for g in groups:
                group1 = data[data['group'] == g]
                T = group1['Duration']
                E = group1['death']
                kmf_1 = KaplanMeierFitter().fit(T, E, label="Group " + str(g))
                median1 = kmf_1.median_survival_time_
                nb1 = group1.shape[0]

                df_stats.loc[number, 'group'] = 'group_' + str(g) + '_' + str(
                    measure_name.values[0])
                df_stats.loc[number, 'median'] = median1
                df_stats.loc[number, 'nombre'] = nb1
                df_stats.loc[number, 'measure'] = (measure_name.values[0])
                #print(d[1,'breaks'][0])
                #df_stats.at[number,'breaks']=str(d[1,'breaks'][0])
                number = number + 1
                NbP = NbP + nb1

            if NbP >= 20:
                print('Yes')
                p_value = multivariate_logrank_test(
                    data['Duration'], data['group'], data['death']).p_value
                #df_logrank.loc[numberp,'breaks'].applymap(lambda x: d.iloc[1]['breaks'])
                df_logrank.loc[numberp, 'pvalue'] = p_value
                df_logrank.loc[numberp, 'measure'] = (measure_name.values[0])
                df_logrank.loc[numberp, 'start_date'] = 0
                df_logrank.loc[numberp, 'end_date'] = end_date

                # pd.concat replaces DataFrame.append, which was removed in
                # pandas 2.0.
                df_stats_final = pd.concat([df_stats_final, df_stats],
                                           ignore_index=True)
                df_logrank_final = pd.concat([df_logrank_final, df_logrank],
                                             ignore_index=True)
        except Exception as err:
            # Still best-effort per measure, but report the failure instead
            # of hiding it.
            print('Skipping measure %s: %s' % (m, err))

    return df_stats_final, df_logrank_final
import pandas as pd
from lifelines import KaplanMeierFitter
from pylab import show

df = pd.read_excel('lapse-data-pure.xlsx')

# Rows whose 'Duration Days' is exactly 0 are dropped.
df = df[df['Duration Days'] != 0]

# Durations: negative month values are replaced by 0.
T = df['Duration Months'].apply(lambda months: 0 if months < 0 else months)

# Event flag: True exactly when the (floored) duration is positive.
E = T.apply(lambda months: True if months > 0 else False)

# Turn the numeric 'Random Class' column into the labels 'A' (>= 0.5) and 'B'.
df['Random Class'] = df['Random Class'].apply(lambda score: 'A' if score >= 0.5 else 'B')
#df.to_excel('lapse-data-ready.xlsx')

groups = df['Random Class']
kmf = KaplanMeierFitter()

# Class A is drawn first; its axes are reused for class B.
ix = groups.eq('A')
kmf.fit(T[ix], event_observed=E[ix], label='Class A')  # or, more succinctly, kmf.fit(T, E)
ax = kmf.plot()

ix = groups.eq('B')
kmf.fit(T[ix], event_observed=E[ix], label='Class B')
ax = kmf.plot(ax=ax)

show()
def survival_difference_at_fixed_point_in_time_test(point_in_time,
                                                    durations_A,
                                                    durations_B,
                                                    event_observed_A=None,
                                                    event_observed_B=None,
                                                    **kwargs):
    """
    Test whether two populations differ in survival at one specific time.

    Analysts are often interested in survival at a fixed time (for example
    5-year survival) rather than in the whole curve. Comparing the raw
    Kaplan-Meier values at that time has reduced power (see [1]); applying a
    transformation to the Kaplan-Meier curve recovers power. This function
    uses the log(-log) transformation.

    Parameters
    ----------
    point_in_time: float,
        the time at which the two survival curves are compared.
    durations_A: iterable
        a (n,) list-like of event durations (birth to death,...) for the first population.
    durations_B: iterable
        a (n,) list-like of event durations (birth to death,...) for the second population.
    event_observed_A: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the first population.
        Default assumes all observed.
    event_observed_B: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the second population.
        Default assumes all observed.
    kwargs:
        extra keywords and meta-data stored on the experiment summary.

    Returns
    -------
    results : StatisticalResult
        a StatisticalResult object with properties 'p_value', 'summary', 'test_statistic', 'print_summary'

    Examples
    --------
    >>> T1 = [1, 4, 10, 12, 12, 3, 5.4]
    >>> E1 = [1, 0, 1, 0, 1, 1, 1]
    >>>
    >>> T2 = [4, 5, 7, 11, 14, 20, 8, 8]
    >>> E2 = [1, 1, 1, 1, 1, 1, 1, 1]
    >>>
    >>> from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
    >>> results = survival_difference_at_fixed_point_in_time_test(12, T1, T2, event_observed_A=E1, event_observed_B=E2)
    >>>
    >>> results.print_summary()
    >>> print(results.p_value)        # 0.893
    >>> print(results.test_statistic) # 0.017

    Notes
    -----
    Other transformations are possible, but Klein et al. [1] showed that the
    log(-log(c)) transform has the most desirable statistical properties.

    [1] Klein, J. P., Logan, B. , Harhoff, M. and Andersen, P. K. (2007), Analyzing survival curves at a fixed point in time. Statist. Med., 26: 4505-4519. doi:10.1002/sim.2864
    """

    def _cloglog(s):
        # complementary log-log transform of a survival value
        return np.log(-np.log(s))

    fitted_A = KaplanMeierFitter().fit(durations_A, event_observed=event_observed_A)
    fitted_B = KaplanMeierFitter().fit(durations_B, event_observed=event_observed_B)

    # Survival estimates of each population at the requested time.
    surv_A = fitted_A.predict(point_in_time)
    surv_B = fitted_B.predict(point_in_time)

    # Interpolate each estimator's cumulative squared term at the same time.
    var_A = dataframe_interpolate_at_times(fitted_A._cumulative_sq_, point_in_time)
    var_B = dataframe_interpolate_at_times(fitted_B._cumulative_sq_, point_in_time)

    numerator = (_cloglog(surv_A) - _cloglog(surv_B))**2
    denominator = var_A / np.log(surv_A)**2 + var_B / np.log(surv_B)**2
    statistic = numerator / denominator

    p_value = chisq_test(statistic, 1)

    return StatisticalResult(p_value,
                             statistic,
                             null_distribution="chi squared",
                             degrees_of_freedom=1,
                             point_in_time=point_in_time,
                             **kwargs)
def main(self, durations: List[pd.DataFrame],
         categories: List[pd.DataFrame],
         event_observed: List[pd.DataFrame],
         estimator: str,
         id_filter: List[str],
         subsets: List[List[str]]) -> dict:
    """Estimate a survival curve for every category/subset combination.

    :param durations: Exactly one DataFrame holding the duration values
        (read from its 'value' column).
    :param categories: DataFrames handed to utils.apply_categories.
    :param event_observed: At most one DataFrame. When given, an 'id' that
        has a non-null value in it counts as an observed event; when empty,
        the fitter is called with event_observed=None.
    :param estimator: 'NelsonAalen' or 'KaplanMeier'.
    :param id_filter: Ids handed to utils.apply_id_filter.
    :param subsets: Id lists handed to utils.apply_subsets.
    :return: Dict with the keys 'label', 'categories', 'subsets' and
        'stats'; stats[category][subset] holds 'timeline', 'estimate',
        'ci_lower' and 'ci_upper' lists. Combinations with three or fewer
        durations are left out.
    :raises ValueError: If the number of duration or event_observed frames
        is wrong, or the estimator name is unknown.
    """
    if len(durations) != 1:
        error = 'Analysis requires exactly one array that specifies the ' \
                'duration length.'
        # logger.error, not logger.exception: no exception is being
        # handled here, so there is no traceback to attach.
        logger.error(error)
        raise ValueError(error)
    if len(event_observed) > 1:
        error = 'Maximal one variable for "event_observed" allowed'
        logger.error(error)
        raise ValueError(error)

    # dropna() returns a new frame, so the caller's DataFrame is not
    # modified as a side effect.
    df = durations[0].dropna()
    df = utils.apply_id_filter(df=df, id_filter=id_filter)
    df = utils.apply_subsets(df=df, subsets=subsets)
    df = utils.apply_categories(df=df, categories=categories)

    stats = {}
    categories = df['category'].unique().tolist()
    subsets = df['subset'].unique().tolist()

    # for every category and subset combination estimate the survival fun.
    for category in categories:
        for subset in subsets:
            sub_df = df[(df['category'] == category) &
                        (df['subset'] == subset)]
            T = sub_df['value']
            E = None  # passed to the fitter as-is when no event data is given
            if len(T) <= 3:
                continue

            if event_observed:
                # Right-merge keeps one row per duration; an id without a
                # match ends up with a null 'value_x'.
                merged = event_observed[0].merge(sub_df, how='right',
                                                 on='id')
                E = [not missing for missing in pd.isnull(merged['value_x'])]
                assert len(E) == len(T)

            if estimator == 'NelsonAalen':
                fitter = NelsonAalenFitter()
                fitter.fit(durations=T, event_observed=E)
                estimate = fitter.cumulative_hazard_[
                    'NA_estimate'].tolist()
                ci_lower = fitter.confidence_interval_[
                    'NA_estimate_lower_0.95'].tolist()
                ci_upper = fitter.confidence_interval_[
                    'NA_estimate_upper_0.95'].tolist()
            elif estimator == 'KaplanMeier':
                fitter = KaplanMeierFitter()
                fitter.fit(durations=T, event_observed=E)
                # noinspection PyUnresolvedReferences
                estimate = fitter.survival_function_[
                    'KM_estimate'].tolist()
                ci_lower = fitter.confidence_interval_[
                    'KM_estimate_lower_0.95'].tolist()
                ci_upper = fitter.confidence_interval_[
                    'KM_estimate_upper_0.95'].tolist()
            else:
                error = 'Unknown estimator: {}'.format(estimator)
                logger.error(error)
                raise ValueError(error)

            timeline = fitter.timeline.tolist()
            stats.setdefault(category, {})[subset] = {
                'timeline': timeline,
                'estimate': estimate,
                'ci_lower': ci_lower,
                'ci_upper': ci_upper
            }

    return {
        'label': df['feature'].tolist()[0],
        'categories': categories,
        'subsets': subsets,
        'stats': stats
    }
# Evaluate the naive estimator at every time point in x, storing into y.
for i, t in enumerate(x):
    y[i] = naive_estimator(t, data)
plt.plot(x, y, label="Naive")

# Overlay the home-made Kaplan-Meier curve as a step function.
x, y = HomemadeKM(data)
plt.step(x, y, label="Kaplan-Meier")

plt.xlabel("Time")
plt.ylabel("Survival probability estimate")
plt.legend()
plt.show()

# Next, the survival functions of the two stage groups are compared.
# Fit one estimator per group (KM is the estimator class used here) and draw
# both curves on the same axes.
S1 = data[data.Stage_group == 1]
km1 = KM()
km1.fit(S1['Time'], event_observed=S1['Event'], label='Stage III')

S2 = data[data.Stage_group == 2]
km2 = KM()
km2.fit(S2['Time'], event_observed=S2['Event'], label='Stage IV')

# The first plot creates the axes; the second one reuses them.
ax = km1.plot(ci_show=False)
km2.plot(ax=ax, ci_show=False)
plt.xlabel('time')
plt.ylabel('Survival probability estimate')
plt.savefig('two_km_curves', dpi=300)

# Let's compare the survival functions at 90, 180, 270, and 360 days
}) time_data.to_csv('results/running_time_coxph.csv', index=False) ############################### ##Extra code: calibration plots #Cox model calibration train set y_pred = cph.predict_survival_function(data_train) times = y_pred.index.values y_pred = y_pred.as_matrix().transpose() cuts = np.concatenate( (np.array([-1e6]), np.percentile(y_pred[:, 1], [25, 50, 75]), np.array([1e6]))) bin = pd.cut(y_pred[:, 1], cuts, labels=False) kmf = KaplanMeierFitter() for which_bin in range(max(bin) + 1): kmf.fit(data_train.time.iloc[bin == which_bin], event_observed=data_train.dead.iloc[bin == which_bin]) plt.plot(kmf.survival_function_.index.values, kmf.survival_function_.KM_estimate, color='k') pred_surv = np.mean(y_pred[bin == which_bin, :], axis=0) plt.plot(times, pred_surv, 'b-') plt.xticks(np.arange(0, 365 * 5, 365)) plt.yticks(np.arange(0, 1.0001, 0.125)) plt.xlim([0, 365.25 * 5]) plt.ylim([0, 1]) plt.gca().set_position([0.1, 0.1, .8, .8]) plt.show()
).assign( published_date=lambda x: x.published_date.fillna(date.today())).assign( time_to_published=lambda x: pd.to_datetime( x.published_date) - pd.to_datetime(x.posted_date))) preprints_w_published_dates = preprints_w_published_dates[ preprints_w_published_dates.time_to_published > pd.Timedelta(0)].dropna() print(preprints_w_published_dates.shape) preprints_w_published_dates.head() # # Calculate Overall Survival Function # This section loads up the KaplanMeier Estimator for preprints. It measures the lifetime of unpublished preprints. Overtime preprints start to become published which is what decreases the population size. # In[5]: kmf = KaplanMeierFitter() # In[6]: kmf.fit( preprints_w_published_dates["time_to_published"].dt.total_seconds() / 60 / 60 / 24, event_observed=~preprints_w_published_dates["published_doi"].isna(), ) # In[7]: kmf.median_survival_time_ # In[8]:
def cluster_KMplot(cluster_assign, clin_data_fn, delimiter='\t', lr_test=True,
                   tmax=-1, verbose=True, **save_args):
    """Plot one Kaplan-Meier curve per cluster and optionally run a log-rank test.

    Parameters:
        cluster_assign: pandas Series mapping a patient identifier (index) to
            a cluster label (value). Its ``name`` is used to look the cluster
            column up after concatenation with the clinical table.
        clin_data_fn: path of a delimited clinical table whose first column is
            the patient identifier and which contains the columns
            ``overall_survival`` and ``vital_status``.
        delimiter: field separator of the clinical table.
        lr_test: when True, run ``multiv_lr_test`` across the clusters and put
            the p-value in the plot title.
        tmax: forwarded to ``multiv_lr_test`` as ``t_0``. It does not affect
            the x-axis, which is always limited to 0-1825 days.
        verbose: when True, print the p-value and a completion message.
        **save_args: optional ``job_name`` (title / file-name prefix) and
            ``outdir`` (prefix the output file name is appended to; the two
            are joined by plain string concatenation, so include the trailing
            path separator).

    Returns:
        The log-rank p-value when ``lr_test`` is True, otherwise None.
    """
    title = 'KM Survival Plot'
    if 'job_name' in save_args:
        # str() keeps the title consistent with the save path below, which
        # already tolerates non-string job names.
        title = str(save_args['job_name']) + ' KM Survival Plot'

    # Initialize KM plotter
    kmf = KaplanMeierFitter()

    # Load and format clinical data
    surv = pd.read_csv(clin_data_fn, sep=delimiter, index_col=0)

    # One colour per cluster label, in sorted label order
    clusters = sorted(cluster_assign.value_counts().index)
    cluster_cmap = dict(zip(clusters, sns.color_palette('hls', len(clusters))))

    # Initialize KM Plot Settings
    plt.figure(figsize=(10, 7))
    ax = plt.subplot(1, 1, 1)

    # Plot each cluster onto KM Plot
    for clust in clusters:
        clust_pats = cluster_assign[cluster_assign == clust].index
        # `.ix` was removed from pandas. A membership mask selects the same
        # patients and silently ignores identifiers that are missing from the
        # clinical table (previously such rows came back as NaN and were
        # removed by dropna()).
        clust_surv_data = surv.loc[surv.index.isin(clust_pats)].dropna()
        kmf.fit(clust_surv_data.overall_survival,
                clust_surv_data.vital_status,
                label='Group ' + str(clust) +
                      ' (n=' + str(len(clust_surv_data)) + ')')
        kmf.plot(ax=ax, color=cluster_cmap[clust], ci_show=False)

    # Set KM plot limits to 5 years and labels
    plt.xlim((0, 1825))
    plt.xlabel('Time (Days)', fontsize=16)
    plt.ylabel('Survival Probability', fontsize=16)

    # Multivariate logrank test
    p = None
    if lr_test:
        cluster_survivals = pd.concat(
            [surv, cluster_assign], axis=1).dropna().astype(int)
        p = multiv_lr_test(
            np.array(cluster_survivals.overall_survival),
            np.array(cluster_survivals[cluster_assign.name]),
            t_0=tmax,
            event_observed=np.array(cluster_survivals.vital_status)).p_value
        if verbose:
            # Single-argument print() gives the same output on Python 2 and 3.
            print('Multi-Class Log-Rank P: %s' % p)
        plt.title(title + '\np=' + repr(round(p, 4)), fontsize=24, y=1.02)
    else:
        plt.title(title, fontsize=24, y=1.02)

    # Save KM plot
    if 'outdir' in save_args:
        if 'job_name' in save_args:
            save_KMplot_path = save_args['outdir'] + str(
                save_args['job_name']) + '_KM_plot.png'
        else:
            save_KMplot_path = save_args['outdir'] + 'KM_plot.png'
        plt.savefig(save_KMplot_path, bbox_inches='tight')
    plt.show()

    if verbose:
        print('Kaplan Meier Plot constructed')
    return p
''' T E group 0 6 1 miR-137 1 13 1 miR-137 2 13 1 miR-137 3 13 1 miR-137 4 19 1 miR-137 ''' T = df['T'] E = df['E'] groups = df['group'] ix = (groups == 'miR-137') kmf = KaplanMeierFitter() kmf.fit(T[~ix], E[~ix], label='control') ax = kmf.plot() kmf.fit(T[ix], E[ix], label='miR-137') kmf.plot(ax=ax) plt.ylabel('Survival Probability') outFile = 'lifelines_survival.png' ISP_mystyle.showData(outFile) # Compare the two curves results = logrank_test(T[ix], T[~ix], event_observed_A=E[ix],
def get_kmf_fit(qs):
    """Fit a Kaplan-Meier estimator to the rows of ``qs`` and return it.

    ``qs`` must provide ``values_list(field, flat=True)`` (presumably a Django
    queryset -- confirm with the caller). ``days_since_complaint`` is used as
    the duration and ``is_closed`` as the event-observed flag.
    """
    durations = qs.values_list('days_since_complaint', flat=True)
    closed_flags = qs.values_list('is_closed', flat=True)

    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=closed_flags)
    return fitter
# We use a Kaplan-Meier estimator or a product limit estimator. Non-parametric statistic to estimate survival from # lifetime data. import pandas as pd from lifelines.datasets import load_dd data = load_dd() data.sample(2) # the boolean columns `observed` refers to whether the death (leaving office) # was observed or not. # 'Observed' then tells us whether or not something is right-censored? # For this example we'll use KaplanMeier but you can also use BreslowFlemingHarringtonFitter, WeibullFitter or # ExponentialFitter from lifelines import KaplanMeierFitter kmf = KaplanMeierFitter() # "For this estimation, we need the duration each leader was/has been in office, and whether or not # they were observed to have left office (leaders who died in office or were in office in 2008, the latest date this # data was record at, do not have observed death events)" # How the KaplanMeierFitter works: # # KaplanMeierFitter.fit(durations, event_observed=None, # timeline=None, entry=None, label='KM_estimate', # alpha=None, left_censorship=False, ci_labels=None) # # Parameters: # duration: an array, or pd.Series, of length n -- duration subject was observed for # timeline: return the best estimate at the values in timelines (postively increasing) # event_observed: an array, or pd.Series, of length n -- True if the the death was observed, False if the event
def _plot_kmf_single(df, condition_col, survival_col, censor_col, threshold,
                     title, xlabel, ylabel, ax, with_condition_color,
                     no_condition_color, with_condition_label,
                     no_condition_label, color_map, label_map, color_palette,
                     ci_show, print_as_title):
    """
    Helper function to produce a single KM survival plot, among observations
    in df by groups defined by condition_col.

    All inputs are required - this function is intended to be called by
    `plot_kmf`.

    Grouping rules:
      * ``threshold`` given: rows are split on ``df[condition_col] > threshold``
        (``"median"`` uses the column median as the threshold).
      * object / category column: one group per distinct value.
      * bool column: the column itself is the grouping.
      * anything else raises ``ValueError``.

    Returns a results object with ``survival_data_series``,
    ``event_data_series`` and ``desc`` attached:
      * two plotted groups: the ``logrank_test`` result;
      * zero or one plotted group: ``NullSurvivalResults()``;
      * more than two: the fitted ``CoxPHFitter`` (its summary is printed).
    """
    # make color inputs consistent hex format
    if colors.is_color_like(with_condition_color):
        with_condition_color = colors.to_hex(with_condition_color)
    if colors.is_color_like(no_condition_color):
        no_condition_color = colors.to_hex(no_condition_color)

    ## prepare data to be plotted; producing 3 outputs:
    # - `condition`, series containing category labels to be plotted
    # - `label_map` (mapping condition values to plot labels)
    # - `color_map` (mapping condition values to plotted colors)
    if threshold is not None:
        is_median = threshold == "median"
        if is_median:
            threshold = df[condition_col].median()
        label_suffix = float_str(threshold)
        condition = df[condition_col] > threshold
        default_label_no_condition = "%s ≤ %s" % (condition_col, label_suffix)
        # only the "with condition" label advertises that the cut is the median
        if is_median:
            label_suffix += " (median)"
        default_label_with_condition = "%s > %s" % (condition_col, label_suffix)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    elif df[condition_col].dtype == 'O' or df[condition_col].dtype.name == "category":
        condition = df[condition_col].astype("category")
        if not label_map:
            label_map = {
                condition_value: '{} = {}'.format(condition_col, condition_value)
                for condition_value in condition.unique()
            }
        if not color_map:
            rgb_values = sb.color_palette(color_palette, len(label_map.keys()))
            hex_values = [colors.to_hex(col) for col in rgb_values]
            color_map = dict(zip(label_map.keys(), hex_values))
    elif df[condition_col].dtype == 'bool':
        condition = df[condition_col]
        default_label_with_condition = "= {}".format(condition_col)
        default_label_no_condition = "¬ {}".format(condition_col)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    else:
        # The original literal carried a stray backslash and whitespace from a
        # line continuation inside the string; emit a clean message instead.
        raise ValueError("Don't know how to plot data of type {}".format(
            df[condition_col].dtype))

    # produce kmf plot for each category (group) identified above
    kmf = KaplanMeierFitter()
    grp_desc = []
    grp_survival_data = {}
    grp_event_data = {}
    # Record the groups that are actually plotted. groupby() drops missing
    # keys, so deriving the names from `condition.unique()` could list a NaN
    # "group" that has no data and break the lookups further down.
    grp_names = []
    for grp_name, grp_df in df.groupby(condition):
        grp_survival = grp_df[survival_col]
        grp_event = grp_df[censor_col].astype(bool)
        grp_label = label_map[grp_name]
        grp_color = color_map[grp_name]
        kmf.fit(grp_survival, grp_event, label=grp_label)
        grp_desc.append("# {}: {}".format(grp_label, len(grp_survival)))
        grp_names.append(grp_name)
        grp_survival_data[grp_name] = grp_survival
        grp_event_data[grp_name] = grp_event
        if ax is not None:
            ax = kmf.plot(ax=ax, show_censors=True, ci_show=ci_show,
                          color=grp_color)
        else:
            ax = kmf.plot(show_censors=True, ci_show=ci_show, color=grp_color)

    ## format the plot
    # Set the y-axis to range 0 to 1
    ax.set_ylim(0, 1)
    y_tick_vals = ax.get_yticks()
    # round before int(): e.g. 0.29 * 100 == 28.999999999999996, which plain
    # truncation would label as 28
    ax.set_yticklabels(["%d" % int(round(y_tick_val * 100))
                        for y_tick_val in y_tick_vals])
    # plot title
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title(' | '.join(grp_desc))
    else:
        for desc in grp_desc:
            print(desc)
    # axis labels
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    ## summarize analytical version of results
    ## again using same groups as are plotted
    if len(grp_names) == 2:
        # use log-rank test for 2 groups
        first, second = grp_names
        results = logrank_test(grp_survival_data[first],
                               grp_survival_data[second],
                               event_observed_A=grp_event_data[first],
                               event_observed_B=grp_event_data[second])
    elif len(grp_names) <= 1:
        # no analytical result for 1 or 0 groups
        results = NullSurvivalResults()
    else:
        # cox PH fitter for >2 groups
        cf = CoxPHFitter()
        cox_df = patsy.dmatrix(
            '+'.join([condition_col, survival_col, censor_col]),
            df, return_type='dataframe')
        del cox_df['Intercept']
        results = cf.fit(cox_df, survival_col, event_col=censor_col)
        results.print_summary()

    # add metadata to results object so caller can print them
    results.survival_data_series = grp_survival_data
    results.event_data_series = grp_event_data
    results.desc = grp_desc
    return results
from lifelines.statistics import logrank_test


def seen_death(row):
    """Return True when the last entry of row["dates"] precedes the cutoff.

    The cutoff is 2014-09-15 minus the mean of df["max_interval"], in days.
    `df` is the module-level frame that the community loop below rebinds, so
    the cutoff depends on whichever community is currently loaded.
    """
    recent = datetime.datetime(2014,9,15) - datetime.timedelta(days=int(df["max_interval"].mean()))
    return row["dates"][-1] < recent


connection = pymongo.MongoClient('localhost', 27017)

# Every database on the server is treated as a community, except the
# bookkeeping databases listed here.
communities = connection.database_names()
for db in ["gender", "admin", "local", "visualizations", "results"]:
    if db in communities:communities.remove(db)

results_db = connection['results']['question_3']
kmf = KaplanMeierFitter()

for community in communities:
    community_db = connection[community]['statistics']
    # Keep documents with at least one contribution and a known gender, and
    # project only the fields used below.
    cursor = community_db.find({'contributions_total': {'$gt':0}, 'gender': {'$ne': "Unknown"} }, {u'_id': False, 'lifetime': True, 'max_interval': True, u'gender':True, 'activity_freq': True, 'dates':True} )
    df = pandas.DataFrame(list(cursor))
    # Flag each row via seen_death (which reads this same df for its cutoff).
    df["dead"] = df.apply(seen_death, axis=1)
    males = df[df['gender']=='Male']
    females = df[df['gender']=='Female']
def plot_kmf(df, condition_col, censor_col, survival_col, threshold=None,
             title=None, xlabel=None, ax=None, print_as_title=False):
    """
    Draw Kaplan-Meier curves for two groups of ``df`` and log-rank test them.

    The groups are the rows where the condition holds and the rows where it
    does not. With ``threshold`` set, the condition is
    ``df[condition_col] > threshold`` (``'median'`` means the column median);
    without it, ``df[condition_col]`` itself is used as the row mask.

    Parameters
    ----------
    df: dataframe
    condition_col: string, column which contains the condition to split on
    censor_col: string, column that is cast to bool and passed to the fitter
        as the event-observed indicator
    survival_col: string, column which contains the survival time
    threshold: number or 'median', optional, cut applied to condition_col
    title: title for the plot, default None
    xlabel: x-axis label, default None
    ax: an existing matplotlib ax, optional, default None
    print_as_title: bool, optional, put the group sizes in the plot title
        instead of printing them to stdout, default False

    Returns
    -------
    The result of ``logrank_test`` between the two groups.
    """
    fitter = KaplanMeierFitter()

    # Build the row mask and the legend label of the "with condition" curve.
    if threshold is None:
        condition = df[condition_col]
        label = '{}'.format(condition_col)
    else:
        if threshold == 'median':
            threshold = df[condition_col].median()
        condition = df[condition_col] > threshold
        label = '{} > {}'.format(condition_col, threshold)

    rows_with = df[condition]
    rows_without = df[~condition]

    survival_no_condition = rows_without[survival_col]
    survival_with_condition = rows_with[survival_col]
    event_no_condition = rows_without[censor_col].astype(bool)
    event_with_condition = rows_with[censor_col].astype(bool)

    plot_options = {'show_censors': True, 'ci_show': False}

    # "No condition" curve first; it creates the axes when none were given.
    fitter.fit(survival_no_condition, event_no_condition, label="")
    if ax:
        fitter.plot(ax=ax, **plot_options)
    else:
        ax = fitter.plot(**plot_options)

    # The same fitter is re-fit for the "with condition" curve.
    fitter.fit(survival_with_condition, event_with_condition, label=(label))
    fitter.plot(ax=ax, **plot_options)

    # Set the y-axis to range 0 to 1
    ax.set_ylim(0, 1)

    no_cond_str = "# no condition {}".format(len(survival_no_condition))
    cond_str = "# with condition {}".format(len(survival_with_condition))
    if title or print_as_title:
        # an explicit title wins over the group-size summary
        ax.set_title(title if title else "%s | %s" % (no_cond_str, cond_str))
    else:
        print(no_cond_str)
        print(cond_str)

    if xlabel:
        ax.set_xlabel(xlabel)

    return logrank_test(survival_no_condition,
                        survival_with_condition,
                        event_observed_A=event_no_condition,
                        event_observed_B=event_with_condition)
return t elif is_number(c['year_of_birth']) == True and is_number(c['age_at_diagnosis']) == True and is_number(c['days_to_death']) == False: t = 2018 - float(c['year_of_birth']) - (float(c['age_at_diagnosis'])*4/(365*3 + 366)) return t else: return "NotApplicable" matrix['duration'] = matrix.apply(duration, axis = 1) matrix['event'] = matrix.apply(event, axis = 1) matrix = matrix[['bcr_sample_barcode', 'duration', 'event']] #new_header = matrix.iloc[0] #grab the first row for the header #matrix = matrix[1:] #take the data less the header row #matrix.columns = new_header matrix = matrix[matrix['duration']!="NotApplicable"] kmf = KaplanMeierFitter() kmf.fit(durations = matrix.duration, event_observed = matrix.event) kmf.survival_function_ # plot the KM estimate kmf.plot() # Add title and y-axis label plt.title("The Kaplan-Meier Estimate for BRCA (total)") plt.ylabel("Probability a patient is still active") plt.show()
print(df.head())
# Layout of the first rows:
#     T  E    group
# 0   6  1  miR-137
# 1  13  1  miR-137
# 2  13  1  miR-137
# 3  13  1  miR-137
# 4  19  1  miR-137

T, E, groups = df['T'], df['E'], df['group']
ix = groups == 'miR-137'

# Fit and draw the control curve first, then the miR-137 curve on the same
# axes; one fitter is reused for both.
kmf = KaplanMeierFitter()
ax = None
for mask, curve_label in ((~ix, 'control'), (ix, 'miR-137')):
    kmf.fit(T[mask], E[mask], label=curve_label)
    ax = kmf.plot(ax=ax)

plt.ylabel('Survival Probability')
plt.show()

# Compare the two curves
results = logrank_test(
    T[ix],
    T[~ix],
    event_observed_A=E[ix],
    event_observed_B=E[~ix],
)
results.print_summary()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter

# Shared KaplanMeierFitter instance for use throughout the script.
kmf = KaplanMeierFitter()

#### Cleaning Procedure-------------------------
# Loading the CSVs from the local machine (see repository for data, originally
# from Kaggle). Backslashes are escaped in both paths: the previous
# "~\Questions.csv" relied on the invalid escape sequence "\Q", which newer
# Python versions warn about. The resulting path string is unchanged.
ans = pd.read_csv("~\\Answers.csv", encoding='latin-1')
qus = pd.read_csv("~\\Questions.csv", encoding='latin-1')

# Reducing answers to the best-scoring answer for each question.
ans = ans.sort_values(['ParentId', 'Score'], ascending=[True, False])
ans = ans.drop_duplicates(subset='ParentId')

# Merging questions and answers together using a left outer join, so questions
# without any answer are kept (their answer columns are NaN).
sf = pd.merge(qus, ans, how='left', left_on='Id', right_on='ParentId')

# Event flag for modeling: a question counts as "answered" only when its best
# answer has a score of at least 3. Lower-scored or missing answers give False.
sf['event'] = sf.Score_y >= 3

# Creating time variables: elapsed time from the question's creation to the
# creation of its best-scoring answer, in minutes and in hours.
sf['ans_date'] = pd.to_datetime(sf['CreationDate_y'])
sf['ask_date'] = pd.to_datetime(sf['CreationDate_x'])
sf['duration'] = sf['ans_date'] - sf['ask_date']
sf['duration_min'] = sf['duration'].dt.total_seconds() / 60
sf['duration_hr'] = sf["duration_min"] / 60
def kmplot(df_high, df_low, ax):
    """Plot KM curves for a "high" and a "low" group and log-rank test them.

    Parameters:
        df_high, df_low: frames with ``duration`` and ``event`` columns.
        ax: matplotlib axes that receives both curves, the axis labels, the
            p-value annotation and the legend.

    Returns:
        ``(p_value, hm5, hm10, lm5, lm10)``: the log-rank p-value followed by
        the fitted estimates at t=60 and t=120 for the high group and then the
        low group (the x-axis is labelled in months). If either fit raises
        ``ValueError`` nothing is drawn and the placeholder tuple
        ``("NA", "0", "0", "0", "0")`` is returned.
    """
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration,
                     event_observed=df_high.event,
                     label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration,
                    event_observed=df_low.event,
                    label="Low: n = " + str(len(df_low)))
    except ValueError:
        # Callers rely on this string placeholder tuple; keep its shape.
        return ("NA", "0", "0", "0", "0")

    kmf_high.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    kmf_low.plot(ax=ax, color="black", show_censors=True, ci_show=False)

    statistics_result = logrank_test(df_high.duration,
                                     df_low.duration,
                                     event_observed_A=df_high.event,
                                     event_observed_B=df_low.event)
    p_value = statistics_result.p_value

    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    ax.text(0.95, 0.02, 'logrank P = %.4f' % p_value,
            verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes, color='black', fontsize=11)
    # Put the legend on the axes we were given. plt.legend() targets whatever
    # the *current* axes are, which is wrong when `ax` is one of several
    # subplots.
    ax.legend(loc=3)

    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
# Censor some observations cutoff = 30 # Generate a censor length cutoff = np.repeat(cutoff, N) duration = np.minimum(event_t,cutoff) # "Cut-off" observations over cutoff level not_censor = event_t <= duration # generate a boolean indicator of censoring not_censor = not_censor.astype(int) # convert boolean to zeroes and ones # Convert to data frame data = pd.DataFrame({'duration': duration, 'event': not_censor, 'age': age, 'college': college}) # Plot observations with censoring # plot_lifetimes(duration, event_observed = not_censor) # Kaplan Meier Summary for Simulated Data from lifelines import KaplanMeierFitter kmf = KaplanMeierFitter() kmf.fit(duration, event_observed = not_censor) kmf.survival_function_.plot() # Cox-PH Model Regression from lifelines import CoxPHFitter cf = CoxPHFitter() cf.fit(data, 'duration', event_col = 'event') cf.print_summary() ## Get Predictions from Model ## # 24 year old college grad #college_24 = pd.DataFrame({'age':[24], 'college':[1]}) #cf.predict_survival_function(college_24).plot()
# Join the per-bootstrap prediction arrays along axis 1.
# NOTE(review): presumably one column per bootstrap replicate -- confirm
# against the code that fills preds_bootfull.
preds_bootfull_mat = np.concatenate(preds_bootfull, axis=1)
inds_inbag_mat = np.array(inds_inbag).T
# Row i of inbag_mask flags (as 1/0) the columns of inds_inbag_mat in which
# the value i occurs.
# NOTE(review): range(inds_inbag_mat.shape[0]) assumes each bootstrap draws as
# many indices as there are samples -- confirm.
inbag_mask = 1 * np.array([
    np.any(inds_inbag_mat == _, axis=0)
    for _ in range(inds_inbag_mat.shape[0])
])
# Average each row's predictions over the columns where it is NOT flagged as
# in-bag. A row flagged in every column divides by zero here.
preds_bootave_oob = np.divide(
    np.sum(np.multiply((1 - inbag_mask), preds_bootfull_mat), axis=1),
    np.sum(1 - inbag_mask, axis=1))
# 1 when the averaged prediction is above the median, else 0.
risk_groups = 1 * (preds_bootave_oob > np.median(preds_bootave_oob))
# Working frame indexed by the risk group as a string ('0' / '1'), so that
# wdf.loc['0', ...] selects every row of that group.
# NOTE(review): the column names assume y_orig has exactly two columns ordered
# (status, time) -- confirm.
wdf = pd.DataFrame(np.concatenate(
    (y_orig, preds_bootave_oob[:, np.newaxis], risk_groups[:, np.newaxis]),
    axis=-1),
    columns=['status', 'time', 'preds', 'risk_groups'],
    index=[str(_) for _ in risk_groups])

# One fitter is reused: fit and draw group '0', then group '1', on the same axes.
kmf = KaplanMeierFitter()
ax = plt.subplot(111)
kmf.fit(durations=wdf.loc['0', 'time'],
        event_observed=wdf.loc['0', 'status'],
        label="Low Risk")
ax = kmf.plot(ax=ax)
kmf.fit(durations=wdf.loc['1', 'time'],
        event_observed=wdf.loc['1', 'status'],
        label="High Risk")
ax = kmf.plot(ax=ax)
plt.ylim(0, 1)
plt.title("Kaplan-Meier Plots")
plt.xlabel('Time (days)')
plt.ylabel('Survival Probability')
def kmf(self):
    """Build and return a new ``KaplanMeierFitter`` (no data is fitted here)."""
    fitter = KaplanMeierFitter()
    return fitter
# Train until the training loss ('loss') stops improving for 2 epochs; the
# very large epoch count leaves stopping to the callback.
early_stopping = EarlyStopping(monitor='loss', patience=2)
history = model.fit(x_train,
                    y_train,
                    batch_size=256,
                    epochs=100000,
                    callbacks=[early_stopping])
y_pred = model.predict_proba(x_train, verbose=0)

#Example of finding model-predicted survival probability.
#Predicted survival prob. for first individual at follow-up time of 30 days:
pred_surv = nnet_survival.nnet_pred_surv(
    model.predict_proba(x_train, verbose=0), breaks, 30)
print(pred_surv[0])

#Plot predicted vs. actual survival
kmf = KaplanMeierFitter()
kmf.fit(t, event_observed=f)
# Predicted curve for the first individual: 1 followed by the cumulative
# product of that row of y_pred, plotted against `breaks`.
# NOTE(review): this assumes len(breaks) == y_pred.shape[1] + 1 -- confirm.
plt.plot(breaks, np.concatenate(([1], np.cumprod(y_pred[0, :]))), 'bo-')
# Kaplan-Meier estimate fitted above, drawn in black.
plt.plot(kmf.survival_function_.index.values,
         kmf.survival_function_.KM_estimate,
         color='k')
plt.xlabel('Follow-up time (days)')
plt.ylabel('Proportion surviving')
plt.title(
    'All patients from same survival distribution, no censoring. Actual=black, predicted=blue.'
)
plt.show()

############################################################################
#Flexible model (non-proportional hazards).
#All pts with same exponential survival distribution, some patients censored
def execute():
    """Plot Kaplan-Meier survival curves for the most common categories.

    Loads incident duration, survival flag, group category and group size
    from the ISRID SQLite database, then:

    * draws a 2x2 grid with one panel per category (the four most common),
      each showing a "Groups" curve (size > 1) and a "Singles" curve
      (size == 1), saved to ``km-grid-large.svg``;
    * draws a combined figure from the fitters collected per category, saved
      to ``km-combined-large.svg``;
    * shows the figures.
    """
    matplotlib.rc("font", size=20)

    engine, session = database.initialize("sqlite:///../data/isrid-master.db")
    try:
        # Query with Group.size may take awhile, at least for Charles
        # Not sure why
        query = session.query(Incident.total_hours, Subject.survived,
                              Group.category, Group.size).join(Group, Subject)

        print("Tabulating query... may take awhile for unknown reasons.")
        df = tabulate(query)
        print("Done tabulating.")
        print(df.describe())
    finally:
        # Release the database handle even if the query or tabulation fails.
        database.terminate(engine, session)

    df = df.assign(
        days=[total_hours.total_seconds() / 3600 / 24
              for total_hours in df.total_hours],
        doa=[not survived for survived in df.survived],
    )
    # Keep only rows with a non-negative incident time.
    df = df[0 <= df.days]

    rows, columns = 2, 2
    grid, axes = plt.subplots(rows, columns, figsize=(15, 10))

    categories = Counter(df.category)
    kmfs = []
    options = {"show_censors": True,
               "censor_styles": {"marker": "|", "ms": 6},
               "censor_ci_force_lines": False}

    top_categories = categories.most_common(rows * columns)
    for plot, (category, _count) in enumerate(top_categories):
        print("Category:", category)
        ax = axes[plot // columns, plot % columns]

        df_ = df[df.category == category]
        N, Ndoa = len(df_), sum(df_.doa)
        Srate = 100 * (1 - Ndoa / N)

        # Use item access for the "size" column: `df_.size` is the built-in
        # DataFrame attribute (total number of elements), not the column, so
        # `df_[df_.size > 1]` never selected by group size.
        group_size = df_["size"]
        grp = df_[group_size > 1]
        sng = df_[group_size == 1]

        kmf = KaplanMeierFitter()
        kmf.fit(grp.days, event_observed=grp.doa, label=category + " Groups")
        kmf.plot(ax=ax, **options)
        kmf.fit(sng.days, event_observed=sng.doa, label=category + " Singles")
        kmf.plot(ax=ax, **options)
        # NOTE(review): the fitter was re-fit on the singles just above, so the
        # combined figure below shows only the "Singles" curve of each
        # category -- confirm this is intended.
        kmfs.append(kmf)

        ax.set_xlim(0, min(30, 1.05 * ax.get_xlim()[1]))
        ax.set_ylim(0, 1)
        ax.set_title("{}, N = {}, DOA = {}, {:.0f}% surv".format(
            category, N, Ndoa, Srate))
        ax.set_xlabel("Total Incident Time (days)")
        ax.set_ylabel("Probability of Survival")

    grid.suptitle("Kaplan-Meier Survival Curves", fontsize=25)
    grid.tight_layout()
    grid.subplots_adjust(top=0.9)
    grid.savefig("../doc/figures/kaplan-meier/km-grid-large.svg",
                 transparent=True)

    combined = plt.figure(figsize=(15, 10))
    ax = combined.add_subplot(1, 1, 1)
    for kmf in kmfs:
        kmf.plot(ci_show=False, show_censors=True,
                 censor_styles={"marker": "|", "ms": 6}, ax=ax)

    ax.set_xlim(0, 15)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Total Incident Time (days)")
    ax.set_ylabel("Probability of Survival")
    ax.set_title("Kaplan-Meier Survival Curves", fontsize=25)
    ax.grid(True)
    combined.savefig("../doc/figures/kaplan-meier/km-combined-large.svg",
                     transparent=True)
    plt.show()