Example #1
0
File: km.py Project: xcodevn/SADP
def fun(epsilon):
    """Print the average relative error of a K-M curve fitted on noisy counts.

    Repeats 100 times: pass the module-level histogram ``his`` and the value
    ``sqrt(2) / epsilon`` to ``laplace_mechanism``, clamp the returned counts
    at zero, rebuild synthetic observations from them (censored records for
    the first ``bins0`` bins, observed events for the next ``bins1`` bins),
    fit a ``KaplanMeierFitter`` and compare its prediction on ``kmf.timeline``
    with the first column of ``true_value``.

    Relies on module-level names: ``his``, ``bins0``, ``bins1``,
    ``bin_edges0``, ``bin_edges1``, ``kmf``, ``true_value`` and
    ``laplace_mechanism``.

    Parameters:
    epsilon: divisor of ``sqrt(2)`` for the second argument of
        ``laplace_mechanism`` (presumably the privacy budget -- confirm).

    This function does not return anything; it prints ``(epsilon, avg)``.
    """
    errors = []
    reference = true_value[:, 0]
    for _ in range(100):
        noisy = laplace_mechanism(his, np.sqrt(2.0) / epsilon)

        # Sample counts must be non-negative integers: np.linspace / np.zeros /
        # np.ones reject float sizes on current numpy (older releases silently
        # truncated them, which int() reproduces).
        counts = [int(max(0.0, d)) for d in noisy]

        # Collect the per-bin chunks and concatenate once instead of growing
        # the arrays with np.append on every iteration.
        time_chunks = []
        event_chunks = []
        for i in range(bins0):
            time_chunks.append(
                np.linspace(bin_edges0[i], bin_edges0[i + 1], counts[i]))
            event_chunks.append(np.zeros(counts[i]))  # event flag 0
        for i in range(bins1):
            size = counts[bins0 + i]
            time_chunks.append(
                np.linspace(bin_edges1[i], bin_edges1[i + 1], size))
            event_chunks.append(np.ones(size))  # event flag 1

        ntime = np.concatenate(time_chunks) if time_chunks else np.asarray([])
        nevent = np.concatenate(event_chunks) if event_chunks else np.asarray([])

        kmf1 = KaplanMeierFitter()
        kmf1.fit(ntime, event_observed=nevent)
        out = kmf1.predict(kmf.timeline)

        # Relative L2 error against the reference curve.
        errors.append(np.linalg.norm(out - reference) / np.linalg.norm(reference))
    avg = np.average(errors)
    # Parenthesised form prints identically on Python 2 and Python 3.
    print("(%f, %f)" % (epsilon, avg))
Example #2
0
def kaplan_meier(out, t, ttype):
    """Fit a Kaplan-Meier estimator on durations ``t`` with event flags ``out``.

    The fitted curve is labelled ``"Rand<ttype>; <n> obs."`` where ``n`` is
    ``len(out)``.  Returns the fitted ``KaplanMeierFitter``.
    """
    fitter = KaplanMeierFitter()
    curve_label = "Rand%d; %d obs." % (ttype, len(out))
    fitter.fit(t, event_observed=out, label=curve_label)
    return fitter
def plot_Kaplan_Meier_feature(donor_dataset):
    '''Plot, for every feature column, two Kaplan-Meier curves split on the feature.

    For each column other than 'Total_years', 'censored' and 'Baseline', the
    donors are split into rows whose feature value is above the mean of that
    feature among rows with ``censored == 0`` and all remaining rows.  One
    figure with both curves is shown per feature.

    Parameters:
    donor_dataset: Pandas dataframe which contains at least the columns
        'Total_years' and 'censored'.  'Total_years' is used as the duration
        and 'censored' is passed to the fitter as ``event_observed``.  A
        'Baseline' column, when present, is skipped.

    Output:
    Kaplan-Meier plot(s).

    This function does not return anything.
    '''
    T = donor_dataset['Total_years']
    # NOTE(review): 'censored' is handed to fit() as event_observed; if
    # True means "still active" this flag is inverted -- confirm the encoding.
    C = donor_dataset['censored']

    # Filtering (instead of list.remove) tolerates a missing 'Baseline' column.
    skipped = ('Total_years', 'censored', 'Baseline')
    features = [column for column in donor_dataset.columns
                if column not in skipped]

    # The reference population is the same for every feature; select it once.
    uncensored = donor_dataset[C == 0]

    for feature in features:
        above_mean = donor_dataset[feature] > uncensored[feature].mean()
        fig = plt.figure(figsize=(5, 5))
        ax = fig.add_subplot(111)
        kmf = KaplanMeierFitter()
        kmf.fit(T[above_mean], C[above_mean], label=feature + ': Yes or > mean')
        kmf.plot(ax=ax, linewidth=2)
        kmf.fit(T[~above_mean], C[~above_mean], label=feature + ': No or < mean')
        kmf.plot(ax=ax, linewidth=2)
        ax.set_xlabel('Years', size=10)
        ax.set_ylabel('Surviving donor population', size=10)
        ax.set_xlim(0, 40)
        ax.set_ylim(0, 1)
        ax.grid()
        ax.legend(loc='upper right', fontsize=10)
        plt.show()
Example #4
0
def survival_analysis(dataframe, grouping, years=5):
    """Plot one Kaplan-Meier curve per value of ``grouping``, truncated at ``years``.

    Rows with a null in ``grouping``, '_OS' or '_EVENT' are dropped.  Durations
    above ``years * 365`` are capped at that value and their event flag is set
    to 0; all other rows keep their '_OS' / '_EVENT' values.

    Parameters:
    dataframe: Pandas dataframe with the columns '_OS', '_EVENT' and ``grouping``.
    grouping: name of the column whose values define the groups.
    years: follow-up limit; multiplied by 365 to obtain the cap on '_OS'.

    This function does not return anything; it shows the plot.
    """
    # dropna returns a new frame, so the caller's dataframe is left untouched.
    df2 = dataframe.dropna(subset=[grouping, '_OS', '_EVENT'])

    # Limit the analysis to the requested follow-up.  Series.where replaces the
    # chained-indexing assignments (df2['col'][mask] = ...), which pandas does
    # not guarantee to write through to df2.
    maxtime = years * 365
    within_limit = df2['_OS'] <= maxtime
    df2['survival'] = df2['_OS'].astype(float).where(within_limit, float(maxtime))
    df2['event'] = df2['_EVENT'].astype(float).where(within_limit, 0.0)

    # Groups are plotted in sorted key order.
    grouped_data = df2.groupby(grouping)
    unique_groups = sorted(grouped_data.groups.keys())

    kmf = KaplanMeierFitter()
    ax = plt.subplot(111)
    for group in unique_groups:
        data = grouped_data.get_group(group)
        kmf.fit(data['survival'], data['event'], label=group)
        kmf.plot(ax=ax, show_censors=True)

    plt.show()
Example #5
0
	def __KM_analysis(self,duration_table,expressed_array,unexpressed_array,freq_set):
		"""Log-rank test between two sample sets; plot both K-M curves if p < .0006.

		Every row of ``duration_table`` except the first is assigned by its
		first field to the "unexpressed" or the "expressed" set (checked in that
		order).  Rows whose second or third field is the string "NA" are not
		used.  The second field is read as a float duration and the third as an
		int event flag.  ``freq_set`` is only used in the plot title.

		Returns the p-value of the log-rank test.
		"""
		satisfied_times, satisfied_events = [], []
		other_times, other_events = [], []
		for position, record in enumerate(duration_table):
			if position == 0:
				# The first row is never used (presumably a header -- confirm).
				continue
			if record[0] in unexpressed_array and record[1] != "NA" and record[2] != "NA":
				times, events = other_times, other_events
			elif record[0] in expressed_array and record[1] != "NA" and record[2] != "NA":
				times, events = satisfied_times, satisfied_events
			else:
				continue
			times.append(float(record[1]))
			events.append(int(record[2]))

		results = logrank_test(satisfied_times, other_times, satisfied_events, other_events, alpha=.95 )
		significant = results.p_value < .0006
		if significant:
			ax = plt.subplot(111)
			kmf = KaplanMeierFitter()
			curves = (
				(satisfied_times, satisfied_events, "Satisfying"),
				(other_times, other_events, "None-Satisfying"),
			)
			for times, events, curve_label in curves:
				kmf.fit(times, event_observed=events, label=curve_label)
				kmf.plot(ax=ax, ci_force_lines=False)
			plt.ylim(0,1)
			plt.title("Lifespans ("+str(freq_set)+")")
			plt.show()
		return results.p_value
Example #6
0
def plot_survival_curves(rec_t, rec_e, antirec_t, antirec_e, experiment_name = '', output_file = None):
    """Plot two Kaplan-Meier curves and annotate them with a log-rank p-value.

    Parameters:
    rec_t, rec_e: durations and event flags of the "Recommendation" group.
    antirec_t, antirec_e: durations and event flags of the
        "Anti-Recommendation" group.
    experiment_name: prefix for both legend labels.
    output_file: when truthy, the figure is saved there via ``pylab.savefig``.

    This function does not return anything; the log-rank summary is printed.
    """
    # Set-up plots
    plt.figure(figsize=(12, 3))
    ax = plt.subplot(111)

    # Fit survival curves
    kmf = KaplanMeierFitter()
    kmf.fit(rec_t, event_observed=rec_e, label=' '.join([experiment_name, "Recommendation"]))
    kmf.plot(ax=ax, linestyle="-")
    kmf.fit(antirec_t, event_observed=antirec_e, label=' '.join([experiment_name, "Anti-Recommendation"]))
    kmf.plot(ax=ax, linestyle="--")

    # Format graph
    plt.ylim(0, 1)
    ax.set_xlabel('Timeline (months)', fontsize='large')
    ax.set_ylabel('Percentage of Population Alive', fontsize='large')

    # Calculate p-value
    results = logrank_test(rec_t, antirec_t, rec_e, antirec_e, alpha=.95)
    results.print_summary()

    # Place the annotation one ninth of the way along the longest duration.
    xloc = max(np.max(rec_t), np.max(antirec_t)) / 9
    if results.p_value < 1e-5:
        # Raw string: "\m" is not a valid escape sequence in a normal literal.
        ax.text(xloc, .2, r'$p < 1\mathrm{e}{-5}$', fontsize=20)
    else:
        ax.text(xloc, .2, '$p=%f$' % results.p_value, fontsize=20)
    plt.legend(loc='best', prop={'size': 15})

    if output_file:
        plt.tight_layout()
        pylab.savefig(output_file)
def kmplot(df_high, df_low):
    """Compare a "high" and a "low" group with Kaplan-Meier fits and a log-rank test.

    Both arguments must expose ``duration`` and ``event`` attributes.  If
    either fit raises ``ValueError`` the placeholder tuple
    ``("NA", "0", "0", "0", "0")`` is returned.  Otherwise the result is
    ``(p_value, hm5, hm10, lm5, lm10)``: the log-rank p-value followed by each
    fitter's prediction at t=60 and t=120, high group first.
    """
    high_fitter, low_fitter = KaplanMeierFitter(), KaplanMeierFitter()
    try:
        high_fitter.fit(
            durations=df_high.duration,
            event_observed=df_high.event,
            label='High: n = ' + str(len(df_high)),
        )
        low_fitter.fit(
            durations=df_low.duration,
            event_observed=df_low.event,
            label="Low: n = " + str(len(df_low)),
        )
    except ValueError:
        return ("NA", "0", "0", "0", "0")

    p_value = logrank_test(
        df_high.duration,
        df_low.duration,
        event_observed_A=df_high.event,
        event_observed_B=df_low.event,
    ).p_value

    # Evaluation points 60 and 120 (presumably months, i.e. 5 and 10 years).
    predictions = [
        fitter.predict(point)
        for fitter in (high_fitter, low_fitter)
        for point in (60, 120)
    ]
    return (p_value,) + tuple(predictions)
Example #8
0
def surAnalysis(storeId):
    """Fit and save a Kaplan-Meier plot for the records of one store.

    Reads every document of the module-level ``survival`` collection whose
    'store_id' equals ``storeId``, divides each 'duration' by 86400
    (presumably seconds to days -- confirm) and fits the durations against the
    'observed' flags.  The figure is saved under the hard-coded image
    directory using ``storeId`` as the file name.  Nothing is done when no
    record matches.

    This function does not return anything.
    """
    duration = []
    observed = []
    for elem in survival.find({'store_id': storeId}):
        duration.append(elem['duration'] / 86400)
        observed.append(elem['observed'])
    if not duration:
        return

    kmf = KaplanMeierFitter()
    kmf.fit(array(duration), array(observed))
    ax = kmf.plot()
    figure = ax.get_figure()
    # Backslashes are escaped explicitly; the resulting path is unchanged but
    # no longer relies on invalid escape sequences (\w, \l, \s, \i).
    figure.savefig('F:\\workshop\\lbs_lyf\\static\\images\\' + storeId)
    plt.close(figure)
Example #9
0
def generate_plot():  # Perhaps `regenerate_plot`?
    """Refit the Kaplan-Meier curve on the rows selected by the widgets.

    Filters a copy of the module-level ``df`` by the active categories, the
    size and age ranges and the active sexes, then pushes the fitted survival
    function into ``renderer``'s data source, resets the x-range of ``plot``
    and reports the number of matching rows in ``status``.
    """
    selection = df.copy()

    # Drop every category whose index is not among the active ones.
    for position in range(len(categories)):
        if position in category_select.active:
            continue
        selection = selection[selection.category != category_select.labels[position]]

    # Inclusive size range, then inclusive age range.
    selection = selection[selection['size'] >= min_size_select.value]
    selection = selection[selection['size'] <= max_size_select.value]
    selection = selection[selection.age >= min_age_select.value]
    selection = selection[selection.age <= max_age_select.value]

    # Widget index 0 is male (sex == 1), index 1 is female (sex == 2).
    for widget_index, sex_code in ((0, 1), (1, 2)):
        if widget_index not in sex_select.active:
            selection = selection[selection.sex != sex_code]

    if not len(selection):  # Bad constraints
        status.text = 'No cases found. Try different constraints.'
        return

    # The event of interest is "did not survive".
    died = [not survived for survived in selection.survived]

    fitted = KaplanMeierFitter().fit(selection.days, event_observed=died,
                                     label='prob_of_surv')

    source_data = renderer.data_source.data
    curve = fitted.survival_function_
    source_data.update(x=curve.index, y=curve.prob_of_surv)

    # bounds='auto' doesn't work?
    lower, upper = 0, max(selection.days)
    plot.x_range.update(start=lower, end=upper, bounds=(lower, upper))
    status.text = '{} cases found.'.format(len(selection))
def plot_Kaplan_Meier_overall(donor_dataset):
    '''Plot a single Kaplan-Meier curve, labelled 'Overall', for all donors.

    Parameters:
    donor_dataset: Pandas dataframe which contains at least the columns
        'Total_years' and 'censored'.  'Total_years' is used as the duration
        and 'censored' is passed to the fitter as ``event_observed``.

    Output:
    A Kaplan-Meier plot.

    This function does not return anything.
    '''
    # NOTE(review): 'censored' is handed to fit() as event_observed; if
    # True means "still active" this flag is inverted -- confirm the encoding.
    fitter = KaplanMeierFitter()
    fitter.fit(donor_dataset['Total_years'], donor_dataset['censored'],
               label='Overall')

    # Draw the curve on a fresh 5x5 figure with fixed axis limits.
    text_size = 20
    figure = plt.figure(figsize=(5, 5))
    axis = figure.add_subplot(111)
    fitter.plot(ax=axis)
    axis.set_xlabel('Years', size=text_size)
    axis.set_ylabel('Surviving donor population', size=text_size)
    axis.set_xlim(0, 40)
    axis.set_ylim(0, 1)
    axis.grid()
    axis.legend(loc='best', fontsize=text_size)
    plt.show()
Example #11
0
def get_sa(request):
    """Render 'sa_test.html', generating the two survival plots when missing.

    The Kaplan-Meier and Nelson-Aalen estimates of the Waltons dataset are
    fitted and saved as '/images/test1.jpg' and '/images/test2.jpg' below the
    parent directory of this file's directory.  The template context carries
    both relative paths under the keys 'kmf' and 'naf'.

    Parameters:
    request: the request object forwarded to ``render``.

    Returns whatever ``render`` returns.
    """
    dirname = os.path.dirname(os.path.dirname(__file__)).replace('\\', '/')
    kmffile = '/images/test1.jpg'
    naffile = '/images/test2.jpg'
    context = {'kmf': kmffile, 'naf': naffile}

    # Regenerate when EITHER image is missing; the previous "both missing"
    # test left a lone missing image unrepaired.
    if not (os.path.exists(dirname + kmffile) and os.path.exists(dirname + naffile)):
        df = load_waltons()
        T = df['T']  # an array of durations
        E = df['E']  # boolean or binary array: whether the 'death' was observed (otherwise censored)
        kmf = KaplanMeierFitter(alpha=0.95)
        kmf.fit(durations=T, event_observed=E, timeline=None, entry=None, label='KM_estimate', alpha=None, left_censorship=False, ci_labels=None)

        naf = NelsonAalenFitter(alpha=0.95, nelson_aalen_smoothing=True)
        naf.fit(durations=T, event_observed=E, timeline=None, entry=None, label='NA_estimate', alpha=None, ci_labels=None)

        # Each estimate gets its own figure: plotting both on the implicit
        # current figure made the second image contain both curves.
        for fitter, target in ((kmf, kmffile), (naf, naffile)):
            fig = plt.figure()
            fitter.plot(ax=fig.add_subplot(111))
            fig.savefig(dirname + target)
            plt.close(fig)

    return render(request=request, template_name='sa_test.html', context=context)
Example #12
0
def survival(time, status, pGroups=None):
    """Plot Kaplan-Meier curves, either overall or per group.

    Parameters:
    time: indexable sequence of durations; entries equal to "" are skipped.
    status: indexable sequence of event flags; entries equal to "" are skipped.
    pGroups: optional sequence of groups.  Each group is indexed as
        ``[0]`` curve label, ``[1]`` plot colour, ``[2]`` iterable of indices
        into ``time`` / ``status``.

    Without ``pGroups`` a single red curve is drawn from the entries at index
    2 onwards (indices 0 and 1 are never read -- presumably header cells,
    confirm).  With ``pGroups`` one curve is drawn per non-empty group and the
    multivariate log-rank p-value is added as a legend title.

    Returns the axes used, or ``None`` when every group is empty.
    """
    kmf = KaplanMeierFitter()
    if pGroups is None:
        order = [i for i in range(2, len(time))
                 if time[i] != "" and status[i] != ""]
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        kmf.fit(t, s)
        return kmf.plot(color='red')

    ax = None
    groups = ["" for _ in time]  # group index per sample, "" = unassigned
    for k, group in enumerate(pGroups):
        order = [i for i in group[2]
                 if time[i] != "" and status[i] != ""]
        if not order:
            continue
        for i in order:
            groups[i] = k
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        kmf.fit(t, s, label=group[0])
        # The first curve creates the axes; later curves reuse them.
        if ax is None:
            ax = kmf.plot(color=group[1], ci_show=False, show_censors=True)
        else:
            ax = kmf.plot(ax=ax, color=group[1], ci_show=False, show_censors=True)

    order = [i for i in range(len(groups)) if groups[i] != ""]
    if order:
        t = [float(time[i]) for i in order]
        s = [int(status[i]) for i in order]
        g = [int(groups[i]) for i in order]
        from lifelines.statistics import multivariate_logrank_test
        from matplotlib.legend import Legend
        res = multivariate_logrank_test(t, g, s)
        # An empty legend is used only to display the p-value as its title.
        leg = Legend(ax, [], [], title="p = %.2g" % res.p_value,
                     loc='lower left', frameon=False)
        ax.add_artist(leg)
    return ax
Example #13
0
import pandas as pd
from lifelines import KaplanMeierFitter
from matplotlib import pyplot as plt

# Toy dataset: listing lifetime in days and whether the listing was sold.
time_day_life = [150, 130, 300, 100, 80, 60, 270, 150, 82, 50]
tag_sale = [1, 0, 0, 1, 0, 1, 0, 1, 0, 1]

df = pd.DataFrame({'time_day_life': time_day_life, 'tag_sale': tag_sale})
# sort_values returns a new frame; assign it so the sort actually takes effect.
df = df.sort_values('time_day_life', ascending=True)

print('Descriptive')
print(df.time_day_life.mean())
print('')
print(df.groupby('tag_sale').agg('mean'))
# We observe that the mean lifetime of sold and unsold leads is not
# comparable with the overall average.

# Compute the Kaplan-Meier statistic
# Kaplan-Meier curve
kmf = KaplanMeierFitter()
kmf.fit(durations=df.time_day_life, event_observed=df.tag_sale)
# Bare expression: displays the table in a notebook, no effect in a script.
kmf.survival_function_

# Plot survival analysis
kmf.plot(label='Kaplan-Meier',
         figsize=(12, 12),
         show_censors=True,
         at_risk_counts=True)
plt.xlabel('tiempo de vida inmueble  en dias', size=15)
plt.ylabel('Sobrevida - $P(T>t)$', size=15)
Example #14
0
def qq_plot(model, **plot_kwargs):
    """
    Draw a quantile-quantile scatter of the Kaplan-Meier (empirical)
    quantiles against the quantiles of the fitted parametric distribution,
    together with the reference line y=x.

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    plot_kwargs:
        Passed to ``set_kwargs_ax``; only the resulting ``"ax"`` entry is used.

    Returns
    --------
    ax: axis object

    Raises
    -------
    NotImplementedError
        If the model was fitted with interval censoring.

    Examples
    ---------

    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)


    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    set_kwargs_ax(plot_kwargs)
    ax = plot_kwargs.pop("ax")

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    empirical_col = "empirical quantiles"
    fitted_col = "fitted %s quantiles" % dist

    # Non-parametric reference fitted with the same censoring type as the model.
    if CensoringType.is_left_censoring(model):
        empirical = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, label=empirical_col
        )
    elif CensoringType.is_right_censoring(model):
        empirical = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, label=empirical_col
        )
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError()

    probabilities = np.unique(empirical.cumulative_density_.values[:, 0])
    table = qth_survival_times(probabilities, empirical.cumulative_density_, cdf=True)
    table[fitted_col] = dist_object.ppf(probabilities)
    # Rows with infinite or zero quantiles are discarded before plotting.
    table = table.replace([-np.inf, 0, np.inf], np.nan).dropna()

    upper = table[empirical_col].max()
    lower = table[empirical_col].min()

    table.plot.scatter(
        fitted_col, empirical_col, c="none", edgecolor="k", lw=0.5, ax=ax
    )
    diagonal = [lower, upper]
    ax.plot(diagonal, diagonal, c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)

    return ax
Example #15
0
def CoxRegressionModel(stimes, N, ap_s, ap_s2, ap_s3, ap_s5, ebb_, eap_, ebs,
                       aps_0, mu, stime, Np):
    """Fit a Cox PH model on the survival times and plot it with a K-M curve.

    An observation counts as an event (E=1) unless its time exceeds 62831, in
    which case it is treated as censored (E=0).  The Cox model is fitted with
    the formula ``"aps + I(aps**3)"`` and its partial effects and baseline
    survival are drawn on a log-x axis together with the Kaplan-Meier curve of
    the data returned by ``PlottingLL.PlottingLL(ebs, aps_0, mu, stime, Np)``.

    Parameters (descriptions taken from the original author's notes):
    stimes: survival times (1050 values: 5 eb values x 7 ap values x 30 times).
    N: number of observations (1050).
    ap_s, ap_s2, ap_s3, ap_s5: covariate arrays stored as columns
        'aps', 'aps2', 'aps3' and 'aps5'.
    ebb_, eap_: covariate arrays stored as columns 'eb' and 'eap'.
    ebs: single value, one of 0, 0.175, 0.35, 0.525, 0.7.
    aps_0, mu: values forwarded to ``PlottingLL`` and shown in the title.
    stime: the 30 survival times forwarded to ``PlottingLL``.
    Np: number of values in ``stime`` (30).

    This function does not return anything.
    """
    # Event indicator, vectorised.  "not greater than" (rather than "<=")
    # keeps the original treatment of NaN times as events.
    times = np.asarray(stimes)
    E = np.zeros(N, dtype=int)
    E[:len(times)] = ~(times > 62831)

    df = pd.DataFrame(data={
        'T': stimes,
        'E': E,
        'aps': ap_s,
        'aps2': ap_s2,
        'aps3': ap_s3,
        'aps5': ap_s5,
        'eap': eap_,
        'eb': ebb_
    })

    fig, axes = plt.subplots()
    axes.set_xscale('log')
    axes.set_ylabel("S(t)")

    # Kaplan-Meier reference curve.
    KT, KE, Kdf = PlottingLL.PlottingLL(ebs, aps_0, mu, stime, Np)
    kmf = KaplanMeierFitter().fit(KT, KE, label='KaplanMeierFitter')
    kmf.plot_survival_function(ax=axes)

    cph = CoxPHFitter()
    cph.fit(df, duration_col='T', event_col='E', formula="aps + I(aps**3)")
    cph.plot_partial_effects_on_outcome(plot_baseline=False,
                                        ax=axes,
                                        cmap="coolwarm")

    # The baseline colour used to depend on a loop variable leaked from the
    # event loop (a NameError for empty input); the same index is now
    # computed explicitly.
    baseline_color = f"C{max(len(times) - 1, 0)}"
    cph.baseline_survival_.plot(ax=axes, ls=":", color=baseline_color)

    plt.title('Formula(aps vs. aps3 (1050 values)): (eb,ap,mu)={}'.format(
        (round(ebs, 3), round(aps_0, 3), mu)),
              fontsize=12)
Example #16
0
import pandas as pd
from lifelines import KaplanMeierFitter
from lifelines.utils import datetimes_to_durations
from matplotlib import pyplot as plt

# Load the durations and divide by 86400 (presumably seconds -> days; confirm).
data = pd.read_csv('durations.csv')
data['Duration'] = data['Duration'] / (60 * 60 * 24)
# Keep only durations below 2500.
data = data[data['Duration'] < 2500]

company1 = (data['Company'] == 'sulake')
company2 = (data['Company'] == 'paypal')
company3 = (data['Company'] == 'alibaba')

kmf = KaplanMeierFitter()

# One curve per company on shared axes.  Each fit gets an explicit label so
# the three curves can be told apart in the legend.
ax = None
for company, mask in (('sulake', company1),
                      ('paypal', company2),
                      ('alibaba', company3)):
    kmf.fit(data[mask]['Duration'], data[mask]['Observed'], label=company)
    ax = kmf.plot() if ax is None else kmf.plot(ax=ax)

plt.show()

Example #17
0
# Collect duration, event flag and gender code of every document in
# ``after_users``.  Durations are divided by 86400 (presumably seconds to
# days -- confirm).
duration = []
observed = []
group = []

for elem in after_users.find():
    duration.append(elem['duration']/86400)
    observed.append(elem['observed'])
    group.append(elem['gender'])
dura_obj = array(duration)
obs_obj = array(observed)
group_obj = array(group)
# The two DataFrame objects that used to be built here were discarded
# immediately and have been removed.
male = group_obj == 1
female = group_obj == 2
other = group_obj == 0

# Male, female and overall curves share one set of axes.
kmf = KaplanMeierFitter()
kmf.fit(dura_obj[male], obs_obj[male], label='male')
ax = kmf.plot()
kmf.fit(dura_obj[female], obs_obj[female], label='female')
kmf.plot(ax=ax)
kmf.fit(dura_obj, obs_obj, label='both')
kmf.plot(ax=ax)
ax.get_figure().savefig('maleAndFemale_both_17day')
Example #18
0
    def __init__(self, db, male=False, female=False, other=False, both=True):
        """Read every record from ``db`` and save a Kaplan-Meier plot.

        The overall curve (label 'both') is always drawn; the 'male' (gender
        code 1), 'female' (code 2) and 'other' (code 0) curves are added when
        the matching flag ``is True``.  The figure is saved as 'maleAndFemale'.

        Parameters:
        db: collection-like object whose ``find()`` yields records with the
            keys 'duration', 'observed' and 'gender'.
        male, female, other: whether to add the corresponding subgroup curve.
        both: stored on the instance.
            NOTE(review): not consulted when plotting -- the overall curve is
            drawn regardless; confirm whether that is intended.
        """
        self.db = db
        self.male = male
        self.female = female
        self.other = other
        self.both = both

        duration = []
        observed = []
        group = []
        for elem in self.db.find():
            # Divided by 86400 (presumably seconds to days -- confirm).
            duration.append(elem['duration'] / 86400)
            observed.append(elem['observed'])
            group.append(elem['gender'])
        dura_obj = array(duration)
        obs_obj = array(observed)
        group_obj = array(group)
        # The DataFrame objects previously built here were never used.

        kmf = KaplanMeierFitter()
        kmf.fit(dura_obj, obs_obj, label='both')
        ax = kmf.plot()

        # (flag, gender code, legend label) for each optional subgroup curve.
        subgroups = (
            (self.male, 1, 'male'),
            (self.female, 2, 'female'),
            (self.other, 0, 'other'),
        )
        for enabled, code, label in subgroups:
            if enabled is True:
                mask = group_obj == code
                kmf.fit(dura_obj[mask], obs_obj[mask], label=label)
                kmf.plot(ax=ax)
        ax.get_figure().savefig('maleAndFemale')
Example #19
0
    def data_fit(self):
        """Build per-user subscription durations and save a Kaplan-Meier plot.

        For every distinct 'FromUserName' with a 'subscribe' event:

        * the subscription time is the user's earliest event 'CreateTime';
        * if the user has an 'unsubscribe' event, the end time is the user's
          latest event 'CreateTime' and the record is observed (1); otherwise
          the end time is "now" and the record is censored (0);
        * the gender code is the 'sex' field of the matching ``hyd_users``
          document, or 0 when no such document exists.

        Durations are converted to whole multiples of 86400 (presumably
        seconds to days -- confirm), fitted once over all users and the plot
        is saved as 'maleAndFemale'.  Nothing is plotted when there are no
        users.

        This method does not return anything.
        """
        self.hyd_events.create_index('FromUserName')
        self.hyd_events.create_index('Event')
        self.hyd_users.create_index('openid')
        user_list = list(set(
            elem['FromUserName']
            for elem in self.hyd_events.find({'Event': 'subscribe'})
        ))
        print(len(user_list))
        now_time = time.time()

        duration = []
        observed = []
        group = []
        for elem in user_list:
            # Timestamps are collected per user.  The list used to be shared
            # across iterations, so min/max leaked earlier users' events into
            # later users' durations.
            time_block = [item['CreateTime']
                          for item in self.hyd_events.find({'FromUserName': elem})]
            sub_time = int(min(time_block))
            curt = self.hyd_events.find_one(
                {'$and': [{'FromUserName': elem}, {'Event': 'unsubscribe'}]})
            if curt is None:
                unsub_time = int(now_time)
                was_observed = 0
            else:
                unsub_time = int(max(time_block))
                was_observed = 1

            check = self.hyd_users.find_one({'openid': elem})
            # if gender exists, set it; if not, gender=0 means gender unknown
            try:
                gender = check['sex']
            except TypeError:
                gender = 0

            # Floor division keeps the original Python 2 integer semantics.
            duration.append(abs(unsub_time - sub_time) // 86400)
            observed.append(was_observed)
            group.append(gender)

        if not duration:
            return

        # Fit and save once, after all users have been collected, instead of
        # refitting and overwriting the image on every iteration.
        kmf = KaplanMeierFitter()
        kmf.fit(array(duration), array(observed), label='both')
        ax = kmf.plot()
        ax.get_figure().savefig('maleAndFemale')
0
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Draw a quantile-quantile scatter of the Kaplan-Meier (empirical)
    quantiles against the quantiles of the fitted parametric distribution,
    together with the reference line y=x. Large deviations from that line
    can invalidate a model (some deviation in the tails is expected).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax:
        Axes to draw on; ``plt.gca()`` is used when omitted.
    plot_kwargs:
        Accepted, but not forwarded to any plotting call in this function.

    Returns
    --------
    ax:
        The axes which was used.

    Examples
    ---------
    .. code:: python

        from lifelines import *
        from lifelines.plotting import qq_plot
        from lifelines.datasets import load_rossi
        df = load_rossi()
        wf = WeibullFitter().fit(df['week'], df['arrest'])
        qq_plot(wf)

    Notes
    ------
    The interval censoring case uses the mean between the upper and lower bounds.

    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    empirical_col = "empirical quantiles"
    fitted_col = "fitted %s quantiles" % dist

    # Non-parametric reference fitted with the same censoring type as the model.
    if CensoringType.is_left_censoring(model):
        empirical = KaplanMeierFitter().fit_left_censoring(
            model.durations,
            model.event_observed,
            label=empirical_col,
            weights=model.weights,
            entry=model.entry,
        )
        sf = empirical.survival_function_[empirical_col]
        cdf = empirical.cumulative_density_[empirical_col]
    elif CensoringType.is_right_censoring(model):
        empirical = KaplanMeierFitter().fit_right_censoring(
            model.durations,
            model.event_observed,
            label=empirical_col,
            weights=model.weights,
            entry=model.entry,
        )
        sf = empirical.survival_function_[empirical_col]
        cdf = empirical.cumulative_density_[empirical_col]
    elif CensoringType.is_interval_censoring(model):
        empirical = KaplanMeierFitter().fit_interval_censoring(
            model.lower_bound,
            model.upper_bound,
            label=empirical_col,
            weights=model.weights,
            entry=model.entry,
        )
        # Row-wise mean of the survival columns; lower column of the CDF.
        sf = empirical.survival_function_.mean(1)
        cdf = empirical.cumulative_density_[empirical_col + "_lower"]

    probabilities = np.unique(cdf.values)

    table = qth_survival_times(1 - probabilities, sf)
    table[fitted_col] = dist_object.ppf(probabilities)
    # Rows with infinite or zero quantiles are discarded before plotting.
    table = table.replace([-np.inf, 0, np.inf], np.nan).dropna()

    upper = table[empirical_col].max()
    lower = table[empirical_col].min()

    table.plot.scatter(
        fitted_col, empirical_col, c="none", edgecolor="k", lw=0.5, ax=ax
    )
    diagonal = [lower, upper]
    ax.plot(diagonal, diagonal, c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)

    return ax
Example #21
0
    for row in df['vital_status']:
        if row not in ['Alive', 'Dead']:
            vital_status.append(None)
        else:
            vital_status.append(row)
    df['vital_status'] = vital_status
    df['SARS'] = df['SARS'].dropna()
    df = df[pd.notnull(df['duration'])]
    df = df[pd.notnull(df['SARS'])]
    df = df[pd.notnull(df['vital_status'])]
    lst = df['SARS'].tolist()
    q1 = np.percentile(lst, 33.33)
    q2 = np.percentile(lst, 66.66)
    df1 = df[df['SARS'] <= q1]
    df2 = df[(df['SARS'] > q1) & (df['SARS'] <= q2)]
    df3 = df[df['SARS'] > q2]
    plot_km(df, ax, '', file, "q1")
    ax.get_figure().savefig(result_dir + file + '_kmplot(samples=' +
                            str(len(df.index)) + ').png')


if __name__ == '__main__':
    # Process every file found below src_dir, one plot per file.
    for (dirpaths, dirnames, filenames) in os.walk(src_dir):
        for file in filenames:
            # NOTE(review): `kmf` is not used in this loop; it is kept because
            # helper functions may read it as a module-level global -- confirm.
            kmf = KaplanMeierFitter()
            ax = plt.subplot(111)
            print(file)
            # Join with the directory os.walk is currently visiting so that
            # nested files (and a src_dir without trailing separator) resolve.
            df = pd.read_table(os.path.join(dirpaths, file), sep=',', header=1)
            process_df(df, file, ax)
            # Clear the figure before the next file is plotted.
            plt.clf()
Example #22
0
    def test_kmf_with_inverted_axis(self, block, kmf):
        """Plot two exponential samples with ``invert_y_axis=True`` on shared axes.

        The first curve ("t2") also requests at-risk counts; the second curve
        ("t1", drawn with the first positional argument of
        ``np.random.exponential`` set to 3) is drawn on the first curve's axes.
        The incoming ``kmf`` argument is not used.
        """
        first_sample = np.random.exponential(size=100)
        first_fitter = KaplanMeierFitter()
        first_fitter.fit(first_sample, label="t2")
        shared_ax = first_fitter.plot(invert_y_axis=True, at_risk_counts=True)

        second_sample = np.random.exponential(3, size=100)
        second_fitter = KaplanMeierFitter()
        second_fitter.fit(second_sample, label="t1")
        second_fitter.plot(invert_y_axis=True, ax=shared_ax, ci_force_lines=False)

        self.plt.title("test_kmf_with_inverted_axis")
        self.plt.show(block=block)
# Import library 
from lifelines.datasets import load_waltons

# Load the Waltons example data frame
df = load_waltons()

# Print the first rows of the data frame
print (df.head())

# Get separate series for time (T) and event (E)
T = df['T']
E = df['E']


from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E) # more succinctly, kmf.fit(T,E)


# Bare expressions: they display their value in a notebook/REPL only.
kmf.survival_function_
kmf.median_
kmf.plot()




# Multiple groups: boolean mask selecting the rows of group 'miR-137'
groups = df['group']
ix = (groups == 'miR-137')

# Refit on the rows NOT in 'miR-137', labelled 'control'
kmf.fit(T[~ix], E[~ix], label='control')
#Griffin Calme
#Group 15, week 8 activity
#Kaplan Meier survival curve

import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
kmf = KaplanMeierFitter()

# DataFrame.from_csv was removed from pandas; this read_csv call reproduces
# its defaults (first column as index, dates parsed).
df = pd.read_csv('wk8gp15KapMeier.csv', index_col=0, parse_dates=True)

print(df)

# Boolean mask selecting the rows of group 2.
groups = df['Group']
ix = (groups == 2)

T = df['SERIAL TIME (years)']
E = df['STATUS']

# Curve for the rows outside group 2, labelled '1'.
kmf.fit(T[~ix], E[~ix], label='1')
ax = kmf.plot()

# Curve for group 2 on the same axes.
kmf.fit(T[ix], E[ix], label='2')
kmf.plot(ax=ax, ci_force_lines=False)

plt.show()
Example #25
0
File: km.py Project: xcodevn/SADP
# Parenthesised single-argument print behaves identically on Python 2 and 3.
print("[***] K-M Estimator")

EPS_LIST = [0.05,0.1,0.2,0.4,0.8,1.6]

bins0 = config.BIN0
bins1 = config.BIN1

df = pd.read_stata("wichert.dta")
# Times are scaled by the maximum time.  The pairs are materialised in a list
# because len() is taken below and zip() is a lazy iterator on Python 3.
data_ = list(zip(df.time/max(df.time), df.event.astype(int)))
# Keep only the records whose scaled time is at least config.GAMMA.
data  = [(a, b) for (a,b) in data_ if a >= config.GAMMA]

print("[*] Remove #%d outliers" % (len(data_) - len(data)))
N  = len(df) # number of rows in the loaded frame (before filtering)

# Reference Kaplan-Meier fit on the filtered data.
kmf = KaplanMeierFitter()
(T, E) = zip(*data)
kmf.fit(T, event_observed=E)

# Survival-function values of the reference fit.
true_value =  kmf.survival_function_.values
Example #26
0
def multivariate_logrank_test(
        event_durations,
        groups,
        event_observed=None,
        t_0=-1,
        weightings=None,
        **kwargs) -> StatisticalResult:  # pylint: disable=too-many-locals
    r"""
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
    be equal when n=2):

    .. math::
        \begin{align}
         & H_0: h_1(t) = h_2(t) = h_3(t) = ... = h_n(t) \\
         & H_A: \text{there exists at least one group that differs from the others.}
        \end{align}


    Parameters
    ----------

    event_durations: iterable
        a (n,) list-like representing the (possibly partial) durations of all individuals

    groups: iterable
        a (n,) list-like of group labels, one per individual.

    event_observed: iterable, optional
        a (n,) list-like of event_observed events: 1 if observed death, 0 if censored. Defaults to all observed.

    t_0: float, optional (default=-1)
        the period under observation, -1 for all time.

    weightings: str, optional
        apply a weighted logrank test: options are "wilcoxon" for Wilcoxon (also known as Breslow), "tarone-ware"
        for Tarone-Ware, "peto" for Peto test and "fleming-harrington" for Fleming-Harrington test.
        These are useful for testing for early or late differences in the survival curve. For the Fleming-Harrington
        test, keyword arguments p and q must also be provided with non-negative values.

        Weightings are applied at the ith ordered failure time, :math:`t_{i}`, according to:
            Wilcoxon: :math:`n_i`
            Tarone-Ware: :math:`\sqrt{n_i}`
            Peto: :math:`\bar{S}(t_i)`
            Fleming-Harrington: :math:`\hat{S}(t_i)^p \times (1 - \hat{S}(t_i))^q`

            where :math:`n_i` is the number at risk just prior to time :math:`t_{i}`, :math:`\bar{S}(t_i)` is
            Peto-Peto's modified survival estimate and :math:`\hat{S}(t_i)` is the left-continuous
            Kaplan-Meier survival estimate at time :math:`t_{i}`.

    kwargs:
        add keywords and meta-data to the experiment summary. ``test_name`` overrides the default
        name; ``p`` and ``q`` are read for the Fleming-Harrington weighting. All of them are forwarded
        to the returned ``StatisticalResult``.


    Returns
    -------

    StatisticalResult
       a StatisticalResult object with properties ``p_value``, ``summary``, ``test_statistic``, ``print_summary``

    Raises
    ------

    ValueError
        if ``weightings`` is not one of the supported names, or if ``p`` / ``q`` is missing or negative
        for the Fleming-Harrington weighting.

    Examples
    --------

    .. code:: python

        df = pd.DataFrame({
           'durations': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
           'events': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
           'groups': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        })
        result = multivariate_logrank_test(df['durations'], df['groups'], df['events'])
        result.test_statistic
        result.p_value
        result.print_summary()


        # numpy example
        G = [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        T = [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7]
        E = [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]
        result = multivariate_logrank_test(T, G, E)
        result.test_statistic


    See Also
    --------
    pairwise_logrank_test
    logrank_test
    """
    # Default name; the weighted variants below rewrite the "logrank" part.
    kwargs.setdefault("test_name", "multivariate_logrank_test")

    event_durations, groups = np.asarray(event_durations), np.asarray(groups)
    if event_observed is None:
        # No indicator supplied: treat every duration as an observed event.
        event_observed = np.ones((event_durations.shape[0], 1))
    else:
        event_observed = np.asarray(event_observed)

    # n comes from event_durations, so the first comparison below is always
    # true; only event_observed is really checked here. A mismatching `groups`
    # only fails in the reshape(n) that follows.
    n = np.max(event_durations.shape)
    assert n == np.max(event_durations.shape) == np.max(
        event_observed.shape), "inputs must be of the same length."
    # Flatten each input to a 1-D Series of length n.
    groups, event_durations, event_observed = map(
        lambda x: pd.Series(np.asarray(x).reshape(n)),
        [groups, event_durations, event_observed])

    # NOTE(review): presumably rm / obs are per-time tables of removed / observed
    # counts with one column per group -- confirm against
    # group_survival_table_from_events.
    unique_groups, rm, obs, _ = group_survival_table_from_events(
        groups, event_durations, event_observed, limit=t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    # n_ij: per-column total of rm minus the rm counts accumulated in earlier rows.
    n_ij = rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0)
    # d_i: row-wise sum of obs over all columns.
    d_i = obs.sum(1)
    # n_i: grand total of rm minus the row sums accumulated in earlier rows.
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    # ev_i: n_ij scaled row by row with d_i / n_i.
    ev_i = n_ij.mul(d_i / n_i, axis="index")

    # compute weightings for log-rank alternatives
    if weightings is None:
        w_i = np.ones(d_i.shape[0])
    elif weightings == "wilcoxon":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Wilcoxon")
        w_i = n_i
    elif weightings == "tarone-ware":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Tarone-Ware")
        w_i = np.sqrt(n_i)
    elif weightings == "peto":
        kwargs["test_name"] = kwargs["test_name"].replace("logrank", "Peto")
        w_i = np.cumprod(1.0 - (ev_i.sum(1)) /
                         (n_i + 1))  # Peto-Peto's modified survival estimates.
    elif weightings == "fleming-harrington":
        # Both exponents are mandatory and must be non-negative.
        if "p" in kwargs:
            p = kwargs["p"]
            if p < 0:
                raise ValueError("p must be non-negative.")
        else:
            raise ValueError(
                "Must provide keyword argument p for Flemington-Harrington test statistic"
            )
        if "q" in kwargs:
            q = kwargs["q"]
            if q < 0:
                raise ValueError("q must be non-negative.")
        else:
            raise ValueError(
                "Must provide keyword argument q for Flemington-Harrington test statistic"
            )
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Flemington-Harrington")
        # Pooled Kaplan-Meier fit over all groups; dropping the last entry
        # shifts the estimate so each row uses the value before that time.
        kmf = KaplanMeierFitter().fit(event_durations,
                                      event_observed=event_observed)
        s = kmf.survival_function_.to_numpy().flatten(
        )[:-1]  # Left-continuous Kaplan-Meier survival estimate.
        w_i = np.power(s, p) * np.power(1.0 - s, q)
    else:
        raise ValueError("Invalid value for weightings.")

    # apply weights to observed and expected
    N_j = obs.mul(w_i, axis=0).sum(0).values
    ev = ev_i.mul(w_i, axis=0).sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum(
    )) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    factor = (((n_i - d_i) /
               (n_i - 1)).replace([np.inf, np.nan], 1)) * d_i / n_i**2
    # Append n_i as an extra "_" column so the product below also yields the
    # cross terms between each group and n_i in its last row / column.
    n_ij["_"] = n_i.values
    V_ = (n_ij.mul(w_i, axis=0)).mul(np.sqrt(factor),
                                     axis="index").fillna(0)  # weighted V_
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    # Adjust the diagonal with the "_" row, then drop the "_" row and column.
    V[ix, ix] = V[ix, ix] - V[-1, ix]
    V = V[:-1, :-1]

    # take the first n-1 groups
    # (V already lost its "_" row/column above; this second slice drops the
    # last group as well so it matches Z_j.iloc[:-1].)
    U = Z_j.iloc[:-1] @ np.linalg.pinv(
        V[:-1, :-1]) @ Z_j.iloc[:-1]  # Z.T*inv(V)*Z

    # compute the p-values and tests
    p_value = _chisq_test_p_value(U, n_groups - 1)
    return StatisticalResult(p_value,
                             U,
                             t_0=t_0,
                             null_distribution="chi squared",
                             degrees_of_freedom=n_groups - 1,
                             **kwargs)
Example #27
0
## Load the Telco churn dataset from CSV.
fileName = 'Telco-Customer-Churn.csv'
input_df = pd.read_csv(fileName)

## Encode Churn as 1 for 'Yes' (the event) and 0 for anything else (treated as censored).
input_df['Churn'] = (input_df['Churn'] == 'Yes').astype('int64')

## Optional conversion of TotalCharges to numeric, left disabled:
# input_df['TotalCharges']=pd.to_numeric(input_df['TotalCharges'],errors='coerce')

## Durations come from tenure, the event flag from the encoded Churn column.
T = input_df['tenure']
E = input_df['Churn']

kmf = KaplanMeierFitter()
## Two cohorts are compared via boolean masks over the StreamingTV column:
##   i1 - users who did not subscribe to StreamingTV
##   i2 - users who did subscribe to StreamingTV
groups = input_df['StreamingTV']
i1 = groups == 'No'
i2 = groups == 'Yes'

## Fit and plot the first cohort.
kmf.fit(T[i1], E[i1], label='Not Subscribed StreamingTV')
a1 = kmf.plot()

## Refit the same fitter on the second cohort.
kmf.fit(T[i2], E[i2], label='Subscribed StreamingTV')
Example #28
0
    gene_pos.append(int(i.split()[1]))

# Number of rows in exp_matrix (one row per sample) and the split point
# between the lower and upper expression halves.
num_cols = len(exp_matrix)
mid = num_cols//2

for j, pos in enumerate(gene_pos):
    # Expression of this gene in every sample, paired with the sample's
    # entry in `tol`, then ordered by expression.
    gene_exp = [row[pos] for row in exp_matrix]
    comb = sorted(([exp, tol[i]] for i, exp in enumerate(gene_exp)),
                  key=lambda x: x[0])
    low = comb[:mid]
    high = comb[mid:]
    l_tol = [pair[1] for pair in low]
    l_out = [True]*len(l_tol)   # every duration is passed as an observed event
    h_tol = [pair[1] for pair in high]
    h_out = [True]*len(h_tol)

    kp = KaplanMeierFitter()
    label_low = 'Lower 50% (n = ' + str(len(low)) + ')'
    label_high = 'Upper 50% (n = ' + str(len(high)) + ')'
    len_high = len(high)

    # Use a fresh figure for every gene. Plotting without explicit axes reuses
    # the current axes, so each saved image would also contain the curves of
    # all previously processed genes.
    ax = plt.figure().add_subplot(111)
    kp.fit(l_tol, l_out, label=label_low)
    kp.plot(ax=ax)
    kp.fit(h_tol, h_out, label=label_high)
    graph = kp.plot(ax=ax)
    ax.set_title("Survival Function of %s" %gene_name[j])
    graph.get_figure().savefig("%s_survival.png" %gene_name[j])
    # Release the figure once it is on disk so figures do not pile up.
    plt.close(graph.get_figure())

from lifelines import KaplanMeierFitter

import matplotlib.pyplot as plt

df = pd.read_csv('joined.csv.bz2', sep=',', compression='bz2', low_memory=False)

# Turn 'term' into an integer by stripping ' months'.
# str.strip removes any of the characters ' ', 'm', 'o', 'n', 't', 'h', 's'
# from both ends (a character set, not a suffix); digits are never stripped.
df['term'] = df['term'].map(lambda x: int(x.strip(' months')))

# Column 'T' for the survival model: firstMissed divided by term, forced to 1
# for loans whose status is 'Fully Paid'.
df['T'] = df['firstMissed'] / df['term']
df.loc[df['loan_status']=='Fully Paid', 'T']=1

# The event flag is the negation of column 'censored'. Cast to bool first:
# on a 0/1 integer column `~` is a bitwise NOT and would yield -1/-2 instead
# of a logical negation.

T = df['T']
E = ~df['censored'].astype(bool)


kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E) # more succinctly, kmf.fit(T,E)


# Notebook-style inspections: as bare expressions they only have a visible
# effect in an interactive session.
kmf.survival_function_
kmf.median_
kmf.plot()
plt.show()


Example #30
0
def _calibration_curve_ipcw(out,
                            e,
                            t,
                            a,
                            group,
                            eval_time,
                            typ,
                            ret_bins=True,
                            strat='quantile',
                            n_bins=10):

  """Returns the IPCW-adjusted calibration curve for one protected group.

  Accepts the output of a trained survival model at a certain evaluation time,
  the event indicators and protected group membership and outputs an IPCW
  adjusted calibration curve for the members of `group`.

  Args:
    out:
      risk scores P(T>t) issued by a trained survival analysis model
      (output of fair_survival_analysis.models.predict_survival).
    e:
      a numpy vector of indicators specifying if the event or censoring
      occurred.
    t:
      a numpy vector of times at which the events or censoring occurred.
    a:
      a numpy vector of protected attributes.
    group:
      string indicating the demographic to evaluate calibration for.
    eval_time:
      float/int of the event time at which calibration is to be evaluated. Must
      be the same as the time at which the risk scores were issued.
    typ:
      'IPCWpop' fits the censoring estimator on the whole population; any
      other value fits it on the members of `group` only.
    ret_bins:
      Boolean that specifies if the bin edges of the calibration curve are to
      be returned.
    strat:
      Specifies how the bins are computed. One of:
      "quantile": Equal sized bins.
      "uniform": Uniformly stratified.
    n_bins:
      int specifying the number of bins to use to compute the ece.
  Returns:
    A tuple (true probability per bin, mean predicted score per bin, ece), with
    the bin edges inserted before the ece when `ret_bins` is True. Bins that
    contain no sample are reported as NaN and do not contribute to the ece.
  Raises:
    ValueError: if `strat` is neither "quantile" nor "uniform".

  """

  in_group = a == group

  # Kaplan-Meier fit with the indicators flipped (1 - e), so that censoring is
  # the "event" being modelled.
  if typ == 'IPCWpop':
    kmf = KaplanMeierFitter().fit(t, 1 - e)
  else:
    kmf = KaplanMeierFitter().fit(t[in_group], 1 - e[in_group])

  # Everything below is restricted to the requested group.
  t = t[in_group]
  out = out[in_group]

  y = t > eval_time

  if strat == 'quantile':
    quantiles = [(1. / n_bins) * i for i in range(n_bins + 1)]
    outbins = np.quantile(out, quantiles)
  elif strat == 'uniform':
    binlen = (out.max() - out.min()) / n_bins
    outbins = [out.min() + i * binlen for i in range(n_bins + 1)]
  else:
    # Previously this fell through and failed later on an unbound `outbins`.
    raise ValueError("strat must be 'quantile' or 'uniform', got %r" % (strat,))

  # The IPCW denominator is the same for every bin, so evaluate it once.
  censoring_weight = kmf.predict(eval_time)

  prob_true = []
  prob_pred = []

  ece = 0

  for n_bin in range(n_bins):

    binmin = outbins[n_bin]
    binmax = outbins[n_bin + 1]

    # Both edges are inclusive, so a score that sits exactly on an interior
    # edge is counted in the two adjacent bins.
    scorebin = (out >= binmin) & (out <= binmax)
    n_in_bin = int(scorebin.sum())

    if n_in_bin == 0:
      # Nothing to average; keep the curve aligned with the bins and leave
      # the ece untouched instead of turning it into NaN.
      prob_true.append(float('nan'))
      prob_pred.append(float('nan'))
      continue

    # Fraction of the group's samples that fall in this bin. The mask length
    # (always len(out)) was used before, which made every weight 1.0.
    weight = float(n_in_bin) / len(out)

    prob_true.append((y[scorebin] / censoring_weight).mean())
    prob_pred.append(out[scorebin].mean())

    gap = abs(prob_pred[-1] - prob_true[-1])

    ece += weight * gap

  if ret_bins:
    return prob_true, prob_pred, outbins, ece

  else:
    return prob_true, prob_pred, ece
Example #31
0
def plot_survival(unique_groups, grouped_data, analysis_type, censors, ci, showplot, stat_results, time='Months'):
    """Plot one Kaplan-Meier curve per group and append a percentile table to a text file.

    For every group the survival percentiles 95%, 90%, ..., 0% are appended as
    one tab-separated row to ``Kaplan_<analysis_type>.txt``. The figure is
    saved as ``Kaplan_<analysis_type>.png`` and ``.eps`` and then closed.

    Parameters:
        unique_groups: indexable sequence of group keys; each key is passed to
            ``grouped_data.get_group`` and used as the curve label.
        grouped_data: grouped data whose groups provide the columns
            ``'survival'`` (durations in days) and ``'event'``.
        analysis_type: string used in the output file names; ``'survival'``
            selects the "Overall Survival" y-label, anything else
            "Relapse-Free Survival".
        censors: forwarded to ``kmf.plot`` as ``show_censors``.
        ci: forwarded to ``kmf.plot`` as ``ci_show``.
        showplot: when equal to True the figure is shown before being closed.
        stat_results: object with a ``p_value`` attribute; only read when there
            are exactly two groups.
        time: ``'months'`` or ``'years'`` (case-insensitive) rescales the
            durations from days; any other value leaves them unchanged.
    """
    kmf = KaplanMeierFitter()
    fig, ax = plt.subplots()
    n_in_groups = []

    percentiles = range(95, -1, -5)
    unit = time.lower()

    # The report file is opened in append mode; the with-block guarantees it
    # is flushed and closed even if fitting or plotting raises.
    with open('Kaplan_%s.txt' % (analysis_type), 'a') as f:
        f.write("\nPercent %s\n" % analysis_type)
        headers = "Group\t" + "".join(str(x) + "%\t" for x in percentiles)
        f.write("%s\n" % headers)

        for group in unique_groups:
            data = grouped_data.get_group(group)
            n_in_groups.append(len(data))
            # Adjust survival data from days to the requested unit. Float
            # literals keep the divisions true divisions on Python 2 as well.
            if unit == 'months':
                survival_time = data['survival'] / (365.0 / 12)
            elif unit == 'years':
                survival_time = data['survival'] / 365.0
            else:
                survival_time = data['survival']
            kmf.fit(survival_time, data['event'], label=group)

            # One table row per group: the time at each survival percentile.
            f.write("%s\t" % group)
            for x in percentiles:
                f.write(str(qth_survival_times(x / 100.0, kmf.survival_function_)) + "\t")
            f.write("\n")

            kmf.plot(ax=ax, show_censors=censors, ci_show=ci, linewidth=2.5)

    # Make the graph pretty!
    labels = dict(horizontalalignment='center', verticalalignment='center', fontname='Arial', fontsize=28)

    ax.grid(False)
    ax.set_ylim(0, 1.05)
    for side in ('left', 'right', 'top', 'bottom'):
        ax.spines[side].set_linewidth(2.5)
    ax.yaxis.set_tick_params(width=2.5)
    ax.xaxis.set_tick_params(width=2.5)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    plt.xlabel('%s Post-Diagnosis' % time, labels, labelpad=20)
    if analysis_type == 'survival':
        plt.ylabel('Overall Survival', labels, labelpad=20)
    else:
        plt.ylabel('Relapse-Free Survival', labels, labelpad=20)
    plt.xticks(fontname='Arial', fontsize=24)
    plt.yticks(fontname='Arial', fontsize=24)
    ax.tick_params(axis='y', pad=10)
    ax.tick_params(axis='x', pad=10)

    # Rewrite every legend entry as "<group>   n=<group size>".
    legend = ax.legend(frameon=False, loc=3)
    for counter, label in enumerate(legend.get_texts()):
        label.set_fontsize(20)
        label.set_text('%s   n=%d' % (unique_groups[counter], n_in_groups[counter]))

    # The p-value is only annotated for a two-group comparison.
    if len(unique_groups) == 2:
        plt.text(0.95, 0.05, 'p = %.2g' % (stat_results.p_value), fontname='Arial', fontsize=20, ha='right', transform=ax.transAxes)

    plt.tight_layout()

    fig.savefig('Kaplan_%s.png' % analysis_type, transparent=True)
    fig.savefig('Kaplan_%s.eps' % analysis_type, transparent=True)
    if showplot == True:
        plt.show()
    plt.close(fig)
Example #32
0
# Split the cohort on latitude_raw: exactly 0, exactly 1, and 2 or above.
lat_0 = clin.loc[clin['latitude_raw'] == 0]
lat_1 = clin.loc[clin['latitude_raw'] == 1]
lat_23 = clin.loc[clin['latitude_raw'] >= 2]

# -----------------------------------------------------------------------------
# Figure with a single 2.5 x 2.5 axes
# -----------------------------------------------------------------------------

fig1, ax = plt.subplots(1, figsize=(2.5,2.5))

# -----------------------------------------------------------------------------
# Curve for the latitude_raw == 0 subset on kmf1
# -----------------------------------------------------------------------------

kmf1 = KaplanMeierFitter()
color = 'grey'
# Legend label of the form "Lat. 0 (n=<number of rows>)".
defective_patient_number = str(len(lat_0))
defective_label = 'Lat. 0 (n=%s)' % defective_patient_number
# Durations rounded to 3 decimals; event status cast to 32-bit integers.
T = lat_0['Time_to_CRPC'].round(3)
C = lat_0['crpc_status'].astype(np.int32)
kmf1.fit(T, event_observed=C, label=defective_label)
kmf1.plot(ax=ax, show_censors=True, ci_show=False, color=color, lw=1)
# lat_0_median=kmf1.median


# -----------------------------------------------------------------------------
# Second fitter (how it is used is not part of this excerpt)
# -----------------------------------------------------------------------------

kmf2 = KaplanMeierFitter()
Example #33
0
# NOTE: this is an IPython / Python 2 session, not a plain module. It uses
# print statements and the %pylab magic, neither of which is valid in a
# Python 3 script.
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import scipy

import lifelines

# NOTE(review): figsize is not imported here; presumably it comes from an
# earlier %pylab in the same session -- confirm.
figsize(12.5,5)
# Print arrays with 2 decimals and without scientific notation.
np.set_printoptions(precision=2, suppress=True)

from lifelines import KaplanMeierFitter
# Five durations; events[i] is True when the i-th duration is an observed
# event and False otherwise.
survival_times = np.array([0.,3.,4.5, 10., 1.])
events = np.array([False, True, True, False, True])

kmf = KaplanMeierFitter()
kmf.fit(survival_times, event_observed=events)

print kmf.survival_function_
print kmf.median_
kmf.plot()


## example 2
import matplotlib.pylab as plt
%pylab

figsize(12.5,6)
from lifelines.plotting import plot_lifetimes
from numpy.random import uniform, exponential
# Value for the second example; how it is used lies outside this excerpt.
N = 25
Example #34
0
# Load the patient follow-up data (hard-coded absolute path on the author's machine)
Patient_data = pd.read_csv("C:/Users/hp/Desktop/survival assi/Patient.csv")
# Notebook-style inspections: as bare expressions their results are only
# displayed in an interactive session.
Patient_data.head()
Patient_data.describe()

Patient_data["Followup"].describe()

# The Followup column is used as the duration
T = Patient_data.Followup

# Import the KaplanMeierFitter model used for the survival analysis
from lifelines import KaplanMeierFitter

# Instantiate the KaplanMeierFitter model
kmf = KaplanMeierFitter()

# Fit the model on the durations, with Eventtype as the event indicator
kmf.fit(T, event_observed=Patient_data.Eventtype)

# Plot the fitted estimate
kmf.plot()

# Count how often each PatientID occurs
Patient_data.PatientID.value_counts()
# Applying KaplanMeierFitter model on Time and Events
kmf.fit(
    T[Patient_data.PatientID == [
        'Joe', 'Jess', 'Ann', 'Mary', 'Frank', 'Steven', 'Andy', 'Elizabeth',
        'Joe', 'Kate'
    ]], Patient_data.Eventtype[Patient_data.PatientID == [
        'Joe', 'Jess', 'Ann', 'Mary', 'Frank', 'Steven', 'Andy', 'Elizabeth',
Example #35
0
def _setting_enabled(setting):
    """Return False for the "off" / ".off" switch values and True otherwise."""
    return setting not in ["off", ".off"]


def _setting_or_fallback(settings, written_key, fallback_key):
    """Return settings[written_key] unless its string form is empty, else settings[fallback_key]."""
    written = settings[written_key]
    if str(written) != "":
        return written
    return settings[fallback_key]


def _km_estimate_table(km):
    """Join a fitted fitter's survival function, confidence interval and event
    table on their shared index and expose that index as a ``time`` column."""
    table = pd.merge(km.survival_function_,
                     km.confidence_interval_,
                     how='left',
                     left_index=True,
                     right_index=True)
    table = pd.merge(table,
                     km.event_table,
                     how='left',
                     left_index=True,
                     right_index=True)
    table['time'] = table.index.tolist()
    return table.reset_index(drop=True)


def _expand_counts_to_rows(df_ls, pa):
    """Expand each aggregated row into one record per death (status 1) or,
    when the row has no death, one record per censored subject (status 0)."""
    group_col = str(pa["groups_value"])
    records = []
    # Rows are addressed by position through .loc, as in the original code;
    # this assumes df_ls carries the default 0..n-1 index.
    for row in range(0, len(df_ls)):
        if int(df_ls.loc[row, pa["yvals"]]) >= 1:
            count = int(df_ls.loc[row, pa["yvals"]])
            status = 1
        elif int(df_ls.loc[row, pa["censors_val"]]) >= 1:
            count = int(df_ls.loc[row, pa["censors_val"]])
            status = 0
        else:
            continue
        record = {
            'day': int(df_ls.loc[row, pa["xvals"]]),
            'status': status,
            group_col: str(df_ls.loc[row, pa["groups_value"]])
        }
        records.extend([record] * count)
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copied the whole frame on every call.
    return pd.DataFrame(records, columns=['day', 'status', group_col])


def _apply_axes_settings(pl, pa):
    """Apply the spine, tick, grid, limit, title and label settings in *pa* to the axes *pl*."""
    shown = {
        arg: _setting_enabled(pa[arg])
        for arg in [
            "left_axis", "right_axis", "upper_axis", "lower_axis",
            "tick_left_axis", "tick_right_axis", "tick_upper_axis",
            "tick_lower_axis"
        ]
    }
    grid_color = _setting_or_fallback(pa, "grid_color_text",
                                      "grid_color_value")

    pl.spines['right'].set_visible(shown["right_axis"])
    pl.spines['top'].set_visible(shown["upper_axis"])
    pl.spines['left'].set_visible(shown["left_axis"])
    pl.spines['bottom'].set_visible(shown["lower_axis"])

    for side in ('right', 'left', 'top', 'bottom'):
        pl.spines[side].set_linewidth(pa["axis_line_width"])

    pl.tick_params(axis="both",
                   direction=pa["ticks_direction_value"],
                   length=float(pa["ticks_length"]))

    pl.tick_params(axis='x',
                   which='both',
                   bottom=shown["tick_lower_axis"],
                   top=shown["tick_upper_axis"],
                   labelbottom=shown["lower_axis"],
                   labelrotation=float(pa["xticks_rotation"]),
                   labelsize=float(pa["xticks_fontsize"]))

    pl.tick_params(axis='y',
                   which='both',
                   left=shown["tick_left_axis"],
                   right=shown["tick_right_axis"],
                   labelleft=shown["left_axis"],
                   labelrotation=float(pa["yticks_rotation"]),
                   labelsize=float(pa["yticks_fontsize"]))

    if str(pa["grid_value"]) != "None":
        pl.grid(True,
                which='both',
                axis=pa["grid_value"],
                color=grid_color,
                linewidth=float(pa["grid_linewidth"]))

    # Limits are only applied when both ends of an axis are given.
    if str(pa["x_lower_limit"]) != "" and str(pa["x_upper_limit"]) != "":
        pl.set_xlim(float(pa["x_lower_limit"]), float(pa["x_upper_limit"]))
    if str(pa["y_lower_limit"]) != "" and str(pa["y_upper_limit"]) != "":
        pl.set_ylim(float(pa["y_lower_limit"]), float(pa["y_upper_limit"]))

    pl.set_title(pa["title"], fontdict={'fontsize': float(pa['titles'])})
    pl.set_xlabel(pa["xlabel"], fontdict={'fontsize': float(pa['xlabels'])})
    pl.set_ylabel(pa["ylabel"], fontdict={'fontsize': float(pa['ylabels'])})


def make_figure(df, pa):
    """Fit and plot Kaplan-Meier estimates as configured by the argument dict *pa*.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied and never modified.
    pa : dict
        Plot arguments. ``pa["xvals"]`` and ``pa["yvals"]`` name the duration
        and event columns, ``pa["groups_value"]`` names the grouping column
        (or is ``None`` / ``"None"`` for a single curve). The remaining keys
        hold styling options; switches count as disabled only when they are
        ``"off"`` or ``".off"``.

    Returns
    -------
    tuple
        Without grouping: ``(table, axes)``, where ``table`` holds the
        estimate, its 0.95 confidence bounds and the event-table columns per
        time point.
        With grouping: ``(table, axes, cph_coeff, cph_stats)``, where
        ``table`` is the inner join on ``time`` of the per-group tables,
        ``cph_coeff`` the Cox model summary and ``cph_stats`` a
        Statistic/Value frame of model and log-rank statistics. If
        ``pa["list_of_groups"]`` is empty, the input ``df`` and ``None`` are
        returned in place of ``table`` and ``axes``.
    """
    df_ls = df.copy()

    km = KaplanMeierFitter()  ## instantiate the class to create an object

    pl = None
    # Created for its side effect only; presumably km.plot() then draws on
    # this figure as pyplot's current one -- confirm.
    plt.figure(frameon=False,
               figsize=(float(pa["fig_width"]), float(pa["fig_height"])))

    if str(pa["groups_value"]) == "None":
        ## Single curve over all rows
        km.fit(df_ls[pa["xvals"]],
               df_ls[pa["yvals"]],
               label='Kaplan Meier Estimate')

        table = _km_estimate_table(km)
        table = table[[
            "time", "at_risk", "removed", "observed", "censored", "entrance",
            "Kaplan Meier Estimate", "Kaplan Meier Estimate_lower_0.95",
            "Kaplan Meier Estimate_upper_0.95"
        ]]

        pl = km.plot(
            show_censors=_setting_enabled(pa["show_censors"]),
            censor_styles={
                "marker": marker_dict[pa["censor_marker_value"]],
                "markersize": float(pa["censor_marker_size_val"]),
                "markeredgecolor": _setting_or_fallback(
                    pa, "edgecolor_write", "edgecolor"),
                "markerfacecolor": _setting_or_fallback(
                    pa, "markerc_write", "markerc"),
                "alpha": float(pa["marker_alpha"])
            },
            ci_alpha=float(pa["ci_alpha"]),
            ci_force_lines=_setting_enabled(pa["ci_force_lines"]),
            ci_show=_setting_enabled(pa["Conf_Interval"]),
            ci_legend=_setting_enabled(pa["ci_legend"]),
            linestyle=pa["linestyle_value"],
            linewidth=float(pa["linewidth"]),
            color=pa["line_color_value"])

        _apply_axes_settings(pl, pa)

        return table, pl

    ## Grouped analysis: one record per subject feeds the log-rank test and
    ## the Cox model.
    df_long = _expand_counts_to_rows(df_ls, pa)

    df_dummy = pd.get_dummies(df_long,
                              drop_first=True,
                              columns=[pa["groups_value"]])

    is_event = df_dummy['status'] == 1
    is_censored = df_dummy['status'] == 0

    # NOTE(review): this compares the durations of event rows with those of
    # censored rows, not one group with another -- confirm this is intended.
    results = logrank_test(df_dummy.loc[is_event, 'day'].tolist(),
                           df_dummy.loc[is_censored, 'day'].tolist(),
                           df_dummy.loc[is_event, 'status'].tolist(),
                           df_dummy.loc[is_censored, 'status'].tolist(),
                           alpha=.99)

    cph = CoxPHFitter()
    cph.fit(df_dummy, duration_col='day', event_col='status')

    cph_coeff = cph.summary
    cph_coeff = cph_coeff.reset_index()

    # Run the likelihood-ratio test once and reuse it for both entries.
    lr_test = cph.log_likelihood_ratio_test()

    df_info = {}
    df_info['model'] = 'lifelines.CoxPHFitter'
    df_info['duration col'] = cph.duration_col
    df_info['event col'] = cph.event_col
    df_info['baseline estimation'] = 'breslow'
    df_info['number of observations'] = cph._n_examples
    df_info['number of events observed'] = int(is_event.sum())
    df_info['partial log-likelihood'] = cph.log_likelihood_
    df_info['Concordance'] = cph.concordance_index_
    df_info['Partial AIC'] = cph.AIC_partial_
    df_info['log-likelihood ratio test'] = lr_test.test_statistic
    df_info['P.value(log-likelihood ratio test)'] = lr_test.p_value
    df_info['log rank test'] = results.test_statistic
    df_info['P.value(log rank test)'] = results.p_value

    cph_stats = pd.DataFrame(list(df_info.items()),
                             columns=['Statistic', 'Value'])

    per_group_tables = []

    for cond in pa["list_of_groups"]:
        df_tmp = df_ls.loc[df_ls[pa["groups_value"]] == cond]

        km.fit(df_tmp[pa["xvals"]], df_tmp[pa["yvals"]], label=cond)

        # Prefix every column with the group name so the per-group tables
        # can be joined on 'time' without name clashes.
        table = _km_estimate_table(km).rename(
            columns={
                "at_risk": cond + "_at_risk",
                "removed": cond + "_removed",
                "observed": cond + "_observed",
                "censored": cond + "_censored",
                "entrance": cond + "_entrance",
                cond: cond + "_KMestimate"
            })
        table = table[[
            "time", cond + "_at_risk", cond + "_removed",
            cond + "_observed", cond + "_censored", cond + "_entrance",
            cond + "_KMestimate", cond + "_lower_0.95",
            cond + "_upper_0.95"
        ]]
        per_group_tables.append(table)

        # Styling comes from this group's own settings entry.
        PA_ = [g for g in pa["groups_settings"] if g["name"] == cond][0]

        pl = km.plot(
            show_censors=_setting_enabled(PA_["show_censors"]),
            censor_styles={
                "marker": marker_dict[PA_["censor_marker_value"]],
                "markersize": float(PA_["censor_marker_size_val"]),
                "markeredgecolor": _setting_or_fallback(
                    PA_, "edgecolor_write", "edgecolor"),
                "markerfacecolor": _setting_or_fallback(
                    PA_, "markerc_write", "markerc"),
                "alpha": float(PA_["marker_alpha"]),
                "mew": float(PA_["edge_linewidth"])
            },
            ci_alpha=float(PA_["ci_alpha"]),
            ci_force_lines=_setting_enabled(PA_["ci_force_lines"]),
            ci_show=_setting_enabled(PA_["Conf_Interval"]),
            ci_legend=_setting_enabled(PA_["ci_legend"]),
            linestyle=_setting_or_fallback(PA_, "linestyle_write",
                                           "linestyle_value"),
            linewidth=float(PA_["linewidth_write"]),
            color=_setting_or_fallback(PA_, "linecolor_write",
                                       "line_color_value"))

        _apply_axes_settings(pl, pa)

    # Join the per-group tables once, after the loop. With no groups the
    # input frame is returned unchanged, as before.
    if per_group_tables:
        df = reduce(lambda df1, df2: pd.merge(df1, df2, on='time'),
                    per_group_tables)

    return df, pl, cph_coeff, cph_stats
# Attach each lab's number and order the rows by it
training_time['lab_number'] = training_time.lab.map(institution_map()[0])
training_time = training_time.sort_values('lab_number')

# %% PLOT

# Palette: one grey entry per distinct lab followed by a single yellow entry
use_palette = ([[0.6, 0.6, 0.6]] * len(np.unique(training_time['lab']))
               + [[1, 1, 0.2]])
lab_colors = group_colors()

# One axes for the per-lab cumulative density curves
f, ax1 = plt.subplots(1, 1, figsize=(FIGURE_WIDTH/3, FIGURE_HEIGHT))

kmf = KaplanMeierFitter()
for i, lab in enumerate(np.unique(training_time['lab_number'])):
    # Fit on this lab's rows only and draw its cumulative density as a step line
    lab_rows = training_time[training_time['lab_number'] == lab]
    kmf.fit(lab_rows['sessions'].values, event_observed=lab_rows['trained'])
    lab_density = kmf.cumulative_density_
    ax1.step(lab_density.index.values, lab_density.values, color=lab_colors[i])
# Refit on all labs together; drawing the pooled curve is disabled below
kmf.fit(training_time['sessions'].values, event_observed=training_time['trained'])
# ax1.step(kmf.cumulative_density_.index.values, kmf.cumulative_density_.values, color='black')
ax1.set(ylabel='Cumulative probability of\nreaching trained criterion', xlabel='Training day',
        xlim=[0, 60], ylim=[0, 1.02])
ax1.set_title('All labs: %d mice'%training_time['nickname'].nunique())

# kmf.fit(training_time['sessions'].values, event_observed=training_time['trained'])
# kmf.plot_cumulative_density(ax=ax2)
# ax2.set(ylabel='Cumulative probability of\nreaching trained criterion', xlabel='Training day',
#         title='All labs', xlim=[0, 60], ylim=[0, 1.02])
Example #37
0
#batsmen_data = data

#data.to_csv('data.csv', sep=',')
#print(batsmen_data)

#----------------------------------(i) Player's Country vs Career Length -------------------------------------------------------

# Load the career data and flag every row as an observed (uncensored) event.
# `.ix` was removed in pandas 1.0, so label-based `.loc` is used throughout.
# read_csv already returns a DataFrame, so no re-wrapping is needed.
data = pd.read_csv("data.csv")
data.loc[:, 'censor'] = 1

duration = data['span']
observed = data.loc[:, 'censor']

# Overall Kaplan-Meier fit across all rows
kmf = KaplanMeierFitter()
kmf.fit(duration, observed, label='kmf_mean')
#kmf.plot()
#plt.show()

###INDIA kmf

india_data = data.loc[data['country'] == 'INDIA']
india_duration = india_data['span']
india_observed = india_data['censor']

kmfind = KaplanMeierFitter()
kmfind.fit(india_duration, india_observed, label="india")

###similarly for other countries
kmfpak = KaplanMeierFitter()
Example #38
0
def Compute_Pvalue(df,
                   df2,
                   end_date,
                   start_date=0):
    """Collect per-group KM medians and a log-rank p-value for each measure.

    For every ``measure_key`` in the module-level ``measures`` table (except
    key 30) the bio data is filtered with ``filter_data``, joined to the
    survival data on ``patient_key``, and a Kaplan-Meier curve is fitted for
    groups ``'1'`` and ``'2'``. When the two groups together hold at least 20
    rows, a multivariate log-rank test is run and both the per-group
    statistics and the p-value are appended to the returned frames.

    Parameters
    ----------
    df : DataFrame
        Bio data passed to ``filter_data``.
    df2 : DataFrame
        Survival data. NOTE(review): this argument is currently unused; the
        join reads the module-level ``surviv_data`` instead -- confirm which
        one is intended.
    end_date :
        Upper bound forwarded to ``filter_data`` and recorded in the result.
    start_date : optional
        Currently unused; the start is hard-coded to 0.

    Returns
    -------
    (DataFrame, DataFrame)
        ``df_stats_final`` (columns group / median / nombre / measure) and
        ``df_logrank_final`` (columns pvalue / measure / start_date /
        end_date).
    """
    measures_list = measures['measure_key'].tolist()
    # Scratch frames: rows 0..1 are overwritten for every measure.
    df_stats = pd.DataFrame()
    df_logrank = pd.DataFrame()
    df_stats_final = pd.DataFrame()
    df_logrank_final = pd.DataFrame()
    groups = ['1', '2']

    for m in measures_list:
        number = 0
        numberp = 0
        print(m)
        if m == 30:
            continue

        d = filter_data(df, m, 0, end_date, 2, ['1', '2'])
        measure_name = measures[measures['measure_key'] == m]['measure_name']
        d['measure_name'] = measure_name.values[0]
        if d.empty:
            continue

        # Join patients with their survival data
        data = surviv_data.merge(d, how='inner', on='patient_key')
        NbP = 0
        try:
            for g in groups:
                group1 = data[data['group'] == g]
                T = group1['Duration']
                E = group1['death']
                kmf_1 = KaplanMeierFitter().fit(T, E, label="Group " + str(g))
                median1 = kmf_1.median_survival_time_
                nb1 = group1.shape[0]
                df_stats.loc[number, 'group'] = (
                    'group_' + str(g) + '_' + str(measure_name.values[0]))
                df_stats.loc[number, 'median'] = median1
                df_stats.loc[number, 'nombre'] = nb1
                df_stats.loc[number, 'measure'] = measure_name.values[0]
                number = number + 1
                NbP = NbP + nb1

            if NbP >= 20:
                print('Yes')
                p_value = multivariate_logrank_test(
                    data['Duration'], data['group'], data['death']).p_value
                df_logrank.loc[numberp, 'pvalue'] = p_value
                df_logrank.loc[numberp, 'measure'] = measure_name.values[0]
                df_logrank.loc[numberp, 'start_date'] = 0
                df_logrank.loc[numberp, 'end_date'] = end_date

                # DataFrame.append was removed in pandas 2.0; concat is the
                # equivalent that works on every pandas version.
                df_stats_final = pd.concat([df_stats_final, df_stats],
                                           ignore_index=True)
                df_logrank_final = pd.concat([df_logrank_final, df_logrank],
                                             ignore_index=True)
        except Exception as exc:
            # Best effort: one failing measure must not abort the others, but
            # the failure is reported instead of being silently dropped.
            print('Skipping measure %s: %s' % (m, exc))

    return df_stats_final, df_logrank_final
Example #39
0
import pandas as pd
from lifelines import KaplanMeierFitter
from pylab import show

# Load the lapse data and discard rows with a zero-day duration
df = pd.read_excel('lapse-data-pure.xlsx')
df = df[df['Duration Days'] != 0]


def _floor_at_zero(months):
    # Negative durations are replaced by 0; everything else passes through
    return 0 if months < 0 else months


def _to_class(score):
    # Scores of 0.5 and above fall in class 'A', the rest in class 'B'
    return 'A' if score >= 0.5 else 'B'


# Durations, and an event flag that is True only for positive durations
T = df['Duration Months'].apply(_floor_at_zero)
E = T.apply(lambda months: True if months > 0 else False)
df['Random Class'] = df['Random Class'].apply(_to_class)

#df.to_excel('lapse-data-ready.xlsx')

groups = df['Random Class']
kmf = KaplanMeierFitter()

# Class A curve starts a fresh axis
ix = groups == 'A'
kmf.fit(T[ix], event_observed=E[ix], label='Class A')
ax = kmf.plot()

# Class B curve is drawn on the same axis
ix = groups == 'B'
kmf.fit(T[ix], event_observed=E[ix], label='Class B')
ax = kmf.plot(ax=ax)

show()
Example #40
0
def survival_difference_at_fixed_point_in_time_test(point_in_time,
                                                    durations_A,
                                                    durations_B,
                                                    event_observed_A=None,
                                                    event_observed_B=None,
                                                    **kwargs):
    """
    Test whether two groups differ in survival at one specific time.

    Analysts frequently care about survival at a single time (5-year survival,
    say) rather than about the whole curve. Comparing the raw Kaplan-Meier
    values at that time has reduced power (see [1]); applying a transformation
    first recovers power. This function uses the log(-log) transformation.

    Parameters
    ----------
    point_in_time: float,
        the time at which the two survival curves are compared.

    durations_A: iterable
        a (n,) list-like of event durations (birth to death,...) for the first population.

    durations_B: iterable
        a (n,) list-like of event durations (birth to death,...) for the second population.

    event_observed_A: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the first population.
        Default assumes all observed.

    event_observed_B: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the second population.
        Default assumes all observed.

    kwargs:
        extra keywords and meta-data forwarded to the experiment summary

    Returns
    -------

    results : StatisticalResult
      a StatisticalResult object with properties 'p_value', 'summary', 'test_statistic', 'print_summary'

    Examples
    --------
    >>> T1 = [1, 4, 10, 12, 12, 3, 5.4]
    >>> E1 = [1, 0, 1,  0,  1,  1, 1]
    >>>
    >>> T2 = [4, 5, 7, 11, 14, 20, 8, 8]
    >>> E2 = [1, 1, 1, 1,  1,  1,  1, 1]
    >>>
    >>> from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
    >>> results = survival_difference_at_fixed_point_in_time_test(12, T1, T2, event_observed_A=E1, event_observed_B=E2)
    >>>
    >>> results.print_summary()
    >>> print(results.p_value)        # 0.893
    >>> print(results.test_statistic) # 0.017

    Notes
    -----
    Other transformations exist, but Klein et al. [1] showed that the log(-log(c)) transform has the most desirable
    statistical properties.

    [1] Klein, J. P., Logan, B. , Harhoff, M. and Andersen, P. K. (2007), Analyzing survival curves at a fixed point in time. Statist. Med., 26: 4505-4519. doi:10.1002/sim.2864

    """
    fitter_A = KaplanMeierFitter().fit(durations_A,
                                       event_observed=event_observed_A)
    fitter_B = KaplanMeierFitter().fit(durations_B,
                                       event_observed=event_observed_B)

    # Survival estimates of both groups at the requested time
    surv_A = fitter_A.predict(point_in_time)
    surv_B = fitter_B.predict(point_in_time)

    # Interpolate each fitter's cumulative-square term between index points
    var_A = dataframe_interpolate_at_times(fitter_A._cumulative_sq_,
                                           point_in_time)
    var_B = dataframe_interpolate_at_times(fitter_B._cumulative_sq_,
                                           point_in_time)

    def _cloglog(s):
        # complementary log-log transform, log(-log(s))
        return np.log(-np.log(s))

    numerator = (_cloglog(surv_A) - _cloglog(surv_B))**2
    denominator = (var_A / np.log(surv_A)**2 +
                   var_B / np.log(surv_B)**2)
    X = numerator / denominator
    p_value = chisq_test(X, 1)

    return StatisticalResult(p_value,
                             X,
                             null_distribution="chi squared",
                             degrees_of_freedom=1,
                             point_in_time=point_in_time,
                             **kwargs)
Example #41
0
    def main(self, durations: List[pd.DataFrame],
             categories: List[pd.DataFrame],
             event_observed: List[pd.DataFrame],
             estimator: str,
             id_filter: List[str],
             subsets: List[List[str]]) -> dict:
        """Estimate a survival curve for every category/subset combination.

        :param durations: Exactly one data frame holding the durations; its
            'id', 'value' and 'feature' columns are read here.
        :param categories: Data frames forwarded to
            ``utils.apply_categories`` (presumably the source of the
            'category' column -- verify against that helper).
        :param event_observed: At most one data frame. When given, a duration
            counts as observed if its id has a non-null 'value' in this frame
            after a right join on 'id'. When empty, nothing is censored.
        :param estimator: Either 'NelsonAalen' or 'KaplanMeier'.
        :param id_filter: Ids forwarded to ``utils.apply_id_filter``.
        :param subsets: Subsets forwarded to ``utils.apply_subsets``.
        :return: Dict with keys 'label', 'categories', 'subsets' and 'stats';
            'stats' maps category -> subset -> timeline / estimate /
            ci_lower / ci_upper lists. Groups with 3 or fewer durations are
            left out.
        :raises ValueError: On a wrong number of duration or event frames, or
            on an unknown estimator name.
        """
        # logger.error rather than logger.exception: no exception is being
        # handled here, so there is no traceback to attach.
        if len(durations) != 1:
            error = 'Analysis requires exactly one array that specifies the ' \
                    'duration length.'
            logger.error(error)
            raise ValueError(error)
        if len(event_observed) > 1:
            error = 'Maximal one variable for "event_observed" allowed'
            logger.error(error)
            raise ValueError(error)

        # dropna() returns a new frame, leaving the caller's data untouched.
        df = durations[0].dropna()
        df = utils.apply_id_filter(df=df, id_filter=id_filter)
        df = utils.apply_subsets(df=df, subsets=subsets)
        df = utils.apply_categories(df=df, categories=categories)

        stats = {}
        categories = df['category'].unique().tolist()
        subsets = df['subset'].unique().tolist()
        # for every category and subset combination estimate the survival fun.
        for category in categories:
            for subset in subsets:
                sub_df = df[(df['category'] == category) &
                            (df['subset'] == subset)]
                T = sub_df['value']
                E = None  # default is nothing is censored
                if len(T) <= 3:
                    continue
                if event_observed:
                    # find observation boolean value for every duration
                    merged = event_observed[0].merge(sub_df, how='right',
                                                     on='id')
                    E = merged['value_x'].notnull().tolist()
                    assert len(E) == len(T)
                if estimator == 'NelsonAalen':
                    fitter = NelsonAalenFitter()
                    fitter.fit(durations=T, event_observed=E)
                    estimate = fitter.cumulative_hazard_[
                        'NA_estimate'].tolist()
                    ci_lower = fitter.confidence_interval_[
                        'NA_estimate_lower_0.95'].tolist()
                    ci_upper = fitter.confidence_interval_[
                        'NA_estimate_upper_0.95'].tolist()
                elif estimator == 'KaplanMeier':
                    fitter = KaplanMeierFitter()
                    fitter.fit(durations=T, event_observed=E)
                    # noinspection PyUnresolvedReferences
                    estimate = fitter.survival_function_[
                        'KM_estimate'].tolist()
                    ci_lower = fitter.confidence_interval_[
                        'KM_estimate_lower_0.95'].tolist()
                    ci_upper = fitter.confidence_interval_[
                        'KM_estimate_upper_0.95'].tolist()
                else:
                    error = 'Unknown estimator: {}'.format(estimator)
                    logger.error(error)
                    raise ValueError(error)
                stats.setdefault(category, {})[subset] = {
                    'timeline': fitter.timeline.tolist(),
                    'estimate': estimate,
                    'ci_lower': ci_lower,
                    'ci_upper': ci_upper
                }

        return {
            'label': df['feature'].tolist()[0],
            'categories': categories,
            'subsets': subsets,
            'stats': stats
        }
Example #42
0
# Evaluate the naive estimator at every time point in x, writing into y
for i, t in enumerate(x):
    y[i] = naive_estimator(t, data)

plt.plot(x, y, label="Naive")

# Overlay the hand-written Kaplan-Meier estimate as a step curve on the same
# figure; x and y are rebound to its output
x, y = HomemadeKM(data)
plt.step(x, y, label="Kaplan-Meier")
plt.xlabel("Time")
plt.ylabel("Survival probability estimate")
plt.legend()
plt.show()

# Next, compare the survival functions of the two stage groups.
# Fit one estimator per group and plot both curves on a shared axis.
# NOTE(review): KM is presumably an alias for lifelines' KaplanMeierFitter -- confirm against the imports.
S1 = data[data.Stage_group == 1]
km1 = KM()
km1.fit(S1.loc[:, 'Time'],
        event_observed=S1.loc[:, 'Event'],
        label='Stage III')

S2 = data[data.Stage_group == 2]
km2 = KM()
km2.fit(S2.loc[:, "Time"], event_observed=S2.loc[:, 'Event'], label='Stage IV')

# Draw both curves without confidence bands, then save the figure
ax = km1.plot(ci_show=False)
km2.plot(ax=ax, ci_show=False)
plt.xlabel('time')
plt.ylabel('Survival probability estimate')
plt.savefig('two_km_curves', dpi=300)
# Let's compare the survival functions at 90, 180, 270, and 360 days
Example #43
0
})
time_data.to_csv('results/running_time_coxph.csv', index=False)

###############################
##Extra code: calibration plots

#Cox model calibration train set
# Predicted survival curves for the training set: the frame is indexed by
# time, and after the transpose each row of y_pred is one subject.
y_pred = cph.predict_survival_function(data_train)
times = y_pred.index.values
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on every pandas version.
y_pred = y_pred.values.transpose()

# Split subjects into quartiles of the predicted survival in column 1; the
# +/-1e6 sentinels make the outer bins open-ended.
cuts = np.concatenate(
    (np.array([-1e6]), np.percentile(y_pred[:, 1],
                                     [25, 50, 75]), np.array([1e6])))
# NOTE: `bin` shadows the builtin; the name is kept because later parts of
# the script may refer to it.
bin = pd.cut(y_pred[:, 1], cuts, labels=False)

kmf = KaplanMeierFitter()
for which_bin in range(max(bin) + 1):
    in_bin = bin == which_bin
    # Observed survival (black): Kaplan-Meier fit on this bin's subjects
    kmf.fit(data_train.time.iloc[in_bin],
            event_observed=data_train.dead.iloc[in_bin])
    plt.plot(kmf.survival_function_.index.values,
             kmf.survival_function_.KM_estimate,
             color='k')
    # Predicted survival (blue): mean predicted curve of the same subjects
    pred_surv = np.mean(y_pred[in_bin, :], axis=0)
    plt.plot(times, pred_surv, 'b-')

plt.xticks(np.arange(0, 365 * 5, 365))
plt.yticks(np.arange(0, 1.0001, 0.125))
plt.xlim([0, 365.25 * 5])
plt.ylim([0, 1])
plt.gca().set_position([0.1, 0.1, .8, .8])
plt.show()
    ).assign(
        published_date=lambda x: x.published_date.fillna(date.today())).assign(
            time_to_published=lambda x: pd.to_datetime(
                x.published_date) - pd.to_datetime(x.posted_date)))
# Keep rows with a strictly positive time-to-publication, then drop rows
# containing any missing value.
# NOTE(review): dropna() also removes rows whose published_doi is missing,
# which would leave no censored observations for the fit below -- confirm
# this is intended.
has_positive_lag = (
    preprints_w_published_dates["time_to_published"] > pd.Timedelta(0))
preprints_w_published_dates = (
    preprints_w_published_dates[has_positive_lag].dropna())
print(preprints_w_published_dates.shape)
preprints_w_published_dates.head()

# # Calculate Overall Survival Function

# This section sets up the Kaplan-Meier estimator for preprints. It measures the lifetime of unpublished preprints. Over time preprints get published, which is what shrinks the surviving population.

# In[5]:

kmf = KaplanMeierFitter()

# In[6]:

# Durations in days; an event counts as observed when a published DOI exists
days_to_published = (
    preprints_w_published_dates["time_to_published"].dt.total_seconds() / 60 /
    60 / 24)
was_published = preprints_w_published_dates["published_doi"].notna()
kmf.fit(days_to_published, event_observed=was_published)

# In[7]:

kmf.median_survival_time_

# In[8]:
Example #45
0
def cluster_KMplot(cluster_assign,
                   clin_data_fn,
                   delimiter='\t',
                   lr_test=True,
                   tmax=-1,
                   verbose=True,
                   **save_args):
    """Plot one Kaplan-Meier curve per cluster and optionally test them.

    Parameters
    ----------
    cluster_assign : pandas.Series
        Cluster label per patient; its index is matched against the index of
        the clinical table and its ``name`` is used as a column name in the
        log-rank test.
    clin_data_fn : str
        Path of the delimited clinical file. The first column becomes the
        index; the columns ``overall_survival`` and ``vital_status`` are read.
    delimiter : str
        Field separator of ``clin_data_fn``.
    lr_test : bool
        Run a multi-class log-rank test and add its p-value to the title.
    tmax :
        Forwarded to the log-rank test as ``t_0``.
    verbose : bool
        Print progress messages.
    **save_args :
        ``job_name`` prefixes the title and the file name; ``outdir`` (used
        as a plain string prefix) triggers saving and showing the figure.

    Returns
    -------
    The log-rank p-value when ``lr_test`` is true, otherwise ``None``.
    """
    title = 'KM Survival Plot'
    if 'job_name' in save_args:
        title = save_args['job_name'] + ' KM Survival Plot'

    # Initialize KM plotter
    kmf = KaplanMeierFitter()
    # Load and format clinical data
    surv = pd.read_csv(clin_data_fn, sep=delimiter, index_col=0)
    # Number of clusters
    clusters = sorted(cluster_assign.value_counts().index)
    k = len(clusters)
    # Initialize KM Plot Settings
    plt.figure(figsize=(10, 7))
    ax = plt.subplot(1, 1, 1)
    colors = sns.color_palette('hls', k)
    cluster_cmap = dict(zip(clusters, colors))
    # Plot each cluster onto KM Plot
    for clust in clusters:
        clust_pats = cluster_assign[cluster_assign == clust].index
        # `.ix` was removed in pandas 1.0. Selecting with isin() keeps the
        # patients present in the clinical table and ignores the rest, which
        # matches the old "NaN row, then dropna()" outcome; row order does
        # not affect the fit.
        clust_surv_data = surv.loc[surv.index.isin(clust_pats)].dropna()
        kmf.fit(clust_surv_data.overall_survival,
                clust_surv_data.vital_status,
                label='Group ' + str(clust) + ' (n=' +
                str(len(clust_surv_data)) + ')')
        kmf.plot(ax=ax, color=cluster_cmap[clust], ci_show=False)
    # Set KM plot limits to 5 years and labels
    # if tmax!=-1:
    plt.xlim((0, 1825))
    plt.xlabel('Time (Days)', fontsize=16)
    plt.ylabel('Survival Probability', fontsize=16)
    # Multivariate logrank test
    p = None
    if lr_test:
        cluster_survivals = pd.concat([surv, cluster_assign],
                                      axis=1).dropna().astype(int)
        p = multiv_lr_test(np.array(cluster_survivals.overall_survival),
                           np.array(cluster_survivals[cluster_assign.name]),
                           t_0=tmax,
                           event_observed=np.array(
                               cluster_survivals.vital_status)).p_value
        if verbose:
            # Single pre-formatted argument: prints identically on Python 2
            # and Python 3.
            print('Multi-Class Log-Rank P: %s' % p)
        plt.title(title + '\np=' + repr(round(p, 4)), fontsize=24, y=1.02)
    else:
        plt.title(title, fontsize=24, y=1.02)
    # Save KM plot
    if 'outdir' in save_args:
        if 'job_name' in save_args:
            save_KMplot_path = save_args['outdir'] + str(
                save_args['job_name']) + '_KM_plot.png'
        else:
            save_KMplot_path = save_args['outdir'] + 'KM_plot.png'
        plt.savefig(save_KMplot_path, bbox_inches='tight')
        plt.show()
    if verbose:
        print('Kaplan Meier Plot constructed')
    return p
Example #46
0
'''
    T  E    group
0   6  1  miR-137
1  13  1  miR-137
2  13  1  miR-137
3  13  1  miR-137
4  19  1  miR-137
'''

# Durations, event flags and group labels
T, E = df['T'], df['E']
groups = df['group']

# Boolean mask selecting the miR-137 rows; every other row is the control
ix = groups == 'miR-137'

kmf = KaplanMeierFitter()

# Control curve first: it creates the axis
kmf.fit(durations=T[~ix], event_observed=E[~ix], label='control')
ax = kmf.plot()

# miR-137 curve on the same axis
kmf.fit(durations=T[ix], event_observed=E[ix], label='miR-137')
kmf.plot(ax=ax)

plt.ylabel('Survival Probability')
outFile = 'lifelines_survival.png'
ISP_mystyle.showData(outFile)

# Compare the two curves
results = logrank_test(T[ix],
                       T[~ix],
                       event_observed_A=E[ix],
def get_kmf_fit(qs):
    """Fit a Kaplan-Meier estimator on a queryset and return the fitter.

    Durations are read from the ``days_since_complaint`` field and the
    event-observed flags from the ``is_closed`` field of ``qs``.
    """
    durations = qs.values_list('days_since_complaint', flat=True)
    closed_flags = qs.values_list('is_closed', flat=True)
    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=closed_flags)
    return fitter
Example #48
0
# We use the Kaplan-Meier estimator (also called the product-limit estimator):
# a non-parametric statistic for estimating survival from lifetime data.
import pandas as pd
from lifelines.datasets import load_dd

data = load_dd()
data.sample(2)
# The boolean column `observed` records whether the death (leaving office)
# was observed or not.

# In other words, `observed` tells us whether or not a row is right-censored.

# This example uses KaplanMeierFitter, but BreslowFlemingHarringtonFitter,
# WeibullFitter or ExponentialFitter could be used as well.
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()

# "For this estimation, we need the duration each leader was/has been in office, and whether or not
# they were observed to have left office (leaders who died in office or were in office in 2008, the latest date this
# data was recorded at, do not have observed death events)"

# How the KaplanMeierFitter works:
#
# KaplanMeierFitter.fit(durations, event_observed=None,
#                       timeline=None, entry=None, label='KM_estimate',
#                       alpha=None, left_censorship=False, ci_labels=None)
#
# Parameters:
#   durations: an array, or pd.Series, of length n -- duration the subject was observed for
#   timeline: return the best estimate at the values in timeline (positively increasing)
#   event_observed: an array, or pd.Series, of length n -- True if the death was observed, False otherwise
Example #49
0
def _plot_kmf_single(df,
                     condition_col,
                     survival_col,
                     censor_col,
                     threshold,
                     title,
                     xlabel,
                     ylabel,
                     ax,
                     with_condition_color,
                     no_condition_color,
                     with_condition_label,
                     no_condition_label,
                     color_map,
                     label_map,
                     color_palette,
                     ci_show,
                     print_as_title):
    """
    Draw a single KM survival plot for the rows of df, grouped by condition_col,
    and return the matching statistical result.

    Grouping rules:
      - threshold given: two groups, `condition_col > threshold` and the rest
        ("median" uses the column median as the threshold)
      - object / category column: one group per distinct value
      - bool column: the True and False groups
    Any other dtype raises ValueError.

    The returned object is a log-rank test result for exactly 2 groups, a
    NullSurvivalResults for 1 group, and a fitted CoxPHFitter otherwise. It is
    annotated with `survival_data_series`, `event_data_series` and `desc`.

    Every argument is required - `plot_kmf` is the intended caller.
    """
    # bring valid colour inputs into a uniform hex representation
    if colors.is_color_like(with_condition_color):
        with_condition_color = colors.to_hex(with_condition_color)
    if colors.is_color_like(no_condition_color):
        no_condition_color = colors.to_hex(no_condition_color)

    def _two_group_maps(default_with, default_no):
        # Caller-supplied maps win; otherwise build True/False maps from the
        # explicit labels (falling back to the defaults) and the two colours.
        labels = label_map or {False: no_condition_label or default_no,
                               True: with_condition_label or default_with}
        palette = color_map or {False: no_condition_color,
                                True: with_condition_color}
        return labels, palette

    ## derive the three things the plotting loop needs:
    # - `condition`: the series of group keys
    # - `label_map`: group key -> legend label
    # - `color_map`: group key -> line colour
    column = df[condition_col]
    if threshold is not None:
        use_median = threshold == "median"
        if use_median:
            threshold = column.median()
        threshold_text = float_str(threshold)
        condition = column > threshold
        label_below = "%s ≤ %s" % (condition_col, threshold_text)
        # only the "above" label carries the median marker
        if use_median:
            threshold_text += " (median)"
        label_above = "%s > %s" % (condition_col, threshold_text)
        label_map, color_map = _two_group_maps(label_above, label_below)
    elif column.dtype == 'O' or column.dtype.name == "category":
        condition = column.astype("category")
        if not label_map:
            label_map = {value: '{} = {}'.format(condition_col, value)
                         for value in condition.unique()}
        if not color_map:
            rgb_values = sb.color_palette(color_palette, len(label_map))
            color_map = dict(zip(label_map,
                                 (colors.to_hex(rgb) for rgb in rgb_values)))
    elif column.dtype == 'bool':
        condition = column
        label_map, color_map = _two_group_maps("= {}".format(condition_col),
                                               "¬ {}".format(condition_col))
    else:
        raise ValueError('Don\'t know how to plot data of type\
                         {}'.format(df[condition_col].dtype))

    # fit and draw one KM curve per group
    kmf = KaplanMeierFitter()
    grp_desc = []
    grp_survival_data = {}
    grp_event_data = {}
    grp_names = list(condition.unique())
    for grp_name, grp_df in df.groupby(condition):
        durations = grp_df[survival_col]
        events = grp_df[censor_col].astype(bool)
        legend_label = label_map[grp_name]
        plot_kwargs = dict(show_censors=True,
                           ci_show=ci_show,
                           color=color_map[grp_name])
        kmf.fit(durations, events, label=legend_label)
        grp_desc.append("# {}: {}".format(legend_label, len(durations)))
        grp_survival_data[grp_name] = durations
        grp_event_data[grp_name] = events
        # reuse the axis once one exists; the first plot call creates it
        if ax:
            plot_kwargs['ax'] = ax
        ax = kmf.plot(**plot_kwargs)

    ## cosmetics
    # y-axis spans 0..1 and is labelled in whole percent
    ax.set_ylim(0, 1)
    percent_labels = ["%d" % int(tick * 100) for tick in ax.get_yticks()]
    ax.set_yticklabels(percent_labels)
    # title: explicit title, else the group sizes, else print them to stdout
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title(' | '.join(grp_desc))
    else:
        for desc in grp_desc:
            print(desc)
    # axis labels
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    ## analytical counterpart of the plot, on the same groups
    n_groups = len(grp_names)
    if n_groups == 2:
        # two groups: log-rank test
        first, second = grp_names
        results = logrank_test(grp_survival_data[first],
                               grp_survival_data[second],
                               event_observed_A=grp_event_data[first],
                               event_observed_B=grp_event_data[second])
    elif n_groups == 1:
        # a single group has nothing to be compared against
        results = NullSurvivalResults()
    else:
        # any other number of groups: Cox proportional hazards fit
        formula = '+'.join([condition_col, survival_col, censor_col])
        cox_df = patsy.dmatrix(formula, df, return_type='dataframe')
        del cox_df['Intercept']
        cf = CoxPHFitter()
        results = cf.fit(cox_df, survival_col, event_col=censor_col)
        results.print_summary()
    # attach the per-group inputs so the caller can report them
    results.survival_data_series = grp_survival_data
    results.event_data_series = grp_event_data
    results.desc = grp_desc
    return results
from lifelines.statistics import logrank_test

def seen_death(row):
    """Return True when the row's last date is older than the cutoff.

    The cutoff is 2014-09-15 minus the mean of the module-level
    ``df["max_interval"]`` column, truncated to whole days.
    """
    mean_gap_days = int(df["max_interval"].mean())
    cutoff = datetime.datetime(2014, 9, 15) - datetime.timedelta(days=mean_gap_days)
    last_date = row["dates"][-1]
    return last_date < cutoff


# Connect to the local MongoDB server; each database name is treated as one community.
# NOTE(review): MongoClient.database_names() is deprecated in newer pymongo
# releases in favour of list_database_names() -- confirm the installed version.
connection = pymongo.MongoClient('localhost', 27017)
communities = connection.database_names()

# Remove the databases that are not communities
for db in ["gender", "admin", "local", "visualizations", "results"]:
	if db in communities:communities.remove(db)

# Handle on the results.question_3 collection
results_db = connection['results']['question_3']

kmf = KaplanMeierFitter()

for community in communities:

	community_db = connection[community]['statistics']
	cursor = community_db.find({'contributions_total': {'$gt':0},  'gender': {'$ne': "Unknown"} }, 
                       			{u'_id': False, 'lifetime': True, 'max_interval': True, 
                       				u'gender':True, 'activity_freq': True, 'dates':True} )

	df =  pandas.DataFrame(list(cursor))

	df["dead"] = df.apply(seen_death, axis=1)

	males = df[df['gender']=='Male']
	females = df[df['gender']=='Female']
Example #51
0
def plot_kmf(df,
             condition_col,
             censor_col,
             survival_col,
             threshold=None,
             title=None,
             xlabel=None,
             ax=None,
             print_as_title=False):
    """
    Plot two Kaplan-Meier curves, one for the rows where the condition holds
    and one for the remaining rows, and return their log-rank test.

    With a threshold, the condition is `condition_col > threshold`
    (threshold == 'median' uses the median of condition_col). Without one,
    condition_col itself is used as the mask.

    Parameters
    ----------
        df: dataframe
        condition_col: string, column holding the condition to split on
        censor_col: string, column cast to bool and used as the event flag
        survival_col: string, column holding the survival time
        threshold: number or 'median', optional, default None
        title: title for the plot, default None
        xlabel: x-axis label, default None
        ax: an existing matplotlib ax, optional, default None
        print_as_title: bool, optional, put the group sizes in the plot's
          title instead of printing them to stdout, default False

    Returns
    -------
        the result of logrank_test on the two groups
    """
    kmf = KaplanMeierFitter()

    # build the boolean mask and the legend label of the "with" group
    if threshold is None:
        condition = df[condition_col]
        label = '{}'.format(condition_col)
    else:
        if threshold == 'median':
            threshold = df[condition_col].median()
        condition = df[condition_col] > threshold
        label = '{} > {}'.format(condition_col, threshold)

    rows_with = df[condition]
    rows_without = df[~condition]
    survival_no_condition = rows_without[survival_col]
    survival_with_condition = rows_with[survival_col]
    event_no_condition = rows_without[censor_col].astype(bool)
    event_with_condition = rows_with[censor_col].astype(bool)

    # "without" group first (empty legend label); it creates the axis if
    # the caller did not pass one
    kmf.fit(survival_no_condition, event_no_condition, label="")
    if not ax:
        ax = kmf.plot(show_censors=True, ci_show=False)
    else:
        kmf.plot(ax=ax, show_censors=True, ci_show=False)

    # "with" group on the same axis
    kmf.fit(survival_with_condition, event_with_condition, label=label)
    kmf.plot(ax=ax, show_censors=True, ci_show=False)

    # y-axis spans 0..1
    ax.set_ylim(0, 1)

    # group sizes go to the title or to stdout unless a title was given
    no_cond_str = "# no condition {}".format(len(survival_no_condition))
    cond_str = "# with condition {}".format(len(survival_with_condition))
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title("%s | %s" % (no_cond_str, cond_str))
    else:
        print(no_cond_str)
        print(cond_str)

    if xlabel:
        ax.set_xlabel(xlabel)

    return logrank_test(survival_no_condition,
                        survival_with_condition,
                        event_observed_A=event_no_condition,
                        event_observed_B=event_with_condition)
Example #52
0
		return t
	elif is_number(c['year_of_birth']) == True and is_number(c['age_at_diagnosis']) == True and is_number(c['days_to_death']) == False:
		t = 2018 - float(c['year_of_birth']) - (float(c['age_at_diagnosis'])*4/(365*3 + 366))
		return t
	else:
		return "NotApplicable"

# Derive one duration and one event value per row, then keep only the
# barcode plus those two columns
matrix['duration'] = matrix.apply(duration, axis=1)
matrix['event'] = matrix.apply(event, axis=1)
keep_cols = ['bcr_sample_barcode', 'duration', 'event']
matrix = matrix[keep_cols]

# Discard rows for which no duration could be derived
has_duration = matrix['duration'] != "NotApplicable"
matrix = matrix[has_duration]


# Fit the Kaplan-Meier estimator on all remaining rows
kmf = KaplanMeierFitter()
kmf.fit(durations=matrix.duration, event_observed=matrix.event)

kmf.survival_function_

# Draw the estimate with a title and a y-axis label
kmf.plot()
plt.title("The Kaplan-Meier Estimate for BRCA (total)")
plt.ylabel("Probability a patient is still active")

plt.show()


print(df.head())
'''
    T  E    group
0   6  1  miR-137
1  13  1  miR-137
2  13  1  miR-137
3  13  1  miR-137
4  19  1  miR-137
'''

# Durations, event flags and group labels
T, E = df['T'], df['E']
groups = df['group']

# Boolean mask selecting the miR-137 rows; every other row is the control
ix = groups == 'miR-137'

kmf = KaplanMeierFitter()

# Control curve first: it creates the axis
kmf.fit(durations=T[~ix], event_observed=E[~ix], label='control')
ax = kmf.plot()

# miR-137 curve on the same axis
kmf.fit(durations=T[ix], event_observed=E[ix], label='miR-137')
kmf.plot(ax=ax)

plt.ylabel('Survival Probability')
plt.show()

# Log-rank comparison of the two curves
results = logrank_test(T[ix],
                       T[~ix],
                       event_observed_A=E[ix],
                       event_observed_B=E[~ix])
results.print_summary()
Example #54
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter

# kmf object for use throughout
kmf = KaplanMeierFitter()

#### Cleaning Procedure-------------------------

# Loading CSVs from the local machine (see repository for data, originally from Kaggle).
# Raw strings keep the Windows-style backslash literal without relying on an
# invalid escape sequence such as "\Q", which newer Python versions warn about.
ans = pd.read_csv(r"~\Answers.csv", encoding='latin-1')
qus = pd.read_csv(r"~\Questions.csv", encoding='latin-1')

# Reducing answers to the best-scoring one for each question: sort by score
# (descending) within each question, then keep the first row per ParentId.
ans = ans.sort_values(['ParentId', 'Score'], ascending=[True, False])
ans = ans.drop_duplicates(subset='ParentId')

# Merging questions and answers together using a left outer join, so that
# questions without any answer are kept (their answer columns are NaN).
sf = pd.merge(qus, ans, how='left', left_on='Id', right_on='ParentId')

# Altering the answer scores for modeling. For our purposes, a question whose
# best answer has fewer than 3 votes (or that has no answer at all, since NaN
# compares False) is considered "unanswered".
sf['event'] = sf.Score_y >= 3

# Creating time variables: the time between the question being asked and its
# best-scoring answer being posted, in minutes and in hours.
sf['ans_date'] = pd.to_datetime(sf['CreationDate_y'])
sf['ask_date'] = pd.to_datetime(sf['CreationDate_x'])
sf['duration'] = sf['ans_date'] - sf['ask_date']
sf['duration_min'] = sf['duration'].dt.total_seconds() / 60
sf['duration_hr'] = sf["duration_min"] / 60
def kmplot(df_high, df_low, ax):
    """Plot Kaplan-Meier curves for two groups on ``ax`` and compare them.

    Parameters
    ----------
    df_high, df_low
        Data for the "high" and "low" groups. Each must expose ``duration``
        and ``event`` attributes (e.g. DataFrame columns) and support
        ``len()``.
    ax
        Matplotlib axes that both curves, the labels, the p-value text and
        the legend are drawn on.

    Returns
    -------
    tuple
        ``(p_value, hm5, hm10, lm5, lm10)``: the log-rank p-value followed by
        the fitted estimates at times 60 and 120 for the high group and then
        the low group (presumably 5 and 10 years, given the month-based axis
        label -- confirm against the data's time unit). If either fit raises
        ``ValueError`` nothing is drawn and the placeholder
        ``("NA", "0", "0", "0", "0")`` is returned instead.
    """
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(
            durations=df_high.duration,
            event_observed=df_high.event,
            label='High: n = ' + str(len(df_high)),
        )
        kmf_low.fit(
            durations=df_low.duration,
            event_observed=df_low.event,
            label="Low: n = " + str(len(df_low)),
        )
    except ValueError:
        # A group that cannot be fitted: report placeholders so the caller
        # can still tabulate a row for it.
        return ("NA", "0", "0", "0", "0")

    kmf_high.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    kmf_low.plot(ax=ax, color="black", show_censors=True, ci_show=False)

    statistics_result = logrank_test(
        df_high.duration,
        df_low.duration,
        event_observed_A=df_high.event,
        event_observed_B=df_low.event,
    )
    p_value = statistics_result.p_value

    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    # Axes-relative coordinates: bottom-right corner of the panel.
    ax.text(
        0.95,
        0.02,
        'logrank P = ' + '%.4f' % p_value,
        verticalalignment='bottom',
        horizontalalignment='right',
        transform=ax.transAxes,
        color='black',
        fontsize=11,
    )
    # Place the legend on the axes that were drawn on; a pyplot-level legend
    # call would target the *current* axes, which need not be ``ax`` when the
    # figure has several subplots.
    ax.legend(loc=3)

    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
Example #56
0
# Censor some observations
cutoff = 30 # Generate a censor length
# Repeat the cutoff N times so it can be compared element-wise with event_t.
cutoff = np.repeat(cutoff, N) 
duration = np.minimum(event_t,cutoff) # "Cut-off" observations over cutoff level
# True where the event time was not shortened by the cutoff, i.e. the event
# was actually observed (the observation is NOT censored).
not_censor = event_t <= duration
not_censor = not_censor.astype(int) # convert boolean to zeroes and ones

# Convert to data frame
data = pd.DataFrame({'duration': duration, 'event': not_censor, 'age': age, 'college': college})

# Plot observations with censoring
# plot_lifetimes(duration, event_observed = not_censor)

# Kaplan Meier Summary for Simulated Data
from lifelines import KaplanMeierFitter
kmf =  KaplanMeierFitter()
kmf.fit(duration, event_observed = not_censor)
kmf.survival_function_.plot()

# Cox-PH Model Regression
# Every column of `data` other than 'duration' and 'event' (here 'age' and
# 'college') is passed along with the frame -- presumably used as covariates;
# confirm against the lifelines documentation.
from lifelines import CoxPHFitter
cf = CoxPHFitter()
cf.fit(data, 'duration', event_col = 'event')
cf.print_summary()

## Get Predictions from Model ##

# 24 year old college grad
#college_24 = pd.DataFrame({'age':[24], 'college':[1]})
#cf.predict_survival_function(college_24).plot()
Example #57
0
# Join the per-bootstrap prediction arrays side by side (along axis 1).
# NOTE(review): presumably one column per bootstrap replicate and one row per
# sample -- confirm against where preds_bootfull is built.
preds_bootfull_mat = np.concatenate(preds_bootfull, axis=1)
# Transpose so that each column holds the indices drawn for one entry of
# inds_inbag.
inds_inbag_mat = np.array(inds_inbag).T
# inbag_mask[i, j] is 1 when the value i occurs anywhere in column j of
# inds_inbag_mat, else 0. i runs over range(inds_inbag_mat.shape[0]), which
# assumes the number of drawn indices equals the number of samples -- TODO confirm.
inbag_mask = 1 * np.array([
    np.any(inds_inbag_mat == _, axis=0) for _ in range(inds_inbag_mat.shape[0])
])
# Row-wise average of the predictions over the columns where the mask is 0
# (i.e. where the row's index was not drawn): sum of the unmasked predictions
# divided by their count.
preds_bootave_oob = np.divide(
    np.sum(np.multiply((1 - inbag_mask), preds_bootfull_mat), axis=1),
    np.sum(1 - inbag_mask, axis=1))
# 1 for rows whose averaged prediction is above the median, 0 otherwise.
risk_groups = 1 * (preds_bootave_oob > np.median(preds_bootave_oob))

# Working frame: y_orig followed by the averaged prediction and the risk
# group. The column names assume y_orig has exactly two columns, in the order
# (status, time) -- TODO confirm. The index is the risk group as a string so
# that .loc['0'] / .loc['1'] below select a whole group.
wdf = pd.DataFrame(np.concatenate(
    (y_orig, preds_bootave_oob[:, np.newaxis], risk_groups[:, np.newaxis]),
    axis=-1),
                   columns=['status', 'time', 'preds', 'risk_groups'],
                   index=[str(_) for _ in risk_groups])

# One fitter is re-used: fit and plot group '0', then re-fit and plot group
# '1' on the same axes.
kmf = KaplanMeierFitter()
ax = plt.subplot(111)
kmf.fit(durations=wdf.loc['0', 'time'],
        event_observed=wdf.loc['0', 'status'],
        label="Low Risk")
ax = kmf.plot(ax=ax)
kmf.fit(durations=wdf.loc['1', 'time'],
        event_observed=wdf.loc['1', 'status'],
        label="High Risk")
ax = kmf.plot(ax=ax)
plt.ylim(0, 1)
plt.title("Kaplan-Meier Plots")
plt.xlabel('Time (days)')
plt.ylabel('Survival Probability')
Example #58
0
 def kmf(self):
     """Return a newly constructed ``KaplanMeierFitter`` (not yet fitted)."""
     fitter = KaplanMeierFitter()
     return fitter
Example #59
0
# Stop training once the training loss has not improved for 2 consecutive
# epochs; with this callback the very large `epochs` value acts as an upper
# bound rather than the expected number of epochs.
early_stopping = EarlyStopping(monitor='loss', patience=2)
history = model.fit(x_train,
                    y_train,
                    batch_size=256,
                    epochs=100000,
                    callbacks=[early_stopping])
# Model output for the training inputs themselves.
y_pred = model.predict_proba(x_train, verbose=0)

#Example of finding model-predicted survival probability.
#Predicted survival prob. for first individual at follow-up time of 30 days:
pred_surv = nnet_survival.nnet_pred_surv(
    model.predict_proba(x_train, verbose=0), breaks, 30)
print(pred_surv[0])

#Plot predicted vs. actual survival
kmf = KaplanMeierFitter()
kmf.fit(t, event_observed=f)
# Predicted curve for the first individual: running product of that row of
# y_pred, with a leading 1 so the curve has one point per entry of `breaks`.
# NOTE(review): presumably each y_pred column is the conditional probability
# of surviving one interval -- confirm against nnet_survival.
plt.plot(breaks, np.concatenate(([1], np.cumprod(y_pred[0, :]))), 'bo-')
# Kaplan-Meier estimate fitted above, drawn in black for comparison.
plt.plot(kmf.survival_function_.index.values,
         kmf.survival_function_.KM_estimate,
         color='k')
plt.xlabel('Follow-up time (days)')
plt.ylabel('Proportion surviving')
plt.title(
    'All patients from same survival distribution, no censoring. Actual=black, predicted=blue.'
)
plt.show()

############################################################################
#Flexible model (non-proportional hazards).
#All pts with same exponential survival distribution, some patients censored
Example #60
0
def execute():
    """Plot Kaplan-Meier survival curves for the most common incident categories.

    Queries incident duration, subject survival, group category and group
    size from the ISRID SQLite database and produces two figures:

    * a 2x2 grid with one panel for each of the (up to) four most common
      categories, each panel showing one curve for groups (size > 1) and one
      for single subjects (size == 1), saved as ``km-grid-large.svg``;
    * a combined figure drawn from the fitter kept for each category, saved
      as ``km-combined-large.svg``.

    Both files are written under ``../doc/figures/kaplan-meier/`` and the
    figures are then shown. Takes no arguments and returns ``None``.
    """
    matplotlib.rc("font", size=20)

    engine, session = database.initialize("sqlite:///../data/isrid-master.db")

    # Query with Group.size may take awhile, at least for Charles
    # Not sure why
    query = session.query(Incident.total_hours, Subject.survived, Group.category, Group.size).join(Group, Subject)
    print("Tabulating query... may take awhile for unknown reasons.")
    df = tabulate(query)
    print("Done tabulating.")
    print(df.describe())
    database.terminate(engine, session)

    # Express the incident duration in days and flag subjects that did not
    # survive ("doa"); the latter is the event passed to the fitter.
    df = df.assign(
        days=[total_hours.total_seconds() / 3600 / 24 for total_hours in df.total_hours],
        doa=[not survived for survived in df.survived],
    )
    # Keep only rows with a non-negative duration.
    df = df[0 <= df.days]

    rows, columns = 2, 2
    grid, axes = plt.subplots(rows, columns, figsize=(15, 10))

    categories = Counter(df.category)
    kmfs = []
    options = {"show_censors": True, "censor_styles": {"marker": "|", "ms": 6}, "censor_ci_force_lines": False}

    # One panel per category, filled row by row, most common category first.
    for plot, (category, _) in enumerate(categories.most_common()[: rows * columns]):
        print("Category:", category)
        ax = axes[plot // columns, plot % columns]
        df_ = df[df.category == category]
        N, Ndoa = len(df_), sum(df_.doa)
        Srate = 100 * (1 - Ndoa / N)

        # Item access is required for this column: ``df_.size`` is the
        # DataFrame's element-count attribute, not the "size" column.
        group_size = df_["size"]
        grp = df_[group_size > 1]
        sng = df_[group_size == 1]

        kmf = KaplanMeierFitter()
        kmf.fit(grp.days, event_observed=grp.doa, label=category + " Groups")
        kmf.plot(ax=ax, **options)
        kmf.fit(sng.days, event_observed=sng.doa, label=category + " Singles")
        kmf.plot(ax=ax, **options)
        # NOTE(review): the fitter was just re-fitted on the singles, so the
        # object stored here (and drawn in the combined figure below) holds
        # the "Singles" curve only -- confirm this is intended.
        kmfs.append(kmf)

        # Show at most the first 30 days.
        ax.set_xlim(0, min(30, 1.05 * ax.get_xlim()[1]))
        ax.set_ylim(0, 1)
        ax.set_title("{}, N = {}, DOA = {}, {:.0f}% surv".format(category, N, Ndoa, Srate))
        ax.set_xlabel("Total Incident Time (days)")
        ax.set_ylabel("Probability of Survival")

    grid.suptitle("Kaplan-Meier Survival Curves", fontsize=25)
    grid.tight_layout()
    # Leave room above the panels for the figure-level title.
    grid.subplots_adjust(top=0.9)
    grid.savefig("../doc/figures/kaplan-meier/km-grid-large.svg", transparent=True)

    combined = plt.figure(figsize=(15, 10))
    ax = combined.add_subplot(1, 1, 1)
    for kmf in kmfs[: rows * columns]:
        kmf.plot(ci_show=False, show_censors=True, censor_styles={"marker": "|", "ms": 6}, ax=ax)

    ax.set_xlim(0, 15)
    ax.set_ylim(0, 1)
    ax.set_xlabel("Total Incident Time (days)")
    ax.set_ylabel("Probability of Survival")
    ax.set_title("Kaplan-Meier Survival Curves", fontsize=25)
    ax.grid(True)
    combined.savefig("../doc/figures/kaplan-meier/km-combined-large.svg", transparent=True)

    plt.show()