def survival_ll_nelson_aalen(content):
    """Fit a Nelson-Aalen estimator and return its curves as a wrapped JSON body.

    Parameters
    ----------
    content
        Mapping with a ``'times'`` entry (durations) and an ``'events'``
        entry (passed to the fitter as ``event_observed``).

    Returns
    -------
    Whatever ``httpWrapper`` returns for the JSON document with the keys
    ``'hazard'`` (cumulative hazard) and ``'confidence'`` (confidence
    interval), both produced with ``DataFrame.to_dict()``.
    """
    fitter = NelsonAalenFitter()
    fitter.fit(content['times'], event_observed=content['events'])

    payload = {
        'hazard': fitter.cumulative_hazard_.to_dict(),
        'confidence': fitter.confidence_interval_.to_dict(),
    }
    # NOTE(review): ``ignore_nan`` is not a stdlib ``json.dumps`` keyword;
    # presumably ``json`` is bound to simplejson in this module - confirm.
    return httpWrapper(json.dumps(payload, ignore_nan=True))
def concat_hazard_curve(T, C):
    """Fit an unsmoothed Nelson-Aalen estimator and sample it at 1..max_idx.

    Parameters
    ----------
    T
        Durations handed to ``NelsonAalenFitter.fit``.
    C
        Event indicators, passed as ``event_observed``.

    Returns
    -------
    tuple
        ``(cumulative_hazard_values, confidence_interval_values)`` as arrays,
        both reindexed to the integer labels ``1 .. args.max_idx``
        (``args`` is read from module scope). Labels absent from the fitted
        timeline come back as NaN.
    """
    naf = NelsonAalenFitter(nelson_aalen_smoothing=False)
    naf.fit(T, event_observed=C)

    # ``reindex`` expects the target labels as a single sequence; the former
    # calls passed loose scalars (and one of them spelled ``args.max_idx`` as
    # ``args, max_idx``), which cannot select the intended rows.
    target_index = range(1, args.max_idx + 1)
    return (
        naf.cumulative_hazard_.reindex(target_index).values,
        naf.confidence_interval_.reindex(target_index).values,
    )
def NelsonAelan_dash(T, C):
    """Plot a Nelson-Aalen estimate twice and pass the figure to ``pyplot``.

    ``T`` are the durations and ``C`` the event indicators used for the fit.
    The estimate is drawn once with default settings and once with
    ``ci_force_lines=True``; the current matplotlib figure is then handed to
    ``pyplot`` with the legend disabled.
    """
    plot_title = 'Nelson-Aalen Estimate'

    fitter = NelsonAalenFitter()
    fitter.fit(T, event_observed=C)
    fitter.plot(title=plot_title)
    fitter.plot(ci_force_lines=True, title=plot_title)

    current_figure = plt.gcf()
    pyplot(current_figure, legend=False)
def _vval2ByBootstrap(timeline, nstraps=1000):
    """Bootstrap an inverse-standard-deviation weight for two treatment arms.

    ``df``, ``treatment_col``, ``duration_col`` and ``event_col`` are read
    from the enclosing scope.

    Parameters
    ----------
    timeline
        Array of time points (``timeline.shape[0]`` rows) at which each
        bootstrap survival curve is sampled with forward fill.
    nstraps
        Number of bootstrap resamples of ``df`` (rows drawn with replacement).

    Returns
    -------
    numpy.ndarray
        One value per time point:
        ``1 / sqrt(nanvar(log S0) + nanvar(log S1))`` where the variances are
        taken across bootstrap replicates and ``S = exp(-cumulative hazard)``.
    """
    n_times = timeline.shape[0]
    sa1_b = np.zeros((n_times, nstraps))
    sa2_b = np.zeros((n_times, nstraps))

    def _arm_survival(sample, arm):
        # Survival curve of one treatment arm of ``sample`` on ``timeline``.
        # The mask is built from the resampled frame itself, so selection does
        # not depend on aligning against the original frame's index.
        mask = sample[treatment_col] == arm
        naf = NelsonAalenFitter()
        naf.fit(durations=sample.loc[mask, duration_col],
                event_observed=sample.loc[mask, event_col])
        surv = np.exp(-naf.cumulative_hazard_.iloc[:, 0])
        return surv.reindex(timeline, method='ffill').values

    for sampi in range(nstraps):
        tmp = df.sample(frac=1, replace=True, axis=0)
        sa1_b[:, sampi] = _arm_survival(tmp, 0)
        sa2_b[:, sampi] = _arm_survival(tmp, 1)

    vval2 = 1 / np.sqrt(
        np.nanvar(np.log(sa1_b), axis=1) + np.nanvar(np.log(sa2_b), axis=1))
    return vval2
def calcSurvHazardCat(df: pd.DataFrame, *, hazardcol: str = "hazard") -> pd.DataFrame:
    """
    Attach the Nelson-Aalen cumulative hazard at each row's own time.

    Offered as an alternative to the raw (and often censored) survival time.

    Parameters
    ----------
    df
        Data frame that must contain the columns ``time`` and ``event``.
        It is modified in place.
    hazardcol
        Name of the column that receives the hazard values.

    Returns
    -------
    The same data frame object, now carrying the ``hazardcol`` column.
    """
    durations, observed = df["time"], df["event"]

    # Fit the estimator on the whole cohort, then read it back per patient.
    estimator = NelsonAalenFitter()
    estimator.fit(durations, observed)

    per_patient_hazard = estimator.predict(durations)
    df[hazardcol] = per_patient_hazard.tolist()
    return df
def test_naf_plot_cumulative_hazard_bandwith_1(self, block):
    """Visual check: smoothed hazard plot with bandwidth 5.0 on squared exponential draws.

    ``block`` is forwarded to ``self.plt.show``.
    """
    samples = np.random.exponential(5, size=(2000, 1)) ** 2

    fitter = NelsonAalenFitter()
    fitter.fit(samples)
    # Only the first 1700 rows of the estimate are drawn.
    fitter.plot_hazard(bandwidth=5.0, iloc=slice(0, 1700))

    self.plt.title("test_naf_plot_cumulative_hazard_bandwith_1")
    self.plt.show(block=block)
def createHazardGraph(durations, event_observed):
    """Show the Nelson-Aalen cumulative hazard curve without confidence bands.

    ``durations`` and ``event_observed`` are passed positionally to
    ``NelsonAalenFitter.fit``; the figure is displayed with ``plt.show()``.
    """
    estimator = NelsonAalenFitter()
    estimator.fit(durations, event_observed)
    estimator.plot(ci_show=False)

    plt.title("Hard Drive Nelson-Aalen Hazard Estimate")
    plt.ylabel("Cumulative Hazard")
    plt.show()
def test_naf_plot_cumulative_hazard(self, block):
    """Visual check: ``plot`` and ``plot_cumulative_hazard`` drawn on the same axes.

    ``block`` is forwarded to ``self.plt.show``.
    """
    samples = np.random.exponential(5, size=(200, 1))

    fitter = NelsonAalenFitter()
    fitter.fit(samples)

    axes = fitter.plot()
    fitter.plot_cumulative_hazard(ax=axes, ci_force_lines=True)

    self.plt.title("I should have plotted the same thing, but different styles + color!")
    self.plt.show(block=block)
def survival_ll_nelson_aalen(content):
    """Fit a Nelson-Aalen estimator and return survival, hazard and median as JSON.

    The previous body referenced an undefined bare name
    (``cumulative_hazard_``), a non-existent attribute chain
    (``kmf.kmf.median_``) and passed DataFrames straight to ``json.dumps``.
    The three response keys are kept; their values are now derived from the
    fitted cumulative hazard.

    Parameters
    ----------
    content
        Mapping with a ``'times'`` entry (durations) and an ``'events'``
        entry (passed to the fitter as ``event_observed``).

    Returns
    -------
    Whatever ``httpWrapper`` returns for a JSON document with the keys:

    ``'result'``
        ``exp(-cumulative hazard)`` as ``DataFrame.to_dict()``.
    ``'hazard'``
        The cumulative hazard as ``DataFrame.to_dict()``.
    ``'median'``
        First time at which ``'result'`` is <= 0.5, or ``None`` when the
        curve never gets that low.
    """
    import numpy as np  # local import: only needed for the exp(-H) transform

    naf = NelsonAalenFitter()
    naf.fit(content['times'], event_observed=content['events'])

    cumulative_hazard = naf.cumulative_hazard_
    survival = np.exp(-cumulative_hazard)

    # Times at which the survival estimate has dropped to one half or below.
    crossed = survival.index[survival.iloc[:, 0] <= 0.5]
    median = float(crossed[0]) if len(crossed) else None

    return httpWrapper(
        json.dumps({
            'result': survival.to_dict(),
            'hazard': cumulative_hazard.to_dict(),
            'median': median,
        }))
def fit(
    self, durations, event_observed=None, timeline=None, entry=None, label=None, alpha=None, ci_labels=None, weights=None
):  # pylint: disable=too-many-arguments
    """
    Fit the estimator by exponentiating a negated Nelson-Aalen cumulative hazard.

    Parameters
    ----------
    durations: an array, or pd.Series, of length n
        duration subject was observed for
    timeline:
        return the best estimate at the values in timelines (positively increasing)
    event_observed: an array, or pd.Series, of length n
        True if the the death was observed, False if the event was lost (right-censored).
        Defaults all True if event_observed==None
    entry: an array, or pd.Series, of length n
        relative time when a subject entered the study. This is useful for
        left-truncated observations, i.e the birth event was not observed.
        If None, defaults to all 0 (all birth events observed.)
    label: string
        a string to name the column of the estimate. Falls back to the
        instance's existing label, then to "BFH_estimate".
    alpha: float, optional (default=0.05)
        the alpha value in the confidence intervals. Overrides the initializing
        alpha for this call to fit only.
    ci_labels: iterable
        add custom column names to the generated confidence intervals as a
        length-2 list: [<lower-bound name>, <upper-bound name>].
        Default: <label>_lower_<alpha>
    weights:
        forwarded unchanged to ``NelsonAalenFitter.fit``.
        NOTE(review): presumably per-observation weights of length n - confirm
        against the installed lifelines version.

    Returns
    -------
    self, with new properties like ``survival_function_``.
    """
    self._label = coalesce(label, self._label, "BFH_estimate")
    alpha = coalesce(alpha, self.alpha)

    naf = NelsonAalenFitter(alpha=alpha)
    # ``weights`` used to be accepted by this method but silently dropped
    # here, so a weighted fit produced the unweighted estimate.
    naf.fit(
        durations,
        event_observed=event_observed,
        timeline=timeline,
        label=self._label,
        entry=entry,
        ci_labels=ci_labels,
        weights=weights,
    )
    self.durations, self.event_observed, self.timeline, self.entry, self.event_table, self.weights = (
        naf.durations,
        naf.event_observed,
        naf.timeline,
        naf.entry,
        naf.event_table,
        naf.weights,
    )

    # estimation
    self.survival_function_ = np.exp(-naf.cumulative_hazard_)
    self.confidence_interval_ = np.exp(-naf.confidence_interval_)
    self.confidence_interval_survival_function_ = self.confidence_interval_
    self.confidence_interval_cumulative_density = 1 - self.confidence_interval_

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"

    # plotting functions
    self.plot_survival_function = self.plot
    return self
def test_naf_plotting_with_custom_colours(self, block):
    """Visual check: two fits of one estimator drawn in red and black on shared axes.

    ``block`` is forwarded to ``self.plt.show``.
    """
    first_sample = np.random.exponential(5, size=(200, 1))
    second_sample = np.random.exponential(1, size=(500))

    fitter = NelsonAalenFitter()

    fitter.fit(first_sample)
    shared_axes = fitter.plot(color="r")

    fitter.fit(second_sample)
    fitter.plot(ax=shared_axes, color="k")

    self.plt.title("test_naf_plotting_with_custom_coloirs")
    self.plt.show(block=block)
def test_naf_plotting_slice(self, block):
    """Visual check: ``loc`` and ``iloc`` slicing arguments of ``plot``.

    ``block`` is forwarded to ``self.plt.show``.
    """
    first_sample = np.random.exponential(5, size=(200, 1))
    second_sample = np.random.exponential(1, size=(200, 1))

    fitter = NelsonAalenFitter()

    # First curve: label-based slice covering everything from 0 onwards.
    fitter.fit(first_sample)
    shared_axes = fitter.plot(loc=slice(0, None))

    # Second curve: positional slice of rows 100..179, with forced CI lines.
    fitter.fit(second_sample)
    fitter.plot(ax=shared_axes, ci_force_lines=True, iloc=slice(100, 180))

    self.plt.title("test_naf_plotting_slice")
    self.plt.show(block=block)
def _fit_kaplan_meier(self):
    """Fit and cache the overall Kaplan-Meier and Nelson-Aalen estimators.

    Does nothing when ``self.kmf_fit`` is already set. Otherwise both
    estimators are fitted on ``self.time`` / ``self.event`` with
    ``self.label`` and stored on ``self.kmf_fit`` and ``self.naf_fit``.
    """
    already_fitted = self.kmf_fit is not None
    if already_fitted:
        return

    # Overall (un-stratified) estimates.
    kaplan_meier = KaplanMeierFitter()
    kaplan_meier.fit(self.time, event_observed=self.event, label=self.label)

    nelson_aalen = NelsonAalenFitter()
    nelson_aalen.fit(self.time, event_observed=self.event, label=self.label)

    self.kmf_fit = kaplan_meier
    self.naf_fit = nelson_aalen
def go():
    """Fit one unsmoothed Nelson-Aalen estimator per gender group and dill it to disk.

    Reads the module-level ``args``, ``all_files``, ``files_m`` and
    ``files_f``. For each of the groups ``'all'``, ``'m'`` and ``'f'`` the
    durations/indicators returned by ``concat_TC`` are fitted and the fitted
    estimator is serialised with ``dill`` to a file whose name encodes the
    group and the ``mode``, ``min_length``, ``ignore_first`` and ``memory``
    attributes of ``args``.
    """
    # Function-call form prints the same thing on Python 2 and Python 3.
    print(args)

    T_all, C_all = concat_TC(all_files)
    T_m, C_m = concat_TC(files_m)
    T_f, C_f = concat_TC(files_f)

    groups = (('all', T_all, C_all), ('m', T_m, C_m), ('f', T_f, C_f))
    for gender, T, C in groups:
        naf = NelsonAalenFitter(nelson_aalen_smoothing=False)
        naf.fit(T, event_observed=C)

        out_path = (
            '/backup/home/jared/storage/foraging/cm/{}_{}_shuffle_{}_{}_{}'
            .format(gender, args.mode, args.min_length, args.ignore_first,
                    args.memory))
        # Context manager guarantees the handle is flushed and closed even if
        # serialisation fails; the bare open() used before leaked it.
        with open(out_path, 'wb') as out_file:
            dill.dump(naf, out_file)
def _estimateSurv(df, ind):
    """Fit Nelson-Aalen on the rows ``ind`` of ``df`` and derive survival and its variance.

    ``duration_col``, ``event_col`` and ``_additive_var`` come from the
    surrounding scope.

    Returns
    -------
    tuple
        ``(fitter, surv, surv_var)`` where ``surv`` is
        ``exp(-cumulative hazard)`` named ``'surv'`` and ``surv_var`` is the
        cumulative sum of ``_additive_var(population, deaths)`` padded onto
        the sorted timeline, named ``'surv_var'`` with index name
        ``'timeline'``.
    """
    fitter = NelsonAalenFitter()
    fitter.fit(durations=df.loc[ind, duration_col],
               event_observed=df.loc[ind, event_col])

    # Borrowed from lifelines
    sorted_times = sorted(fitter.timeline)
    observed_deaths = fitter.event_table['observed']

    # Slowest line here.
    entered_so_far = fitter.event_table['entrance'].cumsum()
    removed_before = fitter.event_table['removed'].cumsum().shift(1).fillna(0)
    at_risk = entered_so_far - removed_before

    surv_var = np.cumsum(_additive_var(at_risk, observed_deaths))
    surv_var = surv_var.reindex(sorted_times, method='pad')
    surv_var.index.name = 'timeline'
    surv_var.name = 'surv_var'

    surv = np.exp(-fitter.cumulative_hazard_.iloc[:, 0])
    surv.name = 'surv'

    return fitter, surv, surv_var
def get_hazard_ratio_results(df, group_col, time_col, event_col):
    """Fit one Nelson-Aalen estimator per group of ``df``.

    Parameters
    ----------
    df
        Data frame containing at least ``group_col``, ``time_col`` and
        ``event_col``. Rows with a missing value in any of the three are
        ignored. The caller's frame is not modified.
    group_col
        Column whose distinct values define the groups.
    time_col
        Column of durations; cast to float.
    event_col
        Column of event indicators; converted to its pandas category codes
        before fitting.

    Returns
    -------
    list
        One fitted ``NelsonAalenFitter`` per group, labelled
        ``"<group> (N=<rows in group>)"``. Empty when no complete rows remain.
    """
    models = []

    # Explicit copy so the conversions below work on a private frame.
    df = df[[event_col, time_col, group_col]].dropna().copy()
    df[event_col] = df[event_col].astype('category').cat.codes
    df[time_col] = df[time_col].astype('float')

    if df.empty:
        return models

    for name, grouped_df in df.groupby(group_col):
        hr = NelsonAalenFitter()
        t = grouped_df[time_col]
        e = grouped_df[event_col]
        # format() accepts any group key; ``name + "..."`` raised TypeError
        # for non-string keys such as integers.
        hr.fit(t, event_observed=e, label="{} (N={})".format(name, len(t)))
        models.append(hr)

    return models
def _vval2ByBootstrap(timeline, nstraps=1000):
    """Bootstrap an inverse-standard-deviation weight for two treatment arms.

    ``df``, ``treatment_col``, ``duration_col`` and ``event_col`` are read
    from the enclosing scope.

    Parameters
    ----------
    timeline
        Array of time points (``timeline.shape[0]`` rows) at which each
        bootstrap survival curve is sampled with forward fill.
    nstraps
        Number of bootstrap resamples of ``df`` (rows drawn with replacement).

    Returns
    -------
    numpy.ndarray
        One value per time point:
        ``1 / sqrt(nanvar(log S0) + nanvar(log S1))`` where the variances are
        taken across bootstrap replicates and ``S = exp(-cumulative hazard)``.
    """
    n_times = timeline.shape[0]
    sa1_b = np.zeros((n_times, nstraps))
    sa2_b = np.zeros((n_times, nstraps))

    def _arm_survival(sample, arm):
        # Survival curve of one treatment arm of ``sample`` on ``timeline``.
        # The mask is built from the resampled frame itself, so selection does
        # not depend on aligning against the original frame's index.
        mask = sample[treatment_col] == arm
        naf = NelsonAalenFitter()
        naf.fit(durations=sample.loc[mask, duration_col],
                event_observed=sample.loc[mask, event_col])
        surv = np.exp(-naf.cumulative_hazard_.iloc[:, 0])
        return surv.reindex(timeline, method='ffill').values

    for sampi in range(nstraps):
        tmp = df.sample(frac=1, replace=True, axis=0)
        sa1_b[:, sampi] = _arm_survival(tmp, 0)
        sa2_b[:, sampi] = _arm_survival(tmp, 1)

    vval2 = 1/np.sqrt(np.nanvar(np.log(sa1_b), axis=1) + np.nanvar(np.log(sa2_b), axis=1))
    return vval2
def plot_HR(df, with_ci=False):
    """Plot cumulative hazard for the top risk quartile against everyone else.

    Parameters
    ----------
    df
        Data frame with the columns ``days_survived``, ``death`` and ``risk``.
    with_ci
        Whether confidence intervals are drawn; also selects the output file
        (``./hr_with_ci.png`` or ``./hr_without_ci.png``).
    """
    durations = df['days_survived']
    events = df['death']
    fitter = NelsonAalenFitter()

    # Rows strictly above the 75th percentile of ``risk`` form the high group.
    threshold = np.percentile(df['risk'], 75)
    is_high = df['risk'] > threshold

    fitter.fit(durations[is_high], event_observed=events[is_high],
               label='High_Risk')
    axes = fitter.plot(ci_show=with_ci)

    fitter.fit(durations[~is_high], event_observed=events[~is_high],
               label='Low_Risk')
    fitter.plot(ax=axes, ci_show=with_ci)

    plt.ylim(0, .1)
    plt.xlabel("Days")
    plt.ylabel("Risk of Death")
    plt.title("Cardiovascular Death Risk over time (top quartile)")

    output_file = "./hr_with_ci.png" if with_ci else "./hr_without_ci.png"
    plt.savefig(output_file)
def get_sa(request):
    """Render ``sa_test.html``, generating the two survival plots if needed.

    The Kaplan-Meier and Nelson-Aalen plots of the Waltons dataset are written
    to ``/images/test1.jpg`` and ``/images/test2.jpg`` under the directory two
    levels above this file. Both are regenerated whenever at least one of them
    is missing.

    Parameters
    ----------
    request
        The request object forwarded to ``render``.

    Returns
    -------
    The result of ``render`` with the context keys ``'kmf'`` and ``'naf'``
    holding the two relative image paths.
    """
    dirname = os.path.dirname(os.path.dirname(__file__)).replace('\\', '/')
    kmffile = '/images/test1.jpg'
    naffile = '/images/test2.jpg'
    context = {'kmf': kmffile, 'naf': naffile}

    kmf_path = dirname + kmffile
    naf_path = dirname + naffile

    # The old test (``not A and not B``) skipped regeneration when only one of
    # the two images was missing, leaving the page with a broken image.
    if not (os.path.exists(kmf_path) and os.path.exists(naf_path)):
        df = load_waltons()
        T = df['T']  # an array of durations
        E = df['E']  # boolean/binary array: was the 'death' observed (else censored)

        # NOTE(review): alpha=0.95 is passed through unchanged; its meaning
        # depends on the installed lifelines version - confirm.
        kmf = KaplanMeierFitter(alpha=0.95)
        kmf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='KM_estimate', alpha=None, left_censorship=False,
                ci_labels=None)

        naf = NelsonAalenFitter(alpha=0.95, nelson_aalen_smoothing=True)
        naf.fit(durations=T, event_observed=E, timeline=None, entry=None,
                label='NA_estimate', alpha=None, ci_labels=None)

        # One dedicated figure per estimator, closed afterwards, so the plots
        # never share axes and figures do not pile up across calls.
        for fitter, target in ((kmf, kmf_path), (naf, naf_path)):
            fig, ax = plt.subplots()
            try:
                fitter.plot(ax=ax)
                fig.savefig(target)
            finally:
                plt.close(fig)

    return render(request=request, template_name='sa_test.html', context=context)
data_events = np.append(data_events,np.array([time_to_event]*num_repair)) for v in sales_dict.values(): #investigate why some negative leftovers on certain valid dates , more repairs than sales ??? if v>0: data_events = np.append(data_events,np.zeros(v)) t=[] if len(data_events)==0: all_data.append([0]*19) continue data_events[data_events==0] = 160 C= data_events <160 naf = NelsonAalenFitter() naf.fit(data_events, censorship=C ) y_h = np.array(naf.cumulative_hazard_).reshape(len(naf.cumulative_hazard_)) x= np.array(naf.cumulative_hazard_.index).astype(int) seen_data_events.add(0) seen_data_events.add(160) if len(y_h) > 14: slope, intercept, r_value, p_value, std_err = stats.linregress(x[len(x)-5:len(x)-1],y_h[len(y_h)-5:len(y_h)-1]) #plt.figure() #plt.plot(x, y_h, 'ko') #plt.plot(x, linear_f(x,slope,intercept ), 'r-') #plt.legend() #plt.show()
# Smoothed Nelson-Aalen hazard curves per loan term and grade, followed by
# per-loan return columns written back onto LD.
term_bandwidths = [4., 8.]  # list of NAF smoothing bandwidth (for each term)
naf = NelsonAalenFitter(nelson_aalen_smoothing=False)  # init NAF model
all_hazards = {}  # initialize dict to store hazard functions
for idx, term in enumerate(keep_terms):  # compute all hazard functions for each term
    cur_data = LD[LD.term == term]
    lifetimes = cur_data['num_pymnts'].copy()  # lifetime is number of payments received
    # if the loan is fully paid set the lifetime to the full term
    # (.loc replaces the .ix indexer, which pandas has removed)
    lifetimes.loc[cur_data.loan_status == 'Fully Paid'] = term
    # observed loans are just the ones that have been charged off, rest are censored
    is_observed = cur_data.loan_status.isin(['Charged Off'])
    all_hazards[term] = np.zeros((len(keep_grades), term + 1))  # initialize matrix of hazard functions
    for gidx, grade in enumerate(keep_grades):  # fit model for each grade
        grade_data = cur_data.grade == grade
        naf.fit(lifetimes[grade_data], event_observed=is_observed[grade_data],
                label=grade, timeline=np.arange(term + 1))
        all_hazards[term][gidx, :] = naf.smoothed_hazard_(term_bandwidths[idx]).squeeze()

#%%
terms = LD.term.unique()  # set of unique loan terms
for term in terms:  # for each possible loan term
    # get relevant set of loans
    cur_loans = LD.term == term
    cur_LD = LD[cur_loans]
    (NAR, net_returns, p_csum) = LCH.get_NARs(cur_LD, term)
    LD.loc[cur_loans, 'ROI'] = NAR  # measured performance of each loan
    LD.loc[cur_loans, 'net_returns'] = net_returns  # principal weighted avg monthly returns
    # NOTE(review): the original repeated the "principal weighted avg monthly
    # returns" comment on the next two lines; presumably p_csum is a principal
    # weight - confirm against LCH.get_NARs.
    LD.loc[cur_loans, 'prnc_weight'] = p_csum
    # default_prob is the existing 'is_observed' column of LD cast to float
    LD.loc[cur_loans, 'default_prob'] = LD.loc[cur_loans, 'is_observed'].astype(float)
def test_exponential_data_sets_fit():
    """Visual check: Nelson-Aalen plot of data from ``exponential_survival_data``."""
    sample_size = 20000
    durations, observed = exponential_survival_data(sample_size, 0.2, scale=10)

    fitter = NelsonAalenFitter()
    fitted = fitter.fit(durations, observed)
    fitted.plot()

    plt.title("Should be a linear with slope = 0.1")
plt.title(dept) plt.xlim(0, 1000) if i == 0: plt.ylabel('Frac. in staying after $n$ years') plt.tight_layout() for i, dept in enumerate(depts): ix = data['dept'] == dept kmf.fit(T[ix], E[ix], label=dept) print(dept, kmf.median_) # Looking at a hazard curve from lifelines import NelsonAalenFitter naf = NelsonAalenFitter() naf.fit(T, event_observed=E) print(naf.cumulative_hazard_.head()) naf.plot() # This hazard curve shows us that there is low hazard of someone leaving starting off, then it gets worse, # once you stay for 500 days you stay at least a bit more, then exponentially it gets worse! # SURVIVAL REGRESSION -- figuring out the influences of other aspects on whether or not someone survives # Can't use regular linear regression. Want to use Cox's model or Aalen's additive model. # Cox's Proportional Hazard model # "The idea behind the model is that the log-hazard of an individual is a linear function of their static covariates # and a population-level baseline hazard that changes over time" - from https://lifelines.readthedocs.io/en/latest/Survival%20Regression.html from lifelines.datasets import load_rossi from lifelines import CoxPHFitter
for r in cac_ranges: ix = cac_values == r if first == 0: kmf.fit(times[ix], censors[ix], label=r) ax = kmf.plot() first = 1 else: kmf.fit(times[ix], censors[ix], label=r) kmf.plot(ax=ax) elif curve == 'hazard': # Plot hazard curve naf = NelsonAalenFitter() first = 0 for r in cac_ranges: ix = cac_values == r if first == 0: naf.fit(times[ix], censors[ix], label=r) ax = naf.plot() first = 1 else: naf.fit(times[ix], censors[ix], label=r) naf.plot(ax=ax) ax.set_ylabel("%", fontsize=12) ax.set_title(tag, fontsize=14) ax.set_xlabel("Years to event", fontsize=12) return times
if data == 'colon': data = pd.read_csv('../data/colon') data = data[data.etype == 2] data['age_band'] = pd.qcut(data.age, 4) print(data.head()) age_bands = data.age_band.unique().sort_values() print('age bands', age_bands) ax = plt.subplot() for i in range(4): mask = data.age_band == age_bands[i] print('num individuals in age band', age_bands[i], 'equals', np.sum(mask)) naf = NelsonAalenFitter() fitted = naf.fit(data.loc[mask, 'time'], data.loc[mask, 'status'], label='cum_hazard') cum_hazard_df = fitted.cumulative_hazard_ cum_hazard = cum_hazard_df['cum_hazard'].to_numpy() times = cum_hazard_df.index.to_numpy() ax = plt.plot(times, cum_hazard, label='Q' + str(i+1), linestyle=linestyles[i]) print(f'i plus 1 is {i+1}, and age band {age_bands[i]}') plt.legend() plt.xlabel('Time (in days)') plt.ylabel('Cumulative hazard') plt.tight_layout() plt.savefig('cumulative_hazard_colon.pdf') plt.show() # # # # loan_bands = data['loan_band'].unique()
def main(self, durations: List[pd.DataFrame],
         categories: List[pd.DataFrame],
         event_observed: List[pd.DataFrame],
         estimator: str,
         id_filter: List[str],
         subsets: List[List[str]]) -> dict:
    """Estimate a survival or cumulative-hazard curve per category and subset.

    :param durations: List with exactly one data frame; its ``value`` column
        holds the durations. The frame is not modified.
    :param categories: Data frames forwarded to ``utils.apply_categories``.
    :param event_observed: Empty list, or a list with one data frame. When
        given, a duration counts as observed if its ``id`` has a non-null
        ``value`` in that frame; otherwise nothing is treated as censored.
    :param estimator: ``'NelsonAalen'`` or ``'KaplanMeier'``.
    :param id_filter: Forwarded to ``utils.apply_id_filter``.
    :param subsets: Forwarded to ``utils.apply_subsets``.
    :return: Dict with ``label``, ``categories``, ``subsets`` and ``stats``;
        ``stats[category][subset]`` holds the lists ``timeline``,
        ``estimate``, ``ci_lower`` and ``ci_upper``. Combinations with three
        or fewer durations are left out.
    :raises ValueError: If ``durations`` does not hold exactly one frame, if
        more than one ``event_observed`` frame is given, or if ``estimator``
        is unknown when a combination is about to be fitted.
    """
    # logger.error rather than logger.exception: no exception is being handled
    # at these points, so there is no traceback to attach.
    if len(durations) != 1:
        error = 'Analysis requires exactly one array that specifies the ' \
                'duration length.'
        logger.error(error)
        raise ValueError(error)
    if len(event_observed) > 1:
        error = 'Maximal one variable for "event_observed" allowed'
        logger.error(error)
        raise ValueError(error)

    # Non-inplace dropna so the caller's frame is left untouched.
    df = durations[0].dropna()
    df = utils.apply_id_filter(df=df, id_filter=id_filter)
    df = utils.apply_subsets(df=df, subsets=subsets)
    df = utils.apply_categories(df=df, categories=categories)

    stats = {}
    categories = df['category'].unique().tolist()
    subsets = df['subset'].unique().tolist()

    # for every category and subset combination estimate the survival fun.
    for category in categories:
        for subset in subsets:
            sub_df = df[(df['category'] == category) &
                        (df['subset'] == subset)]
            T = sub_df['value']
            E = None  # default is nothing is censored
            if len(T) <= 3:
                continue
            if event_observed:
                # find observation boolean value for every duration
                E = event_observed[0].merge(sub_df, how='right', on='id')
                E = [not x for x in pd.isnull(E['value_x'])]
                assert len(E) == len(T)

            if estimator == 'NelsonAalen':
                fitter = NelsonAalenFitter()
                fitter.fit(durations=T, event_observed=E)
                estimate = fitter.cumulative_hazard_['NA_estimate'].tolist()
                ci_lower = fitter.confidence_interval_[
                    'NA_estimate_lower_0.95'].tolist()
                ci_upper = fitter.confidence_interval_[
                    'NA_estimate_upper_0.95'].tolist()
            elif estimator == 'KaplanMeier':
                fitter = KaplanMeierFitter()
                fitter.fit(durations=T, event_observed=E)
                # noinspection PyUnresolvedReferences
                estimate = fitter.survival_function_['KM_estimate'].tolist()
                ci_lower = fitter.confidence_interval_[
                    'KM_estimate_lower_0.95'].tolist()
                ci_upper = fitter.confidence_interval_[
                    'KM_estimate_upper_0.95'].tolist()
            else:
                error = 'Unknown estimator: {}'.format(estimator)
                logger.error(error)
                raise ValueError(error)

            timeline = fitter.timeline.tolist()
            stats.setdefault(category, {})[subset] = {
                'timeline': timeline,
                'estimate': estimate,
                'ci_lower': ci_lower,
                'ci_upper': ci_upper
            }

    return {
        'label': df['feature'].tolist()[0],
        'categories': categories,
        'subsets': subsets,
        'stats': stats
    }
def fit(
    self,
    durations,
    event_observed=None,
    timeline=None,
    entry=None,
    label="BFH_estimate",
    alpha=None,
    ci_labels=None,
):  # pylint: disable=too-many-arguments
    """
    Fit the estimator via a Nelson-Aalen fit whose results are mapped through exp(-x).

    Parameters
    ----------
    durations: an array, or pd.Series, of length n
        how long each subject was observed
    timeline:
        values at which the best estimate is returned (positively increasing)
    event_observed: an array, or pd.Series, of length n
        True where the death was observed, False where the event was lost
        (right-censored). All True when event_observed is None
    entry: an array, or pd.Series, of length n
        relative time at which a subject entered the study; useful for
        left-truncated observations, i.e. the birth event was not observed.
        None means all 0 (all birth events observed)
    label: string
        name given to the column of the estimate
    alpha: float, optional (default=0.05)
        alpha value for the confidence intervals; overrides the initializing
        alpha for this call to fit only
    ci_labels: iterable
        custom column names for the generated confidence intervals, as a
        length-2 list: [<lower-bound name>, <upper-bound name>].
        Default: <label>_lower_<alpha>

    Returns
    -------
    self, with new properties like ``survival_function_``.
    """
    self._label = label
    effective_alpha = coalesce(alpha, self.alpha)

    nelson_aalen = NelsonAalenFitter(alpha=effective_alpha)
    nelson_aalen.fit(
        durations,
        event_observed=event_observed,
        timeline=timeline,
        label=label,
        entry=entry,
        ci_labels=ci_labels,
    )

    # Mirror the fitted inputs and the event table onto this instance.
    for attribute in ("durations", "event_observed", "timeline", "entry", "event_table"):
        setattr(self, attribute, getattr(nelson_aalen, attribute))

    # estimation
    self.survival_function_ = np.exp(-nelson_aalen.cumulative_hazard_)
    self.confidence_interval_ = np.exp(-nelson_aalen.confidence_interval_)

    # estimation methods
    self._estimation_method = "survival_function_"
    self._estimate_name = "survival_function_"
    self._update_docstrings()

    # plotting functions
    self.plot_survival_function = self.plot
    return self
plt.ylim(0, 1) plt.title("Lifespans of different Question types in First 500 Days") # Test of significances between Question Types from lifelines.statistics import logrank_test results = logrank_test(T[short], T[~short], E[short], E[~short], alpha=.99) results.print_summary() # Applying output to a hazord curve. from lifelines import NelsonAalenFitter naf = NelsonAalenFitter() naf.fit(T, event_observed=E) naf.plot() #By question length naf.fit(T[short], event_observed=E[short], label="Shorter Questions") ax = naf.plot(loc=slice(0, 200)) naf.fit(T[~short], event_observed=E[~short], label="Longer Questions") naf.plot(ax=ax, loc=slice(0, 200)) plt.title("Cumulative hazard function by Question Length (up to 2000= days)") # Aalen's Additive Model from lifelines import CoxPHFitter cph = CoxPHFitter() #Covariance matrix import patsy
data_events = np.append(data_events,np.array([time_to_event]*num_repair)) for v in sales_dict.values(): #investigate why some negative leftovers on certain valid dates , more repairs than sales ??? if v>0: data_events = np.append(data_events,np.zeros(v)) t=[] if len(data_events)==0: all_data.append([0]*19) continue data_events[data_events==0] = 70 C= data_events <70 naf = NelsonAalenFitter() naf.fit(data_events, event_observed=C ) y_h = np.array(naf.cumulative_hazard_).reshape(len(naf.cumulative_hazard_)) x= np.array(naf.cumulative_hazard_.index).astype(int) seen_data_events.add(0) seen_data_events.add(70) if len(y_h) > 14: slope, intercept, r_value, p_value, std_err = stats.linregress(x[len(x)-5:len(x)-1],y_h[len(y_h)-5:len(y_h)-1]) #plt.figure() #plt.plot(x, y_h, 'ko') #plt.plot(x, linear_f(x,slope,intercept ), 'r-') #plt.legend() #plt.show()
~module_survival_data['module_name']. isin(['Pre-CLIx_Survey', 'Post-CLIx_Survey'])] module_survival_data['event'] = 1 groups = module_survival_data['module_name'] T = module_survival_data['duration_weeks'] E = module_survival_data['event'] from lifelines import NelsonAalenFitter naf = NelsonAalenFitter() bandwidth = 3. for i, each in enumerate(list(module_survival_data['module_name'].unique())): ix = (groups == each) naf.fit(T[ix], event_observed=E[ix], label=each) if i == 0: ax = naf.plot_hazard(bandwidth=bandwidth, ci_show=False) else: ax = naf.plot_hazard(ax=ax, bandwidth=bandwidth, ci_show=False) ax.set_title("Hazard function of different modules | bandwidth=%.1f" % bandwidth) # Survival curves for tools import pandas from datetime import datetime, timedelta from lifelines import KaplanMeierFitter data_path = '/home/parthae/Documents/Projects/TISS_Git/projects/data_collation/data/data_latest' cg_data = pandas.read_csv( data_path +
plt.show() ax = plt.subplot(111) for r in data['Has_Children'].unique(): ix = data['Has_Children'] == r kmf.fit(data['Duration'].loc[ix], data['Divorce'].loc[ix], label=r) sns.set() ax = kmf.plot(title='Mariage Survival Estimate Based on Children', ax=ax, linewidth=2.5) #Export the figure plt.savefig('/home/raed/Dropbox/INSE - 6320/Final Project/Children.pdf') plt.show() naf = NelsonAalenFitter() naf.fit(data['Duration'], data['Divorce']) sns.set() naf.plot(title='Cumulative hazard over time', legend=False) print(naf.cumulative_hazard_.head(32)) plt.savefig( '/home/raed/Dropbox/INSE - 6320/Final Project/Cumulative_Hazard_function.pdf' ) plt.show() ax = plt.subplot(111) for r in data['Couple_Race'].unique(): ix = data['Couple_Race'] == r naf.fit(data['Duration'].loc[ix], data['Divorce'].loc[ix], label=r) sns.set() ax = naf.plot(title='Cumulative Hazard by Couple Race ', ax=ax,
from lifelines import WeibullFitter wf = WeibullFitter() wf.fit(T, E) print(wf.lambda_, wf.rho_) wf.print_summary() wf.plot() ############################################################ # NelsonAalenFitter ############################################################ from lifelines import NelsonAalenFitter naf = NelsonAalenFitter() naf.fit(T, event_observed=E) naf.plot() # univariate analysis: cum hazard # ORIG_CHN ax = plt.subplot() for chn in df_cox.ORIG_CHN.unique(): is_chn = (df_cox.ORIG_CHN == chn) naf.fit(T[is_chn], event_observed=E[is_chn], label=chn) naf.plot(ax=ax) # PURPOSE ax = plt.subplot() for purpose in df_cox.PURPOSE.unique(): is_pur = (df_cox.PURPOSE == purpose) naf.fit(T[is_pur], event_observed=E[is_pur], label=purpose)
# Load the Stata data set, drop scaled times below config.GAMMA, fit a
# Nelson-Aalen estimator on the remainder and histogram the censored times.
bins0 = config.BIN0
bins1 = config.BIN1
df = pd.read_stata("wichert.dta")
# Materialise the (scaled time, event) pairs: on Python 3 zip() returns a
# one-shot iterator that has no len() and would already be exhausted by the
# filter below when the outlier count is printed.
data_ = list(zip(df.time/max(df.time), df.event.astype(int)))
data = [(a, b) for (a, b) in data_ if a >= config.GAMMA]
print("[*] Remove #%d outliers" % (len(data_) - len(data)))
N = len(df)  # number of data points
#kmf = KaplanMeierFitter()
(T, E) = zip(*data)
#kmf.fit(T, event_observed=E)
naf = NelsonAalenFitter()
naf.fit(T, event_observed=E)
#ax = pyplot.subplot(121)
#naf.plot(ax=ax)
#ax = pyplot.subplot(122)
#kmf.plot(ax=ax)
true_value = naf.cumulative_hazard_.values
#naf.cumulative_hazard_.to_csv("naf.csv")
#pyplot.show()
data0 = [a for (a, b) in data if b == 0]  # times with event flag 0
data1 = [a for (a, b) in data if b == 1]  # times with event flag 1
his0, bin_edges0 = np.histogram(data0, bins=bins0, range=(config.GAMMA, 1))
2 13 1 miR-137 3 13 1 miR-137 4 19 1 miR-137 """ T = df['T'] E = df['E'] # Fit the survival curve kmf = KaplanMeierFitter() kmf.fit(T, event_observed=E) # or, more succiently, kmf.fit(T, E) kmf.plot() # Plot cumulative hazard function naf = NelsonAalenFitter() naf.fit(T, E) naf.plot() #------------------------------------------------------------------------------ # Multiple groups #------------------------------------------------------------------------------ groups = df['group'] ix = (groups == 'miR-137') kmf.fit(T[~ix], E[~ix], label='control') ax = kmf.plot() kmf.fit(T[ix], E[ix], label='miR-137') kmf.plot(ax=ax) plt.show()
class Node:
    """A node of a survival tree; terminal nodes hold a Nelson-Aalen fit."""

    score = 0
    split_val = None
    split_var = None
    lhs = None
    rhs = None
    chf = None
    chf_terminal = None
    terminal = False

    def __init__(self, x, y, tree, f_idxs, n_features, unique_deaths=1, min_leaf=1, random_state=None):
        """
        A Node of the Survival Tree.
        :param x: The input samples. Should be a Dataframe with the shape [n_samples, n_features].
        :param y: The target values as a Dataframe with the survival time in the first column and the event.
        :param tree: The corresponding Survival Tree
        :param f_idxs: The indices of the features to use.
        :param n_features: The number of features to use.
        :param unique_deaths: The minimum number of unique deaths required to be at a leaf node.
        :param min_leaf: The minimum number of samples required to be at a leaf node. A split point at any depth will
        only be considered if it leaves at least min_leaf training samples in each of the left and right branches.
        :param random_state: Seed used when drawing the feature indices for the child nodes; None uses the global
        numpy random state.
        """
        self.x = x
        self.y = y
        self.tree = tree
        self.f_idxs = f_idxs
        self.n_features = n_features
        self.unique_deaths = unique_deaths
        self.random_state = random_state
        self.min_leaf = min_leaf
        # Construction grows the whole subtree below this node.
        self.grow_tree()

    def grow_tree(self):
        """
        Grow tree by calculating the Nodes recursively.
        :return: self
        """
        # Sum of the event column over rows that are distinct in (index, event).
        # NOTE(review): the name suggests distinct death times - confirm intent.
        # ``.iloc[1]`` picks that column's total by position instead of relying
        # on the deprecated positional fallback of ``[1]``.
        unique_deaths = self.y.iloc[:, 1].reset_index().drop_duplicates().sum().iloc[1]

        if unique_deaths <= self.unique_deaths:
            self.compute_terminal_node()
            return self

        self.score, self.split_val, self.split_var, lhs_idxs_opt, rhs_idxs_opt = splitting.find_split(self)

        if self.split_var is None:
            self.compute_terminal_node()
            return self

        n_columns = self.x.shape[1]
        if self.random_state is None:
            lf_idxs = np.random.permutation(n_columns)[:self.n_features]
            rf_idxs = np.random.permutation(n_columns)[:self.n_features]
        else:
            # NOTE(review): both draws start from a fresh generator with the
            # same seed, so the two children get identical feature indices.
            lf_idxs = np.random.RandomState(seed=self.random_state).permutation(n_columns)[:self.n_features]
            rf_idxs = np.random.RandomState(seed=self.random_state).permutation(n_columns)[:self.n_features]

        # Children inherit the configured ``unique_deaths`` threshold; it used
        # to be dropped here, so only the root honoured a non-default value.
        self.lhs = Node(self.x.iloc[lhs_idxs_opt, :], self.y.iloc[lhs_idxs_opt, :], self.tree, lf_idxs,
                        self.n_features, unique_deaths=self.unique_deaths, min_leaf=self.min_leaf,
                        random_state=self.random_state)
        self.rhs = Node(self.x.iloc[rhs_idxs_opt, :], self.y.iloc[rhs_idxs_opt, :], self.tree, rf_idxs,
                        self.n_features, unique_deaths=self.unique_deaths, min_leaf=self.min_leaf,
                        random_state=self.random_state)

        return self

    def compute_terminal_node(self):
        """
        Compute the terminal node if condition has reached.
        :return: self
        """
        self.terminal = True
        self.chf = NelsonAalenFitter()
        t = self.y.iloc[:, 0]
        e = self.y.iloc[:, 1]
        self.chf.fit(t, event_observed=e, timeline=self.tree.timeline)

        return self

    def predict(self, x):
        """
        Predict the cumulative hazard function if its a terminal node. If not walk through the tree.
        The result is also stored on ``self.tree.chf``.
        :param x: The input sample.
        :return: Predicted cumulative hazard function of the terminal node reached by x
        """
        if self.terminal:
            self.tree.chf = self.chf.cumulative_hazard_
            self.tree.chf = self.tree.chf.iloc[:, 0]
            return self.tree.chf

        # Propagate the leaf's result upwards; previously the recursive call's
        # value was discarded and non-terminal nodes returned None.
        child = self.lhs if x[self.split_var] <= self.split_val else self.rhs
        return child.predict(x)
import numpy as np
import pandas as pd
from lifelines import NelsonAalenFitter

# Read the spreadsheet and plot the Nelson-Aalen cumulative hazard of its
# ``totaltime`` durations, using ``failure`` as the second fit argument.
path = './totalData.xlsx'
data = pd.read_excel(path)

duration, indicator = data.totaltime, data.failure

naf = NelsonAalenFitter()
naf.fit(duration, indicator)
naf.plot()