def plot_survival_curve(kmf: KaplanMeierFitter, exf: ExponentialFitter,
                        state: int, stat_counts: Tuple[int, int, int],
                        save_path: Path, x_right_lim: float = None):
    """Plot a Kaplan-Meier curve with its exponential fit and save it as SVG.

    Args:
        kmf: Fitted Kaplan-Meier estimator; its plot creates the axes.
        exf: Fitted exponential model, overlaid on the same axes without
            confidence interval. Its ``lambda_`` is printed on the figure.
        state: Index (0 or 1) selecting the axis/annotation subscripts
            ('on'/'off' for the x label, 'obs'/'off' for the k label).
        stat_counts: Three counts written verbatim, comma separated, onto
            the figure.
        save_path: Destination of the SVG file.
        x_right_lim: Optional right limit of the x axis; left limit is
            always 0.
    """
    on_off_str = ['on', 'off']
    obs_off_str = ['obs', 'off']

    ax = kmf.plot()
    exf.plot_survival_function(ax=ax, ci_show=False)
    ax.get_legend().remove()

    plt.xlabel(r'$\tau_{{{}}}$ (s)'.format(on_off_str[state]), fontsize=16)
    plt.ylabel('probability', fontsize=16)

    # Annotations are positioned in axes-fraction coordinates.
    k_str = r'$k_{{{}}}$ = {:.1f} s'.format(obs_off_str[state], exf.lambda_)
    string = '{}, {}, {}'.format(*stat_counts)
    plt.text(0.6, 0.8, k_str, transform=ax.transAxes, fontsize=14)
    plt.text(0.6, 0.6, string, transform=ax.transAxes, fontsize=14)

    plt.xlim(left=0)
    if x_right_lim is not None:
        plt.xlim(right=x_right_lim)

    # Keep text as text (not paths) in the exported SVG.
    plt.rcParams['svg.fonttype'] = 'none'
    # The keyword is lowercase ``transparent``; the capitalised spelling is
    # not a savefig option.
    plt.savefig(save_path, format='svg', transparent=True, dpi=300,
                bbox_inches='tight')
    plt.close()
def test_qq_plot_left_censoring_with_known_distribution(self, block):
    """Smoke-test qq_plot on synthetically left-censored fisk(8, 0, 1) samples."""
    n_samples = 300
    true_times = scipy.stats.fisk(8, 0, 1).rvs(n_samples)
    floor_low = np.percentile(true_times, 5)
    floor_high = np.percentile(true_times, 10)

    # Each sample lands in one of three groups: group 0 is floored at the
    # 5th percentile, group 1 at the 10th, group 2 is left untouched.
    observed = true_times.copy()
    group = np.random.randint(3, size=n_samples)
    observed = np.where(group == 0, np.maximum(observed, floor_low), observed)
    observed = np.where(group == 1, np.maximum(observed, floor_high), observed)
    # A sample counts as observed only if flooring did not change it.
    event = true_times == observed

    _fig, axes = self.plt.subplots(2, 2, figsize=(9, 5))
    panels = axes.reshape(4)
    fitters = [
        WeibullFitter(),
        LogNormalFitter(),
        LogLogisticFitter(),
        ExponentialFitter(),
    ]
    for panel, fitter in zip(panels, fitters):
        fitter.fit_left_censoring(observed, event)
        drawn = qq_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle(
        "test_qq_plot_left_censoring_with_known_distribution")
    self.plt.show(block=block)
def test_rmst_exactely_with_known_solution():
    """Compare the RMST of a fitted exponential model with closed-form values."""
    durations = np.random.exponential(2, 100)
    fitted = ExponentialFitter().fit(durations)
    scale = fitted.lambda_

    # Without a time limit the result is compared against lambda_ itself.
    unrestricted = utils.restricted_mean_survival_time(fitted)
    assert abs(unrestricted - scale) < 0.001

    # Restricted to t = lambda_ the expected value is lambda_ * (1 - 1/e).
    restricted = utils.restricted_mean_survival_time(fitted, t=scale)
    expected_restricted = scale * (np.e - 1) / np.e
    assert abs(restricted - expected_restricted) < 0.001
def find_two_state_dwell_time(parameter_file_path: Path, sheet_list: List[str]):
    """Fit and plot dwell-time survival curves for the 'low' and 'high' states.

    For every sheet of the parameter workbook, the files named in its
    ``filename`` column are loaded from the default data path, the intervals
    of the analyzable AOIs in category '1' are pooled, and for each state a
    Kaplan-Meier and an exponential fit are plotted and saved as
    ``<sheet>_<state>_dwell.svg`` in the data path.

    Args:
        parameter_file_path: Excel workbook listing the data files.
        sheet_list: Names of the sheets to process.
    """
    datapath = imscrollIO.def_data_path()
    state_category = '1'
    state_list = ['low', 'high']
    im_format = 'svg'
    for i_sheet in sheet_list:
        dfs = pd.read_excel(parameter_file_path, sheet_name=i_sheet)
        interval_list = []
        n_good_traces = 0
        # Reset per sheet so a value from a previous sheet is never reused.
        max_time = None
        for filestr in dfs.filename:
            try:
                all_data, AOI_categories = binding_kinetics.load_all_data(
                    datapath / (filestr + '_all.json'))
            except FileNotFoundError:
                print('{} file not found'.format(filestr))
                continue
            print(filestr + ' loaded')

            if state_category in AOI_categories['analyzable']:
                aoi_list = AOI_categories['analyzable'][state_category]
                n_good_traces += len(aoi_list)
                interval_list.append(all_data['intervals'].sel(AOI=aoi_list))
                # The x axis must cover the longest recording that
                # contributed intervals, not just the last file loaded.
                file_max_time = all_data['data'].time.values.max()
                if max_time is None or file_max_time > max_time:
                    max_time = file_max_time

        if not interval_list:
            # Nothing was loaded or no AOI was analyzable: there is no data
            # to fit and no time axis to plot against.
            print('no analyzable traces found for sheet {}'.format(i_sheet))
            continue

        for i, item in enumerate(state_list):
            dwells = binding_kinetics.extract_dwell_time(interval_list, i)
            if len(dwells.duration) == 0:
                print('no {} state found'.format(item))
                continue
            kmf = KaplanMeierFitter()
            exf = ExponentialFitter()
            kmf.fit(dwells.duration, dwells.event_observed)
            exf.fit(dwells.duration, dwells.event_observed)

            n_event = np.count_nonzero(dwells.event_observed)
            n_censored = len(dwells.event_observed) - n_event
            stat_counts = (n_event, n_censored, n_good_traces)
            save_fig_path = datapath / (i_sheet + '_' + item + '_dwell'
                                        + '.' + im_format)
            plot_survival_curve(kmf, exf, i, stat_counts, save_fig_path,
                                x_right_lim=max_time)
def test_rmst_approximate_solution():
    """Check the model-based RMST against the one computed from its survival table.

    Both calls run inside the ``pytest.warns`` block, which requires an
    ApproximationWarning to be emitted.
    """
    durations = np.random.exponential(2, 4000)
    fitted = ExponentialFitter().fit(
        durations, timeline=np.linspace(0, durations.max(), 10000))
    horizon = fitted.lambda_

    with pytest.warns(exceptions.ApproximationWarning):
        from_model = utils.restricted_mean_survival_time(fitted, t=horizon)
        from_table = utils.restricted_mean_survival_time(
            fitted.survival_function_, t=horizon)
        assert abs(from_model - from_table) < 0.001
def test_rmst_variance():
    """Compare RMST mean and variance at t=1 with closed-form exponential values."""
    durations = np.random.exponential(2, 1000)
    fitted = ExponentialFitter().fit(durations)

    rate = 1 / fitted.lambda_
    horizon = 1

    # Closed-form second moment and mean for the fitted rate, used as the
    # reference values for the restricted survival time.
    second_moment = 2 / rate ** 2 * (1 - np.exp(-rate * horizon) * (1 + rate * horizon))
    expected_mean = 1 / rate * (1 - np.exp(-rate * horizon))
    expected_var = second_moment - expected_mean ** 2

    rmst_mean = utils.restricted_mean_survival_time(
        fitted, t=horizon, return_variance=True)[0]
    assert abs(rmst_mean - expected_mean) < 0.001

    rmst_var = utils.restricted_mean_survival_time(
        fitted, t=horizon, return_variance=True)[1]
    assert abs(rmst_var - expected_var) < 0.001
def test_qq_plot_left_censoring2(self, block):
    """Smoke-test qq_plot for four left-censored fits on the load_lcd() data."""
    lcd = load_lcd()
    _fig, axes = self.plt.subplots(2, 2, figsize=(9, 5))
    panels = axes.reshape(4)

    fitters = [
        WeibullFitter(),
        LogNormalFitter(),
        LogLogisticFitter(),
        ExponentialFitter(),
    ]
    # One fitter per panel of the 2x2 grid.
    for panel, fitter in zip(panels, fitters):
        fitter.fit_left_censoring(lcd["T"], lcd["E"])
        drawn = qq_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle("test_qq_plot_left_censoring2")
    self.plt.show(block=block)
def test_right_censorship_cdf_plots(self, block):
    """Smoke-test cdf_plot for four parametric fits on the load_rossi() data."""
    rossi = load_rossi()
    _fig, axes = self.plt.subplots(2, 2, figsize=(9, 5))
    panels = axes.reshape(4)

    fitters = [
        WeibullFitter(),
        LogNormalFitter(),
        LogLogisticFitter(),
        ExponentialFitter(),
    ]
    # Durations come from "week", the event indicator from "arrest".
    for panel, fitter in zip(panels, fitters):
        fitter.fit(rossi["week"], rossi["arrest"])
        drawn = cdf_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle("test_right_censorship_cdf_plots")
    self.plt.show(block=block)
def test_left_censorship_cdf_plots(self, block):
    """Smoke-test cdf_plot for four left-censored fits on the load_nh4() data."""
    nh4 = load_nh4()
    _fig, axes = self.plt.subplots(2, 2, figsize=(9, 5))
    panels = axes.reshape(4)

    fitters = [
        WeibullFitter(),
        LogNormalFitter(),
        LogLogisticFitter(),
        ExponentialFitter(),
    ]
    for panel, fitter in zip(panels, fitters):
        # The event indicator is the negation of the "Censored" column.
        fitter.fit_left_censoring(nh4["NH4.mg.per.L"], ~nh4["Censored"])
        drawn = cdf_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle("test_left_censorship_cdf_plots")
    self.plt.show(block=block)
def test_qq_plot_right_censoring_with_known_distribution(self, block):
    """Smoke-test qq_plot on synthetically right-censored fisk(8, 0, 1) samples."""
    n_samples = 3000
    true_times = scipy.stats.fisk(8, 0, 1).rvs(n_samples)
    censor_times = scipy.stats.fisk(8, 0, 1).rvs(n_samples)
    # An event is observed only when it precedes its censoring time.
    event = true_times < censor_times
    observed = np.minimum(true_times, censor_times)

    _fig, axes = self.plt.subplots(2, 2, figsize=(9, 5))
    panels = axes.reshape(4)
    fitters = [
        WeibullFitter(),
        LogNormalFitter(),
        LogLogisticFitter(),
        ExponentialFitter(),
    ]
    for panel, fitter in zip(panels, fitters):
        fitter.fit(observed, event)
        drawn = qq_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle(
        "test_qq_plot_right_censoring_with_known_distribution")
    self.plt.show(block=block)
def bayesian_model_estimation(T, E, iter_interpolate=2, n_pts=20):
    """Plot survival estimates and a grid-based posterior of the exponential rate.

    T is durations
    E is binary event flag
    iter_interpolate is number of iterations in posterior grid interpolation
        refinement (int, min.=1)
    n_pts is number of points in posterior

    The function only produces figures and console output. It reads the
    module-level ``target_rate`` as the ground-truth rate for comparison.
    """
    # Plot non-parametric curves
    kmf = KaplanMeierFitter()
    kmf.fit(T, event_observed=E)
    kmf.plot()

    naf = NelsonAalenFitter()
    naf.fit(T, event_observed=E)
    plt.figure(figsize=(7, 6))
    naf.plot()
    plt.title('Cumulative hazard rate')

    # Fit exponential cumulative hazard model
    # See https://lifelines.readthedocs.io/en/latest/Survival%20analysis%20with%20lifelines.html
    exf = ExponentialFitter().fit(T, E, label='ExponentialFitter')
    exf.plot_cumulative_hazard()
    # exf.lambda_ is used as a scale below, so the rate is its reciprocal.
    print('fitted lambda = {}'.format(1 / exf.lambda_))

    # Plot groundtruth curve
    plt.figure(figsize=(7, 6))
    x = np.arange(1, 30)
    plt.plot(x, expon(scale=1 / target_rate).sf(x), 'g--', lw=2.5, alpha=.6,
             label='target')
    plt.plot(x, expon(scale=exf.lambda_).sf(x), 'r-', lw=3, alpha=.7,
             label='fitted')
    plt.legend()
    plt.xlabel('duration (time since event arrival')
    plt.title('Survival curve')

    # Bayesian inference of lambda
    # ============================
    lam_range = np.linspace(0, .2, n_pts)
    for it in range(1, iter_interpolate + 1):
        print('\niteration {}'.format(it))

        # Flat prior over the current grid. Its log needs no further
        # normalisation: a constant offset cancels when the posterior is
        # normalised below.
        prior = np.ones_like(lam_range)
        prior /= np.sum(prior)
        logprior = np.log(prior)

        # Accumulate the likelihood in log space; multiplying the raw
        # densities over all observations produces dangerously small numbers.
        # Observed events contribute the density, censored ones the survival
        # function.
        likelihood = expon(scale=1 / lam_range)  # loop-invariant, built once
        logpost = logprior
        for duration, event_flag in zip(T, E):
            if event_flag == 1:
                logpost += likelihood.logpdf(duration)
            else:
                logpost += likelihood.logsf(duration)

        # Trick: shift entire log dist. by max.loglikel. before exponentiation
        # to reduce potential underflow
        maxlogl = np.max(logpost)
        post = np.exp(logpost - maxlogl)
        post /= np.sum(post)

        ExpectedVal = np.dot(lam_range, post)
        print('Mean of lambda posterior = {}'.format(ExpectedVal))
        print('MAE = {}'.format(np.abs(ExpectedVal - target_rate)))

        # Plot lambda posterior
        plt.figure(figsize=(7, 6))
        plt.plot(lam_range, post, 'b.-', lw=1, label='Bayes')
        plt.vlines(1 / exf.lambda_, 0, 1.2 * np.max(post), color='m', lw=3,
                   alpha=.6, label='MLE')
        plt.vlines(target_rate, 0, 1.2 * np.max(post), color='orange', lw=3,
                   alpha=.9, label='target')
        plt.vlines(ExpectedVal, 0, 1.2 * np.max(post), color='b', lw=3,
                   alpha=.6, label='Bayes EV')
        plt.legend()
        plt.title('Lambda estimate (iteration {})'.format(it))
        plt.xlabel('lambda')

        # Refine posterior grid evaluation points. Only needed when another
        # iteration follows; the refined grid would otherwise be discarded.
        if it < iter_interpolate:
            cumul_prob_dens = post.cumsum()
            f = interp1d(cumul_prob_dens, lam_range)
            cdf_new_grid_pts = np.linspace(1e-2, 1 - 1e-2, n_pts)
            # interp1d raises ValueError outside its sample range, so keep
            # the requested quantiles inside the cumulative posterior.
            cdf_new_grid_pts = np.clip(cdf_new_grid_pts,
                                       cumul_prob_dens.min(),
                                       cumul_prob_dens.max())
            lam_range = f(cdf_new_grid_pts)
model = model # instantiate the class to create an object for the input model # Two Cohorts are compared. 1. Streaming TV Not Subsribed by Users, 2. Streaming TV subscribed by the users. groups = data['StreamingTV'] # group i1 , having the pandas series for the 1st cohort i1 = (groups == 'No') # group i2 , having the pandas series for the 2nd cohort i2 = (groups == 'Yes') # fit the model for 1st cohort model.fit(T[i1], E[i1], label='Not Subscribed StreamingTV') a1 = model.plot(ax=axes) # fit the model for 2nd cohort model.fit(T[i2], E[i2], label='Subscribed StreamingTV') model.plot(ax=axes) # Churn by subscribe for the lognormal model churn_by_subscribe(LogNormalFitter(), axes[0][0]) # Churn by subscribe for the weibull model churn_by_subscribe(WeibullFitter(), axes[0][1]) # Churn by subscribe for the loglogistic model churn_by_subscribe(LogLogisticFitter(), axes[1][0]) # Churn by subscribe for the Exponential model churn_by_subscribe(ExponentialFitter(), axes[1][1]) # Function for adding subtitles and labels plot_details('Subscribed', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)
import pandas as pd

# Fit four parametric models to the Telco churn data and plot their
# cumulative hazards on a 2x2 grid.
data = pd.read_csv('Dataset/telco_customer.csv')
data['tenure'] = pd.to_numeric(data['tenure'])
# Keep positive tenures only; .copy() makes the filtered frame independent so
# the column assignment below does not write into a slice of the original.
data = data[data['tenure'] > 0].copy()
# Recode Churn: 1 for the event ('Yes'), 0 for censored observations.
data['Churn'] = (data['Churn'] == 'Yes').astype(int)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
T = data['tenure']
E = data['Churn']

wbf = WeibullFitter().fit(T, E, label='WeibullFitter')
ef = ExponentialFitter().fit(T, E, label='ExponentialFitter')
lnf = LogNormalFitter().fit(T, E, label='LogNormalFitter')
llf = LogLogisticFitter().fit(T, E, label='LogLogisticFitter')

wbf.plot_cumulative_hazard(ax=axes[0][0])
ef.plot_cumulative_hazard(ax=axes[0][1])
lnf.plot_cumulative_hazard(ax=axes[1][0])
llf.plot_cumulative_hazard(ax=axes[1][1])

plt.suptitle(
    'Parametric Model Implementation of the Telco dataset using different models'
)
# Shared axis labels for the whole figure. The panels show cumulative
# hazards, which are not probabilities, so the y label says so.
fig.text(0.5, 0.04, 'Timeline', ha='center')
fig.text(0.04, 0.5, 'Cumulative hazard', va='center', rotation='vertical')
plt.savefig('Images/WeiExpLogx.jpeg')
def churn_by_gender(model, axes):
    """Fit `model` to the male and female cohorts in turn and draw both on `axes`.

    Reads the module-level `data`, `T` and `E`. The same model object is
    refitted for each cohort and plotted right after each fit.
    """
    gender = data['gender']
    cohorts = (
        ('Male', gender == 'Male'),
        ('Female', gender == 'Female'),
    )
    for cohort_label, mask in cohorts:
        model.fit(T[mask], E[mask], label=cohort_label)
        model.plot(ax=axes)


# One panel per parametric family: log-normal, Weibull, log-logistic,
# exponential.
churn_by_gender(LogNormalFitter(), axes[0][0])
churn_by_gender(WeibullFitter(), axes[0][1])
churn_by_gender(LogLogisticFitter(), axes[1][0])
churn_by_gender(ExponentialFitter(), axes[1][1])

# Add titles and labels to the four panels.
plot_details('Gender', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)
model = model # instantiate the class to create an object for choosen model # Three cohorts are compared on the basis of the contract groups = data['Contract'] x1 = (groups == 'Month-to-month') x2 = (groups == 'Two year') x3 = (groups == 'One year') model.fit(T[x1], E[x1], label='Month-to-month') ax = model.plot(ax=axes) model.fit(T[x2], E[x2], label='Two year') ax1 = model.plot(ax=axes) ac1 = model.plot model.fit(T[x3], E[x3], label='One year') model.plot(ax=axes) # Churn by contract for the lognormal model churn_by_contract(LogNormalFitter(), axes[0][0]) # Churn by contract for the weibull model churn_by_contract(WeibullFitter(), axes[0][1]) # Churn by contract for the loglogistic model churn_by_contract(LogLogisticFitter(), axes[1][0]) # Churn by contract for the Exponential model churn_by_contract(ExponentialFitter(), axes[1][1]) # Function for adding subtitles and labels plot_details('Contract', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)
LogLogisticFitter) import pandas as pd data = pd.read_csv('Dataset/telco_customer.csv') data['tenure'] = pd.to_numeric(data['tenure']) data = data[data['tenure'] > 0] # Replace yes and No in the Churn column to 1 and 0. 1 for the event and 0 for the censured data. data['Churn'] = data['Churn'].apply(lambda x: 1 if x == 'Yes' else 0) fig, axes = plt.subplots(2, 2, figsize=(16, 12)) T = data['tenure'] E = data['Churn'] wbf = WeibullFitter().fit(T, E, label='WeibullFitter') ef = ExponentialFitter().fit(T, E, label='ExponentialFitter') lnf = LogNormalFitter().fit(T, E, label='LogNormalFitter') llf = LogLogisticFitter().fit(T, E, label='LogLogisticFitter') wbf.plot_survival_function(ax=axes[0][0]) ef.plot_survival_function(ax=axes[0][1]) lnf.plot_survival_function(ax=axes[1][0]) llf.plot_survival_function(ax=axes[1][1]) plt.suptitle( 'Implementation of Paramteric Models to create survival functions on the teleco dataset' ) fig.text(0.5, 0.04, 'Timeline', ha='center') fig.text(0.04, 0.5, 'Probability', va='center', rotation='vertical') plt.savefig('Images/SurvivalFunctions.jpeg')
def test_find_best_parametric_model_can_accept_other_models():
    """Smoke test: the call with `additional_models` completes and returns a pair."""
    durations = np.random.exponential(2, 1000)
    extra_models = [ExponentialFitter(), ExponentialFitter()]
    # Unpacking checks that exactly two values come back; nothing else is
    # asserted about them.
    model, score = utils.find_best_parametric_model(
        durations, additional_models=extra_models)
    assert True
def churn_by_partner(model, axes):
    """Fit `model` to customers without and with a partner and draw both on `axes`.

    Reads the module-level `data`, `T` and `E`. The same model object is
    refitted for each cohort and plotted right after each fit.
    """
    partner = data['Partner']
    without_partner = partner == 'No'
    with_partner = partner == 'Yes'

    # First cohort: no partner.
    model.fit(T[without_partner], E[without_partner],
              label='Do not have a partner')
    model.plot(ax=axes)

    # Second cohort: has a partner.
    model.fit(T[with_partner], E[with_partner], label='Have a partner')
    model.plot(ax=axes)


# One panel per parametric family: log-normal, Weibull, log-logistic,
# exponential.
churn_by_partner(LogNormalFitter(), axes[0][0])
churn_by_partner(WeibullFitter(), axes[0][1])
churn_by_partner(LogLogisticFitter(), axes[1][0])
churn_by_partner(ExponentialFitter(), axes[1][1])

# Add titles and labels to the four panels.
plot_details('Partner', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)