def test_survival_difference_at_fixed_point_in_time_test_parametric():
    """Check the fixed-point survival test on two Weibull fits of the Waltons data.

    The data is split on ``group == "miR-137"``; each half is fitted with its
    own ``WeibullFitter`` and the two fits are compared at time 10. The test
    passes when the reported p-value is below 0.05.
    """
    waltons = load_waltons()
    in_group = waltons["group"] == "miR-137"
    selected = waltons.loc[in_group]
    remainder = waltons.loc[~in_group]

    fit_selected = WeibullFitter().fit(selected["T"], selected["E"])
    fit_remainder = WeibullFitter().fit(remainder["T"], remainder["E"])

    outcome = stats.survival_difference_at_fixed_point_in_time_test(10, fit_selected, fit_remainder)
    assert outcome.p_value < 0.05
def test_survival_difference_at_fixed_point_in_time_test_interval_censoring():
    """Check the fixed-point survival test on interval-censored Weibull fits.

    Two fitters are trained through ``fit_interval_censoring`` with identical
    lower and upper bounds: one on ``T`` and one on ``2 * T``. They are
    compared at ``T.mean()`` and the p-value must be below 0.05.
    """
    T1 = np.random.exponential(1e-6, size=1000)
    T2 = np.random.exponential(1e-6, size=1000)
    # Only the element-wise maximum of the two draws is used as the duration.
    T = np.maximum(T1, T2)

    wf1 = WeibullFitter().fit_interval_censoring(T, T)
    wf2 = WeibullFitter().fit_interval_censoring(2 * T, 2 * T)

    result = stats.survival_difference_at_fixed_point_in_time_test(T.mean(), wf1, wf2)
    assert result.p_value < 0.05
def test_label_can_be_changed_on_univariate_fitters(self, block):
    """Plot hazard and cumulative hazard on one axes, each with a custom label.

    ``block`` is forwarded to ``self.plt.show``.
    """
    durations = np.random.exponential(5, size=(2000, 1)) ** 2
    fitter = WeibullFitter().fit(durations, timeline=np.linspace(0, 5))

    # The second curve is drawn on the axes returned by the first plot call.
    shared_ax = fitter.plot_hazard(label="abc")
    fitter.plot_cumulative_hazard(ax=shared_ax, label="123")

    self.plt.title("test_label_can_be_changed_on_univariate_fitters")
    self.plt.show(block=block)
def test_logx_plotting(self, block):
    """Overlay Kaplan-Meier and Weibull survival curves with ``logx=True``.

    Both fitters are trained on ``exp(T)`` of the Waltons data with a
    ``np.logspace(0, 40)`` timeline and drawn on the same axes.
    """
    data = load_waltons()
    durations = np.exp(data["T"])
    observed = data["E"]

    km_fit = KaplanMeierFitter().fit(durations, observed, timeline=np.logspace(0, 40))
    shared_ax = km_fit.plot(logx=True)

    weibull_fit = WeibullFitter().fit(durations, observed, timeline=np.logspace(0, 40))
    weibull_fit.plot_survival_function(logx=True, ax=shared_ax)

    self.plt.title("test_logx_plotting")
    self.plt.show(block=block)
def fit_weibull(df, x_grid=None):
    """Fit a ``WeibullFitter`` to ``df`` and report its parameters.

    Parameters
    ----------
    df
        Table providing the ``"offset"`` (durations) and ``"observed"``
        (event flags) columns passed to ``WeibullFitter.fit``.
    x_grid
        Optional times at which to evaluate the fitted density.

    Returns
    -------
    dict or tuple
        ``{"scale": lambda_, "shape": rho_}`` when ``x_grid`` is ``None``;
        otherwise a ``(params, pdf)`` tuple where ``pdf`` is the result of
        ``density_at_times(x_grid).to_numpy()``.
    """
    fitter = WeibullFitter()
    fitter.fit(df["offset"], df["observed"])

    weibull_params = {"scale": fitter.lambda_, "shape": fitter.rho_}
    if x_grid is None:
        return weibull_params

    density = fitter.density_at_times(x_grid).to_numpy()
    return weibull_params, density
def test_qq_plot_left_censoring_with_known_distribution(self, block):
    """Draw QQ plots of four parametric fits on left-censored ``fisk`` samples.

    Each observation is randomly assigned to one of three groups. Groups 0 and
    1 are floored at the 5th and 10th percentile of the true times; group 2 is
    left untouched. An observation counts as observed when flooring did not
    change it.
    """
    N = 300
    true_times = scipy.stats.fisk(8, 0, 1).rvs(N)
    floors = (np.percentile(true_times, 5), np.percentile(true_times, 10))

    assignment = np.random.randint(3, size=N)
    measured = true_times.copy()
    for group, floor in enumerate(floors):
        measured = np.where(assignment == group, np.maximum(measured, floor), measured)
    observed = true_times == measured

    _, axes = self.plt.subplots(2, 2, figsize=(9, 5))
    fitters = (WeibullFitter(), LogNormalFitter(), LogLogisticFitter(), ExponentialFitter())
    for fitter, panel in zip(fitters, axes.reshape(4)):
        fitter.fit_left_censoring(measured, observed)
        drawn = qq_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle("test_qq_plot_left_censoring_with_known_distribution")
    self.plt.show(block=block)
def test_left_censorship_cdf_plots(self, block):
    """Draw a CDF plot for four parametric fitters on the left-censored NH4 data.

    The event flags are the negation of the ``"Censored"`` column.
    """
    nh4 = load_nh4()
    _, axes = self.plt.subplots(2, 2, figsize=(9, 5))

    fitters = (WeibullFitter(), LogNormalFitter(), LogLogisticFitter(), ExponentialFitter())
    for fitter, panel in zip(fitters, axes.reshape(4)):
        fitter.fit_left_censoring(nh4["NH4.mg.per.L"], ~nh4["Censored"])
        drawn = cdf_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle("test_left_censorship_cdf_plots")
    self.plt.show(block=block)
def test_right_censorship_cdf_plots(self, block):
    """Draw a CDF plot for four parametric fitters on the Rossi data.

    Durations come from the ``"week"`` column and event flags from ``"arrest"``.
    """
    rossi = load_rossi()
    _, axes = self.plt.subplots(2, 2, figsize=(9, 5))

    fitters = (WeibullFitter(), LogNormalFitter(), LogLogisticFitter(), ExponentialFitter())
    for fitter, panel in zip(fitters, axes.reshape(4)):
        fitter.fit(rossi["week"], rossi["arrest"])
        drawn = cdf_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle("test_right_censorship_cdf_plots")
    self.plt.show(block=block)
def test_qq_plot_left_censoring2(self, block):
    """Draw a QQ plot for four parametric fitters on the left-censored LCD data."""
    lcd = load_lcd()
    _, axes = self.plt.subplots(2, 2, figsize=(9, 5))

    fitters = (WeibullFitter(), LogNormalFitter(), LogLogisticFitter(), ExponentialFitter())
    for fitter, panel in zip(fitters, axes.reshape(4)):
        fitter.fit_left_censoring(lcd["T"], lcd["E"])
        drawn = qq_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle("test_qq_plot_left_censoring2")
    self.plt.show(block=block)
def one_detection_limit(N, fraction_below_limit):
    """Fit a Weibull model to simulated data with a single detection limit.

    Parameters
    ----------
    N
        Number of samples to draw.
    fraction_below_limit
        Forwarded unchanged to ``np.percentile`` as the percentile of the true
        durations used as the detection limit, so it is read on a 0-100 scale.

    Returns
    -------
    WeibullFitter
        The fitter trained on the left-censored sample.
    """
    T_actual = 0.5 * np.random.weibull(1, size=N)

    # Values under the detection limit are reported as the limit itself.
    MIN_1 = np.percentile(T_actual, fraction_below_limit)
    T = np.maximum(MIN_1, T_actual)
    E = T_actual > MIN_1

    # ``fit(..., left_censorship=True)`` is the deprecated spelling; use the
    # dedicated left-censoring entry point instead.
    wf = WeibullFitter().fit_left_censoring(T, E)
    return wf
def test_qq_plot_with_weights_and_entry(self, block):
    """Draw a QQ plot for a Weibull fit that uses both weights and entry times.

    Durations, event flags and weights are derived from a small event table
    via ``survival_events_from_table``; every entry time is 0.0001.
    """
    from lifelines.utils import survival_events_from_table

    table = pd.DataFrame(
        {
            "death": [1, 1, 1, 0, 1, 0],
            "censored": [0, 0, 0, 3, 0, 330],
        },
        index=[60, 171, 263, 427, 505, 639],
    )
    T, E, W = survival_events_from_table(table, observed_deaths_col="death", censored_col="censored")

    entry_times = 0.0001 * np.ones_like(T)
    fitter = WeibullFitter().fit(T, E, weights=W, entry=entry_times)
    qq_plot(fitter)

    self.plt.suptitle("test_qq_plot_with_weights_and_entry")
    self.plt.show(block=block)
def test_qq_plot_right_censoring_with_known_distribution(self, block):
    """Draw QQ plots of four parametric fits on right-censored ``fisk`` samples.

    True times and censoring times are drawn from the same ``fisk(8, 0, 1)``
    distribution; the observed duration is the smaller of the two.
    """
    N = 3000
    true_times = scipy.stats.fisk(8, 0, 1).rvs(N)
    censor_times = scipy.stats.fisk(8, 0, 1).rvs(N)
    observed = true_times < censor_times
    durations = np.minimum(true_times, censor_times)

    _, axes = self.plt.subplots(2, 2, figsize=(9, 5))
    fitters = (WeibullFitter(), LogNormalFitter(), LogLogisticFitter(), ExponentialFitter())
    for fitter, panel in zip(fitters, axes.reshape(4)):
        fitter.fit(durations, observed)
        drawn = qq_plot(fitter, ax=panel)
        assert drawn is not None

    self.plt.suptitle("test_qq_plot_right_censoring_with_known_distribution")
    self.plt.show(block=block)
def test_parametric_plotting_with_show_censors(self, block):
    """Show three Weibull plots in turn, each drawn with ``show_censors=True``.

    The cumulative density, survival function and cumulative hazard are
    plotted one after another; each gets its own title and ``show`` call.
    """
    n = 200
    durations = 50 * np.random.exponential(1, size=(n, 1)) ** 2
    observed = np.random.rand(n) > 0.2
    fitter = WeibullFitter().fit(durations, observed, timeline=np.linspace(0, 5, 1000))

    for curve in ("cumulative_density", "survival_function", "cumulative_hazard"):
        draw = getattr(fitter, "plot_" + curve)
        draw(show_censors=True)
        self.plt.title("test_parametric_plotting_with_show_censors:" + curve)
        self.plt.show(block=block)
def test_weibull_plotting(self, block):
    """Show the hazard and cumulative hazard of a Weibull fit, one after the other.

    The fit uses a 100-point timeline on ``[0, 5]``.
    """
    durations = 50 * np.random.exponential(1, size=(200, 1)) ** 2
    fitter = WeibullFitter().fit(durations, timeline=np.linspace(0, 5, 100))

    for curve in ("hazard", "cumulative_hazard"):
        draw = getattr(fitter, "plot_" + curve)
        draw()
        self.plt.title("test_weibull_plotting:" + curve)
        self.plt.show(block=block)
def test_weibull_plotting(self, block):
    """Show the hazard and cumulative hazard of a Weibull fit, one after the other.

    The fit is made without an explicit timeline.
    """
    durations = np.random.exponential(5, size=(2000, 1)) ** 2
    fitter = WeibullFitter().fit(durations)

    for curve in ("hazard", "cumulative_hazard"):
        draw = getattr(fitter, "plot_" + curve)
        draw()
        self.plt.title("test_weibull_plotting:" + curve)
        self.plt.show(block=block)
def _compute_likelihood_ratio_test(self):
    """
    Run a likelihood ratio test of this model against a covariate-free Weibull.

    The null model is a plain ``WeibullFitter`` trained on ``self.durations``
    and ``self.event_observed``; the alternative is this fitted model.

    Returns
    -------
    tuple
        ``(test_stat, degrees_freedom, -log2(p_value))``.
    """
    null_model = WeibullFitter().fit(self.durations, self.event_observed)
    ll_null = null_model._log_likelihood
    ll_alt = self._log_likelihood

    test_stat = 2 * ll_alt - 2 * ll_null
    # Difference in the number of parameters between the two models.
    degrees_freedom = self.params_.shape[0] - 2
    p_value = chisq_test(test_stat, degrees_freedom=degrees_freedom)

    # Silence the numpy warnings raised when taking the log of the p-value.
    with np.errstate(invalid="ignore", divide="ignore"):
        neg_log2_p = -np.log2(p_value)
        return test_stat, degrees_freedom, neg_log2_p
def three_detection_limit(N):
    """Fit a Weibull model to simulated data with several detection limits.

    Each sample is randomly assigned to one of four groups, and its value is
    floored at that group's limit (the 5th, 10th, 30th or 50th percentile of
    the true durations). A sample counts as observed when flooring left it
    unchanged.

    Parameters
    ----------
    N
        Number of samples to draw.

    Returns
    -------
    WeibullFitter
        The fitter trained on the left-censored sample.
    """
    T_actual = 0.5 * np.random.weibull(5, size=N)
    limits = [np.percentile(T_actual, q) for q in (5, 10, 30, 50)]

    T = T_actual.copy()
    ix = np.random.randint(4, size=N)
    for group, limit in enumerate(limits):
        T = np.where(ix == group, np.maximum(T, limit), T)
    E = T_actual == T

    # ``fit(..., left_censorship=True)`` is the deprecated spelling; use the
    # dedicated left-censoring entry point instead.
    wf = WeibullFitter().fit_left_censoring(T, E)
    return wf
def survival(waits):
    """
    Compute a median wait per (year, scenario), using survival analysis when
    the wait times run past the simulation period.

    ``waits`` is a 4-D array indexed as (year, scenario, probability,
    bootstrap). For each (year, scenario) cell the last two axes are
    flattened. If the plain median is below 300 it is used directly;
    otherwise waits above 300 are replaced by ``N_years - year``, flagged as
    censored, and the median survival time of a Weibull fit is used.

    Returns an ``(N_years, N_scen)`` array of medians.
    """
    N_years, N_scen, N_prob, M_boot = np.shape(waits)
    median = np.zeros([N_years, N_scen])

    # NOTE(review): the nesting of the two progress prints was not recoverable
    # from the flattened source; confirm they sit at the intended loop levels.
    for year in range(N_years):
        remaining_years = N_years - year
        for GCM in range(N_scen):
            # Work on a flattened copy so the caller's array is never modified.
            flat_waits = np.array(waits[year, GCM, :, :]).reshape([N_prob * M_boot])
            plain_median = np.median(flat_waits)

            if plain_median < 300:
                median[year, GCM] = plain_median
            else:
                observed = flat_waits < 300.
                flat_waits[flat_waits > 300.] = remaining_years
                fitted = WeibullFitter().fit(flat_waits, observed)
                median[year, GCM] = fitted.median_survival_time_
            print('survival', GCM)
        print(remaining_years + 1)

    return median
# Generate a general Weibull distribution
heading = 'General Weibull distribution:'
print(heading)
print(len(heading) * '-')

# bool_up = (df.Type == 'RunTime')
# bool_down = ((df.Type == 'DownTime') & (df.ReasonId.isin(reasons_relative)))
# continue_obs = ((df.Type == 'DownTime') & (df.ReasonId.isin(reasons_absolute + reasons_not_considered + reasons_availability)))
# stop_obs = (df.Type == 'Break')

# Rows that are a stop of any kind; shared by the three masks below.
is_down_or_break = df_task['Type'].isin(['DownTime', 'Break'])

bool_up = (df_task['Type'] == 'RunTime')  # List of all RunTimes
bool_down = is_down_or_break & (df_task['ReasonId'].isin(reasons_relative))  # List of all DownTimes in calculation
bool_ignore = is_down_or_break & (df_task['ReasonId'].isin(reasons_availability + reasons_absolute))  # List of all breaks to ignore
bool_break = is_down_or_break & (df_task['ReasonId'].isin(reasons_break))  # List of all breaks to stop observation

# Durations are divided by 3600 before being handed to the helper.
uptime, downtime, obs_up, obs_down = duration_run_down(
    list(df_task['Duration'] / 3600),
    list(bool_up),
    list(bool_down),
    list(bool_ignore),
    list(bool_break),
    observation=True)

wf = WeibullFitter()
try:
    wf.fit(uptime, obs_up)
    weib = Weibull(wf.lambda_, wf.rho_)
except Exception:
    # Dump the durations that broke the fit, then let the error propagate.
    # Interpreter-level exits (KeyboardInterrupt, SystemExit) pass straight through.
    print(uptime)
    raise

if print_all:
    print(weib)

if export_all:
    general_dist = ET.SubElement(root, 'general_dist')
    general_dist.text = 'weibull'
    general_dist.set("lambda", str(wf.lambda_))
    general_dist.set("rho", str(wf.rho_))
    general_dist.set("mean", str(weib.mean_time()))

# NOTE(review): nesting was not recoverable from the flattened source; this
# call is placed outside the ``export_all`` branch — confirm.
plot_hist(uptime, obs_up, 99, weib)
# NOTE(review): the next two statements presumably belong to a loop whose
# header lies before this view — confirm their nesting.
kmf.fit(T[is_chn], event_observed=E[is_chn], label=chn)
kmf.plot(ax=ax)

# PURPOSE
# Refit and plot once per distinct PURPOSE value, all on one fresh axes.
ax = plt.subplot()
for purpose in df_cox.PURPOSE.unique():
    is_pur = (df_cox.PURPOSE == purpose)
    kmf.fit(T[is_pur], event_observed=E[is_pur], label=purpose)
    kmf.plot(ax=ax)

############################################################
# WeibullFitter
############################################################
from lifelines import WeibullFitter

# Fit on the full sample, print the two fitted parameters and the summary, then plot.
wf = WeibullFitter()
wf.fit(T, E)
print(wf.lambda_, wf.rho_)
wf.print_summary()
wf.plot()

############################################################
# NelsonAalenFitter
############################################################
from lifelines import NelsonAalenFitter

# Fit the Nelson-Aalen estimator on the same sample and plot it.
naf = NelsonAalenFitter()
naf.fit(T, event_observed=E)
naf.plot()
# NOTE(review): the enclosing ``def`` line is outside this view; the loop below
# reads like the body of ``churn_by_contract(model, axes)`` — confirm its
# nesting before applying.
# Three cohorts are compared on the basis of the contract.
groups = data['Contract']
for contract in ('Month-to-month', 'Two year', 'One year'):
    in_cohort = (groups == contract)
    # Refit the same model object per cohort and draw it on the shared axes.
    model.fit(T[in_cohort], E[in_cohort], label=contract)
    model.plot(ax=axes)

# Churn by contract for the lognormal model
churn_by_contract(LogNormalFitter(), axes[0][0])

# Churn by contract for the weibull model
churn_by_contract(WeibullFitter(), axes[0][1])

# Churn by contract for the loglogistic model
churn_by_contract(LogLogisticFitter(), axes[1][0])

# Churn by contract for the Exponential model
churn_by_contract(ExponentialFitter(), axes[1][1])

# Function for adding subtitles and labels
plot_details('Contract', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)
def churn_by_gender(model, axes):
    """Fit ``model`` to each gender cohort in turn and plot both on ``axes``.

    Parameters
    ----------
    model
        An already-constructed fitter exposing ``fit(T, E, label=...)`` and
        ``plot(ax=...)``. The same object is refitted for each cohort.
    axes
        The axes passed to every ``model.plot`` call.

    Reads the names ``data``, ``T`` and ``E`` from the enclosing scope.
    """
    groups = data['gender']
    for gender in ('Male', 'Female'):
        in_cohort = (groups == gender)
        model.fit(T[in_cohort], E[in_cohort], label=gender)
        model.plot(ax=axes)


# Churn by gender for the lognormal model
churn_by_gender(LogNormalFitter(), axes[0][0])

# Churn by gender for the weibull model
churn_by_gender(WeibullFitter(), axes[0][1])

# Churn by gender for the loglogistic model
churn_by_gender(LogLogisticFitter(), axes[1][0])

# Churn by gender for the Exponential model
churn_by_gender(ExponentialFitter(), axes[1][1])

# Function for adding subtitles and labels
plot_details('Gender', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)
# NOTE(review): the enclosing ``def`` line is outside this view; the loop below
# reads like the body of ``churn_by_subscribe(model, axes)`` — confirm its
# nesting before applying.
# Two cohorts are compared: users without and with a StreamingTV subscription.
groups = data['StreamingTV']
cohorts = (
    ('No', 'Not Subscribed StreamingTV'),
    ('Yes', 'Subscribed StreamingTV'),
)
for answer, cohort_label in cohorts:
    in_cohort = (groups == answer)
    # Refit the same model object per cohort and draw it on the shared axes.
    model.fit(T[in_cohort], E[in_cohort], label=cohort_label)
    model.plot(ax=axes)

# Churn by subscribe for the lognormal model
churn_by_subscribe(LogNormalFitter(), axes[0][0])

# Churn by subscribe for the weibull model
churn_by_subscribe(WeibullFitter(), axes[0][1])

# Churn by subscribe for the loglogistic model
churn_by_subscribe(LogLogisticFitter(), axes[1][0])

# Churn by subscribe for the Exponential model
churn_by_subscribe(ExponentialFitter(), axes[1][1])

# Function for adding subtitles and labels
plot_details('Subscribed', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)
# -*- coding: utf-8 -*-
# Compare the cumulative-hazard confidence interval reported by lifelines at
# t = 1.0 with a percentile bootstrap interval computed on the same sample.
import numpy as np
from lifelines import WeibullFitter

lambda_, rho_ = 2, 0.5
N = 10000

# Event and censoring times are drawn the same way; the smaller one is observed.
T_actual = lambda_ * np.random.exponential(1, size=N) ** (1 / rho_)
T_censor = lambda_ * np.random.exponential(1, size=N) ** (1 / rho_)
T = np.minimum(T_actual, T_censor)
E = T_actual < T_censor

time = [1.0]

# lifelines computed confidence interval
# The fitter must be created before it is fitted; without this line the
# print below fails with a NameError.
wf = WeibullFitter()
print(wf.fit(T, E, timeline=time).confidence_interval_cumulative_hazard_)

bootstrap_samples = 10000
results = []
for _ in range(bootstrap_samples):
    # Resample N observations with replacement (indices in [0, N)).
    ix = np.random.randint(0, N, N)
    wf = WeibullFitter().fit(T[ix], E[ix], timeline=time)
    results.append(wf.cumulative_hazard_at_times(time).values[0])

print(np.percentile(results, [2.5, 97.5]))
MIN_2 = np.percentile(T_actual, 30)
MIN_3 = np.percentile(T_actual, 50)

# Floor each sample at the detection limit of its randomly assigned group.
T = T_actual.copy()
ix = np.random.randint(4, size=N)
T = np.where(ix == 0, np.maximum(T, MIN_0), T)
T = np.where(ix == 1, np.maximum(T, MIN_1), T)
T = np.where(ix == 2, np.maximum(T, MIN_2), T)
T = np.where(ix == 3, np.maximum(T, MIN_3), T)
# A sample is observed when flooring left it unchanged.
E = T_actual == T

fig, axes = plt.subplots(2, 2, figsize=(9, 5))
axes = axes.reshape(4)

# NOTE(review): ``left_censorship=True`` and ``left_censorship_cdf_plot`` look
# like the older lifelines spellings — confirm against the installed version.
for i, model in enumerate([WeibullFitter(), KaplanMeierFitter(), LogNormalFitter(), LogLogisticFitter()]):
    # Every fitter here, KaplanMeierFitter included, takes the same fit
    # arguments, so no type dispatch is needed.
    model.fit(T, E, left_censorship=True, label=model.__class__.__name__)
    model.plot_cumulative_density(ax=axes[i])
plt.tight_layout()

# One new two-panel figure per parametric model: CDF plot on top, QQ plot below.
for i, model in enumerate([WeibullFitter(), LogNormalFitter(), LogLogisticFitter()]):
    model.fit(T, E, left_censorship=True)
    fig, axes = plt.subplots(2, 1, figsize=(8, 6))
    left_censorship_cdf_plot(model, ax=axes[0])
    qq_plot(model, ax=axes[1])
# -*- coding: utf-8 -*-
# Print the cumulative-hazard confidence interval lifelines reports at
# t = 1.0, then a percentile bootstrap interval from the same sample.
import numpy as np
from lifelines import WeibullFitter

lambda_, rho_ = 2, 0.5
N = 10000

# Event and censoring times share one recipe; the smaller of the two is observed.
T_actual = lambda_ * np.random.exponential(1, size=N) ** (1 / rho_)
T_censor = lambda_ * np.random.exponential(1, size=N) ** (1 / rho_)
T = np.minimum(T_actual, T_censor)
E = T_actual < T_censor

time = [1.0]

# Interval computed by lifelines itself.
wf = WeibullFitter()
wf.fit(T, E, timeline=time)
print(wf.confidence_interval_cumulative_hazard_)

# Interval from refitting on resamples drawn with replacement.
bootstrap_samples = 10000
results = []
for _ in range(bootstrap_samples):
    ix = np.random.randint(0, N, N)
    wf = WeibullFitter().fit(T[ix], E[ix], timeline=time)
    hazard_at_time = wf.cumulative_hazard_at_times(time).values[0]
    results.append(hazard_at_time)

print(np.percentile(results, [2.5, 97.5]))
if __name__ == '__main__':
    # =============================================================================
    # Example dataset from Lifelines
    # https://lifelines.readthedocs.io/en/latest/Survival%20analysis%20with%20lifelines.html
    # =============================================================================
    df = load_waltons()
    T, E = df['T'], df['E']

    # kmf = KaplanMeierFitter()
    # kmf.fit(T, event_observed=E)
    # kmf.plot()
    # kmf.cumulative_density_.plot(figsize=(7,6))

    # Both fits happen before the figure is created, as in the plotting below.
    wf = WeibullFitter().fit(T, E)
    naf = NelsonAalenFitter()
    naf.fit(T, event_observed=E)

    # Draw the Nelson-Aalen estimate first, then the Weibull fit.
    plt.figure(figsize=(8, 6))
    naf.plot()
    wf.plot()
    plt.title('cumulative hazard (Waltons dataset)')

    print('fitted Weibull parameters (MLE):')
    for template, value in (('\tlambda = {}', wf.lambda_), ('\trho = {}', wf.rho_)):
        print(template.format(value))
# -*- coding: utf-8 -*-
# Time a single WeibullFitter fit on a large simulated, right-censored sample.
if __name__ == "__main__":
    import pandas as pd
    import numpy as np
    import time

    from lifelines import WeibullFitter

    np.random.seed(1)
    N = 250000

    # The draws below must stay in this order to reproduce the seeded sample.
    mu = 3 * np.random.randn()
    sigma = np.random.uniform(0.1, 3.0)
    X = np.exp(sigma * np.random.randn(N) + mu)
    C = np.exp(np.random.randn(N) + mu)

    # An event is observed when it happens no later than its censoring time.
    E = X <= C
    T = np.minimum(X, C)

    wb = WeibullFitter()
    start_time = time.time()
    wb.fit(T, E)
    elapsed = time.time() - start_time
    print("--- %s seconds ---" % elapsed)
    wb.print_summary(5)
def test_parametric_plotting_with_show_censors(self, block):
    """Show four Weibull plots in turn, each drawn with ``show_censors=True``.

    Durations are capped at 100, and anything that reached the cap is flagged
    as censored. A density plot is drawn first, before the titled series
    starts, as the original test does.
    """
    n = 200
    raw_durations = (np.sqrt(50) * np.random.exponential(1, size=n)) ** 2
    observed = raw_durations < 100
    durations = np.minimum(raw_durations, 100)

    fitter = WeibullFitter().fit(durations, observed)
    fitter.plot_density(show_censors=True)

    for curve in ("cumulative_density", "survival_function", "cumulative_hazard", "density"):
        draw = getattr(fitter, "plot_" + curve)
        draw(show_censors=True)
        self.plt.title("test_parametric_plotting_with_show_censors:" + curve)
        self.plt.show(block=block)
# -*- coding: utf-8 -*-
# Compare the cumulative-hazard confidence interval reported by lifelines at
# t = 1.0 with a percentile bootstrap interval computed on the same sample.
import numpy as np
from lifelines import WeibullFitter

lambda_, rho_ = 2, 0.5
N = 10000

# Event and censoring times are drawn the same way; the smaller one is observed.
T_actual = lambda_ * np.random.exponential(1, size=N) ** (1 / rho_)
T_censor = lambda_ * np.random.exponential(1, size=N) ** (1 / rho_)
T = np.minimum(T_actual, T_censor)
E = T_actual < T_censor

time = [1.0]

# lifelines computed confidence interval
# The fitter must be created before it is fitted; without this line the
# print below fails with a NameError.
wf = WeibullFitter()
print(wf.fit(T, E, timeline=time).confidence_interval_cumulative_hazard_)

bootstrap_samples = 10000
results = []
for _ in range(bootstrap_samples):
    # Resample N observations with replacement (indices in [0, N)).
    ix = np.random.randint(0, N, N)
    wf = WeibullFitter().fit(T[ix], E[ix], timeline=time)
    results.append(wf.cumulative_hazard_at_times(time).values[0])

print(np.percentile(results, [2.5, 97.5]))
# NOTE(review): the next line is the tail of a parenthesised import that
# starts before this view.
LogLogisticFitter)
import pandas as pd

data = pd.read_csv('Dataset/telco_customer.csv')
# Coerce tenure to a numeric dtype and keep only rows with a positive tenure.
data['tenure'] = pd.to_numeric(data['tenure'])
data = data[data['tenure'] > 0]

# Replace Yes and No in the Churn column with 1 and 0: 1 for the event and 0 for censored data.
data['Churn'] = data['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Durations and event flags shared by all four fits below.
T = data['tenure']
E = data['Churn']

# Fit each parametric model on the full sample.
wbf = WeibullFitter().fit(T, E, label='WeibullFitter')
ef = ExponentialFitter().fit(T, E, label='ExponentialFitter')
lnf = LogNormalFitter().fit(T, E, label='LogNormalFitter')
llf = LogLogisticFitter().fit(T, E, label='LogLogisticFitter')

# One cumulative-hazard plot per model, one panel each.
wbf.plot_cumulative_hazard(ax=axes[0][0])
ef.plot_cumulative_hazard(ax=axes[0][1])
lnf.plot_cumulative_hazard(ax=axes[1][0])
llf.plot_cumulative_hazard(ax=axes[1][1])

plt.suptitle(
    'Parametric Model Implementation of the Telco dataset using different models'
)
# Shared axis captions placed in figure coordinates.
fig.text(0.5, 0.04, 'Timeline', ha='center')
# NOTE(review): the panels show cumulative hazards, so 'Probability' may not
# be the intended y-axis caption — confirm.
fig.text(0.04, 0.5, 'Probability', va='center', rotation='vertical')