def plot_survival_curve(kmf: KaplanMeierFitter,
                        exf: ExponentialFitter,
                        state: int,
                        stat_counts: Tuple[int, int, int],
                        save_path: Path,
                        x_right_lim: float = None):
    """Plot a Kaplan-Meier curve with an exponential fit and save it as SVG.

    Args:
        kmf: Fitted Kaplan-Meier estimator; its ``plot()`` creates the axes.
        exf: Fitted exponential model, drawn on the same axes without its
            confidence band. Its ``lambda_`` is printed on the figure.
        state: Index (0 or 1) selecting the subscripts used in the axis
            label ('on'/'off') and in the parameter text ('obs'/'off').
        stat_counts: Three integers written on the figure as "a, b, c".
        save_path: Destination of the SVG file.
        x_right_lim: Optional right limit of the x axis; when ``None`` the
            right limit is left to matplotlib.

    Side effects:
        Sets ``plt.rcParams['svg.fonttype']`` to ``'none'`` (a global
        setting), writes ``save_path`` and closes the current figure.
    """
    on_off_str = ['on', 'off']
    obs_off_str = ['obs', 'off']

    ax = kmf.plot()
    exf.plot_survival_function(ax=ax, ci_show=False)
    ax.get_legend().remove()

    plt.xlabel(r'$\tau_{{{}}}$ (s)'.format(on_off_str[state]), fontsize=16)
    plt.ylabel('probability', fontsize=16)

    # Annotations are positioned in axes coordinates so they stay put
    # regardless of the data range.
    k_str = r'$k_{{{}}}$ = {:.1f} s'.format(obs_off_str[state], exf.lambda_)
    string = '{}, {}, {}'.format(*stat_counts)
    plt.text(0.6, 0.8, k_str, transform=ax.transAxes, fontsize=14)
    plt.text(0.6, 0.6, string, transform=ax.transAxes, fontsize=14)

    plt.xlim(left=0)
    if x_right_lim is not None:
        plt.xlim(right=x_right_lim)

    # Keep text as text (not outlines) in the SVG output.
    plt.rcParams['svg.fonttype'] = 'none'
    # The savefig keyword is lowercase ``transparent``; the capitalised
    # spelling is not a savefig parameter and never enabled transparency.
    plt.savefig(save_path,
                format='svg',
                transparent=True,
                dpi=300,
                bbox_inches='tight')
    plt.close()
Ejemplo n.º 2
0
    def test_qq_plot_left_censoring_with_known_distribution(self, block):
        """Draw QQ plots for four parametric fitters on left-censored data.

        Samples are drawn from ``scipy.stats.fisk(8, 0, 1)``. Each sample is
        randomly assigned to one of three groups: group 0 is floored at the
        5th percentile, group 1 at the 10th percentile, group 2 is left
        unchanged. A sample counts as observed when flooring did not alter it.
        """
        n_samples = 300
        true_times = scipy.stats.fisk(8, 0, 1).rvs(n_samples)

        floor_p5 = np.percentile(true_times, 5)
        floor_p10 = np.percentile(true_times, 10)

        observed = true_times.copy()
        group = np.random.randint(3, size=n_samples)

        observed = np.where(group == 0, np.maximum(observed, floor_p5),
                            observed)
        observed = np.where(group == 1, np.maximum(observed, floor_p10),
                            observed)
        is_observed = true_times == observed

        _, grid = self.plt.subplots(2, 2, figsize=(9, 5))
        panels = grid.reshape(4)
        fitters = [
            WeibullFitter(),
            LogNormalFitter(),
            LogLogisticFitter(),
            ExponentialFitter(),
        ]
        for fitter, panel in zip(fitters, panels):
            fitter.fit_left_censoring(observed, is_observed)
            ax = qq_plot(fitter, ax=panel)
            assert ax is not None

        self.plt.suptitle(
            "test_qq_plot_left_censoring_with_known_distribution")
        self.plt.show(block=block)
Ejemplo n.º 3
0
def test_rmst_exactely_with_known_solution():
    """Check the RMST of a fitted exponential model against closed forms.

    With no time limit the value must match the fitted ``lambda_``; with the
    limit set to ``lambda_`` it must match ``lambda_ * (e - 1) / e``.
    """
    durations = np.random.exponential(2, 100)
    fitted = ExponentialFitter().fit(durations)
    scale = fitted.lambda_

    unrestricted = utils.restricted_mean_survival_time(fitted)
    assert abs(unrestricted - scale) < 0.001

    restricted = utils.restricted_mean_survival_time(fitted, t=scale)
    expected = scale * (np.e - 1) / np.e
    assert abs(restricted - expected) < 0.001
def find_two_state_dwell_time(parameter_file_path: Path,
                              sheet_list: List[str]):
    """Fit and plot dwell-time survival curves for each sheet and state.

    For every sheet name in ``sheet_list`` the matching sheet of the Excel
    file is read, and each entry of its ``filename`` column is loaded from
    ``<datapath>/<filename>_all.json``. Intervals of the AOIs listed under
    the analyzable category '1' are pooled. For each of the two states
    ('low', 'high') a Kaplan-Meier and an exponential model are fitted to
    the dwell times and the figure is saved as
    ``<datapath>/<sheet>_<state>_dwell.svg``.

    Files that are missing are reported and skipped. A sheet for which no
    file could be loaded is reported and skipped entirely.

    Args:
        parameter_file_path: Excel workbook holding one sheet per dataset.
        sheet_list: Names of the sheets to process.
    """
    datapath = imscrollIO.def_data_path()
    state_category = '1'
    state_list = ['low', 'high']

    im_format = 'svg'
    for i_sheet in sheet_list:
        dfs = pd.read_excel(parameter_file_path, sheet_name=i_sheet)
        interval_list = []
        n_good_traces = 0
        # Reset per sheet so data from a previous sheet can never leak into
        # this one when every file of the current sheet is missing.
        last_loaded = None
        for filestr in dfs.filename:
            try:
                all_data, AOI_categories = binding_kinetics.load_all_data(
                    datapath / (filestr + '_all.json'))
            except FileNotFoundError:
                print('{} file not found'.format(filestr))
                continue

            print(filestr + ' loaded')
            last_loaded = all_data
            if state_category in AOI_categories['analyzable']:
                aoi_list = AOI_categories['analyzable'][state_category]
                n_good_traces += len(aoi_list)
                interval_list.append(all_data['intervals'].sel(AOI=aoi_list))

        if last_loaded is None:
            # Previously this fell through to an undefined (or stale)
            # ``all_data``; there is nothing to plot for this sheet.
            print('no data file loaded for sheet {}'.format(i_sheet))
            continue

        # The x-axis limit is taken from the last successfully loaded file.
        max_time = last_loaded['data'].time.values.max()
        for i, item in enumerate(state_list):
            dwells = binding_kinetics.extract_dwell_time(interval_list, i)
            if len(dwells.duration) == 0:
                print('no {} state found'.format(item))
                continue
            kmf = KaplanMeierFitter()
            exf = ExponentialFitter()
            kmf.fit(dwells.duration, dwells.event_observed)
            exf.fit(dwells.duration, dwells.event_observed)
            n_event = np.count_nonzero(dwells.event_observed)
            n_censored = len(dwells.event_observed) - n_event
            stat_counts = (n_event, n_censored, n_good_traces)
            save_fig_path = datapath / (i_sheet + '_' + item + '_dwell' + '.' +
                                        im_format)
            plot_survival_curve(kmf,
                                exf,
                                i,
                                stat_counts,
                                save_fig_path,
                                x_right_lim=max_time)
Ejemplo n.º 5
0
def test_rmst_approximate_solution():
    """Compare the model-based RMST with the one computed from its curve.

    The RMST obtained from the fitted model and the one obtained from the
    model's ``survival_function_`` must agree to within 0.001 at
    ``t = lambda_``. An ``ApproximationWarning`` is expected inside the
    block.
    """
    durations = np.random.exponential(2, 4000)
    grid = np.linspace(0, durations.max(), 10000)
    fitted = ExponentialFitter().fit(durations, timeline=grid)
    horizon = fitted.lambda_

    with pytest.warns(exceptions.ApproximationWarning):
        from_model = utils.restricted_mean_survival_time(fitted, t=horizon)
        from_curve = utils.restricted_mean_survival_time(
            fitted.survival_function_, t=horizon)
        assert abs(from_model - from_curve) < 0.001
Ejemplo n.º 6
0
def test_rmst_variance():
    """Check RMST mean and variance of an exponential fit at ``t = 1``.

    The reference values are built from the fitted hazard
    ``1 / lambda_``: a mean, a second-moment term, and their difference
    ``second_moment - mean ** 2`` as the variance.
    """
    durations = np.random.exponential(2, 1000)
    fitted = ExponentialFitter().fit(durations)
    rate = 1 / fitted.lambda_
    horizon = 1

    decay = np.exp(-rate * horizon)
    expected_mean = 1 / rate * (1 - decay)
    second_moment = 2 / rate ** 2 * (1 - decay * (1 + rate * horizon))
    expected_var = second_moment - expected_mean ** 2

    mean_result = utils.restricted_mean_survival_time(
        fitted, t=horizon, return_variance=True)
    assert abs(mean_result[0] - expected_mean) < 0.001

    var_result = utils.restricted_mean_survival_time(
        fitted, t=horizon, return_variance=True)
    assert abs(var_result[1] - expected_var) < 0.001
Ejemplo n.º 7
0
 def test_qq_plot_left_censoring2(self, block):
     """Draw QQ plots for four parametric fitters on the ``load_lcd`` data.

     Each fitter is fitted with left censoring on columns "T" and "E" and
     plotted on its own panel of a 2x2 figure.
     """
     frame = load_lcd()
     _, grid = self.plt.subplots(2, 2, figsize=(9, 5))
     panels = grid.reshape(4)
     fitters = [
         WeibullFitter(),
         LogNormalFitter(),
         LogLogisticFitter(),
         ExponentialFitter(),
     ]
     for fitter, panel in zip(fitters, panels):
         fitter.fit_left_censoring(frame["T"], frame["E"])
         ax = qq_plot(fitter, ax=panel)
         assert ax is not None
     self.plt.suptitle("test_qq_plot_left_censoring2")
     self.plt.show(block=block)
Ejemplo n.º 8
0
 def test_right_censorship_cdf_plots(self, block):
     """Draw CDF plots for four parametric fitters on the ``load_rossi`` data.

     Each fitter is fitted on columns "week" (durations) and "arrest"
     (event flag) and plotted on its own panel of a 2x2 figure.
     """
     frame = load_rossi()
     _, grid = self.plt.subplots(2, 2, figsize=(9, 5))
     panels = grid.reshape(4)
     fitters = [
         WeibullFitter(),
         LogNormalFitter(),
         LogLogisticFitter(),
         ExponentialFitter(),
     ]
     for fitter, panel in zip(fitters, panels):
         fitter.fit(frame["week"], frame["arrest"])
         ax = cdf_plot(fitter, ax=panel)
         assert ax is not None
     self.plt.suptitle("test_right_censorship_cdf_plots")
     self.plt.show(block=block)
Ejemplo n.º 9
0
 def test_left_censorship_cdf_plots(self, block):
     """Draw CDF plots for four parametric fitters on the ``load_nh4`` data.

     Each fitter is fitted with left censoring on column "NH4.mg.per.L",
     using the negation of column "Censored" as the observed-event flag.
     """
     frame = load_nh4()
     _, grid = self.plt.subplots(2, 2, figsize=(9, 5))
     panels = grid.reshape(4)
     fitters = [
         WeibullFitter(),
         LogNormalFitter(),
         LogLogisticFitter(),
         ExponentialFitter(),
     ]
     for fitter, panel in zip(fitters, panels):
         fitter.fit_left_censoring(frame["NH4.mg.per.L"], ~frame["Censored"])
         ax = cdf_plot(fitter, ax=panel)
         assert ax is not None
     self.plt.suptitle("test_left_censorship_cdf_plots")
     self.plt.show(block=block)
Ejemplo n.º 10
0
    def test_qq_plot_right_censoring_with_known_distribution(self, block):
        """Draw QQ plots for four parametric fitters on right-censored data.

        Event times and censoring times are both drawn from
        ``scipy.stats.fisk(8, 0, 1)``. The observed duration is the smaller
        of the two, and the event is observed when the event time is
        strictly smaller than the censoring time.
        """
        n_samples = 3000
        event_times = scipy.stats.fisk(8, 0, 1).rvs(n_samples)
        censor_times = scipy.stats.fisk(8, 0, 1).rvs(n_samples)
        is_observed = event_times < censor_times
        durations = np.minimum(event_times, censor_times)

        _, grid = self.plt.subplots(2, 2, figsize=(9, 5))
        panels = grid.reshape(4)
        fitters = [
            WeibullFitter(),
            LogNormalFitter(),
            LogLogisticFitter(),
            ExponentialFitter(),
        ]
        for fitter, panel in zip(fitters, panels):
            fitter.fit(durations, is_observed)
            ax = qq_plot(fitter, ax=panel)
            assert ax is not None

        self.plt.suptitle(
            "test_qq_plot_right_censoring_with_known_distribution")
        self.plt.show(block=block)
Ejemplo n.º 11
0
def bayesian_model_estimation(T, E, iter_interpolate=2, n_pts=20):
    """Plot survival fits and estimate the exponential rate on a grid.

    The function first plots Kaplan-Meier, Nelson-Aalen and exponential
    fits of the data, then a survival curve comparing the fitted
    exponential with the target one. It then evaluates a posterior over the
    rate ``lambda`` on a grid under a flat prior. After each iteration but
    the last, the grid is re-spaced along the posterior's cumulative sum so
    that the next iteration concentrates its points where the mass is.

    Args:
        T: Durations.
        E: Binary event flags aligned with ``T``; 1 marks an observed
            event, anything else is treated as censored.
        iter_interpolate: Number of posterior evaluations (int, min. 1).
        n_pts: Number of grid points in the posterior.

    Notes:
        Reads the module-level name ``target_rate`` (not a parameter).
        Prints progress and creates several matplotlib figures; returns
        ``None``.
    """
    # Plot non-parametric curves
    kmf = KaplanMeierFitter()
    kmf.fit(T, event_observed=E)
    kmf.plot()

    naf = NelsonAalenFitter()
    naf.fit(T, event_observed=E)
    plt.figure(figsize=(7, 6))
    naf.plot()
    plt.title('Cumulative hazard rate')

    # Fit exponential cumulative hazard model
    # See https://lifelines.readthedocs.io/en/latest/Survival%20analysis%20with%20lifelines.html
    exf = ExponentialFitter().fit(T, E, label='ExponentialFitter')
    exf.plot_cumulative_hazard()
    # The rate is reported as the reciprocal of the fitted ``lambda_``.
    print('fitted lambda = {}'.format(1 / exf.lambda_))

    # Plot groundtruth curve
    plt.figure(figsize=(7, 6))
    x = np.arange(1, 30)
    plt.plot(x,
             expon(scale=1 / target_rate).sf(x),
             'g--',
             lw=2.5,
             alpha=.6,
             label='target')
    plt.plot(x,
             expon(scale=exf.lambda_).sf(x),
             'r-',
             lw=3,
             alpha=.7,
             label='fitted')
    plt.legend()
    plt.xlabel('duration (time since event arrival')
    plt.title('Survival curve')

    # Bayesian inference of lambda
    # ============================
    # NOTE: the initial grid starts at lambda = 0, so ``1 / lam_range``
    # contains an infinite scale at that point (numpy emits a warning).
    lam_range = np.linspace(0, .2, n_pts)
    for it in range(1, iter_interpolate + 1):
        print('\niteration {}'.format(it))
        # Flat prior over the grid. Its log is a constant offset, which the
        # max-shift and normalisation below remove again.
        prior = np.ones_like(lam_range)
        prior /= np.sum(prior)
        logprior = np.log(prior)

        # Accumulate the likelihood in log space; the plain product
        # underflows for realistic sample sizes. The frozen distribution
        # only depends on the grid, so build it once per iteration.
        grid_dist = expon(scale=1 / lam_range)
        logpost = logprior.copy()
        for duration, event_flag in zip(T, E):
            if event_flag == 1:
                logpost += grid_dist.logpdf(duration)
            else:
                logpost += grid_dist.logsf(duration)
        # Trick: shift entire log dist. by max.loglikel. before
        # exponentiation to reduce potential underflow.
        maxlogl = np.max(logpost)
        post = np.exp(logpost - maxlogl)
        post /= np.sum(post)
        ExpectedVal = np.dot(lam_range, post)
        print('Mean of lambda posterior = {}'.format(ExpectedVal))
        print('MAE = {}'.format(np.abs(ExpectedVal - target_rate)))

        # Plot lambda posterior
        plt.figure(figsize=(7, 6))
        plt.plot(lam_range, post, 'b.-', lw=1, label='Bayes')
        marker_height = 1.2 * np.max(post)
        plt.vlines(1 / exf.lambda_,
                   0,
                   marker_height,
                   color='m',
                   lw=3,
                   alpha=.6,
                   label='MLE')
        plt.vlines(target_rate,
                   0,
                   marker_height,
                   color='orange',
                   lw=3,
                   alpha=.9,
                   label='target')
        plt.vlines(ExpectedVal,
                   0,
                   marker_height,
                   color='b',
                   lw=3,
                   alpha=.6,
                   label='Bayes EV')
        plt.legend()
        plt.title('Lambda estimate (iteration {})'.format(it))
        plt.xlabel('lambda')

        # Refine posterior grid evaluation points for the NEXT iteration.
        # Skipped after the last one: the refined grid would be unused, and
        # the interpolation can fail when the requested quantiles fall
        # outside the range of the cumulative sum.
        if it < iter_interpolate:
            cumul_prob_dens = post.cumsum()
            f = interp1d(cumul_prob_dens, lam_range)
            cdf_new_grid_pts = np.linspace(1e-2, 1 - 1e-2, n_pts)
            lam_range = f(cdf_new_grid_pts)
Ejemplo n.º 12
0
    model = model  # instantiate the class to create an object for the input model

    # Two Cohorts are compared. 1. Streaming TV Not Subsribed by Users, 2. Streaming TV subscribed by the users.
    groups = data['StreamingTV']
    # group i1 , having the pandas series for the 1st cohort
    i1 = (groups == 'No')
    # group i2 , having the pandas series for the 2nd cohort
    i2 = (groups == 'Yes')

    # fit the model for 1st cohort
    model.fit(T[i1], E[i1], label='Not Subscribed StreamingTV')
    a1 = model.plot(ax=axes)

    # fit the model for 2nd cohort
    model.fit(T[i2], E[i2], label='Subscribed StreamingTV')
    model.plot(ax=axes)


# Draw the StreamingTV comparison once per parametric family, in the order
# lognormal, Weibull, log-logistic, exponential, one subplot each.
for fitter_cls, panel in (
        (LogNormalFitter, axes[0][0]),
        (WeibullFitter, axes[0][1]),
        (LogLogisticFitter, axes[1][0]),
        (ExponentialFitter, axes[1][1]),
):
    churn_by_subscribe(fitter_cls(), panel)

# Function for adding subtitles and labels
plot_details('Subscribed', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)
Ejemplo n.º 13
0
import pandas as pd

# Load the Telco churn data and keep only customers with a positive tenure.
# ``.copy()`` makes the filtered frame independent, so the column assignment
# below does not write into a slice of the original frame
# (pandas SettingWithCopyWarning).
data = pd.read_csv('Dataset/telco_customer.csv')
data['tenure'] = pd.to_numeric(data['tenure'])
data = data[data['tenure'] > 0].copy()

# Replace Yes and No in the Churn column with 1 and 0: 1 for the event and
# 0 for the censored data.
data['Churn'] = data['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

T = data['tenure']
E = data['Churn']

# Fit one parametric model per family on the full data set.
wbf = WeibullFitter().fit(T, E, label='WeibullFitter')
ef = ExponentialFitter().fit(T, E, label='ExponentialFitter')
lnf = LogNormalFitter().fit(T, E, label='LogNormalFitter')
llf = LogLogisticFitter().fit(T, E, label='LogLogisticFitter')

# One cumulative-hazard plot per model, one subplot each.
wbf.plot_cumulative_hazard(ax=axes[0][0])
ef.plot_cumulative_hazard(ax=axes[0][1])
lnf.plot_cumulative_hazard(ax=axes[1][0])
llf.plot_cumulative_hazard(ax=axes[1][1])

plt.suptitle(
    'Parametric Model Implementation of the Telco dataset using different models'
)

# Shared axis captions placed in figure coordinates.
fig.text(0.5, 0.04, 'Timeline', ha='center')
fig.text(0.04, 0.5, 'Probability', va='center', rotation='vertical')
plt.savefig('Images/WeiExpLogx.jpeg')
Ejemplo n.º 14
0
def churn_by_gender(model, axes):
    """Fit and plot ``model`` separately for the Male and Female cohorts.

    Reads the module-level ``data``, ``T`` and ``E``. The same model object
    is refitted for each cohort and plotted onto ``axes``, which is passed
    as the ``ax`` argument of ``model.plot`` (a single axes object).
    """
    gender = data['gender']
    is_male = gender == 'Male'
    is_female = gender == 'Female'

    for cohort, label in ((is_male, 'Male'), (is_female, 'Female')):
        model.fit(T[cohort], E[cohort], label=label)
        model.plot(ax=axes)


# Draw the gender comparison once per parametric family, in the order
# lognormal, Weibull, log-logistic, exponential, one subplot each.
for fitter_cls, panel in (
        (LogNormalFitter, axes[0][0]),
        (WeibullFitter, axes[0][1]),
        (LogLogisticFitter, axes[1][0]),
        (ExponentialFitter, axes[1][1]),
):
    churn_by_gender(fitter_cls(), panel)

# Function for adding subtitles and labels
plot_details('Gender', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)
Ejemplo n.º 15
0
    model = model  # instantiate the class to create an object for choosen model

    # Three cohorts are compared on the basis of the contract
    groups = data['Contract']
    x1 = (groups == 'Month-to-month')
    x2 = (groups == 'Two year')
    x3 = (groups == 'One year')

    model.fit(T[x1], E[x1], label='Month-to-month')
    ax = model.plot(ax=axes)

    model.fit(T[x2], E[x2], label='Two year')
    ax1 = model.plot(ax=axes)
    ac1 = model.plot

    model.fit(T[x3], E[x3], label='One year')
    model.plot(ax=axes)


# Draw the contract comparison once per parametric family, in the order
# lognormal, Weibull, log-logistic, exponential, one subplot each.
for fitter_cls, panel in (
        (LogNormalFitter, axes[0][0]),
        (WeibullFitter, axes[0][1]),
        (LogLogisticFitter, axes[1][0]),
        (ExponentialFitter, axes[1][1]),
):
    churn_by_contract(fitter_cls(), panel)

# Function for adding subtitles and labels
plot_details('Contract', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)
                       LogLogisticFitter)

import pandas as pd
# Load the Telco churn data and keep only customers with a positive tenure.
# ``.copy()`` makes the filtered frame independent, so the column assignment
# below does not write into a slice of the original frame
# (pandas SettingWithCopyWarning).
data = pd.read_csv('Dataset/telco_customer.csv')
data['tenure'] = pd.to_numeric(data['tenure'])
data = data[data['tenure'] > 0].copy()

# Replace Yes and No in the Churn column with 1 and 0: 1 for the event and
# 0 for the censored data.
data['Churn'] = data['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

T = data['tenure']
E = data['Churn']

# Fit one parametric model per family on the full data set.
wbf = WeibullFitter().fit(T, E, label='WeibullFitter')
ef = ExponentialFitter().fit(T, E, label='ExponentialFitter')
lnf = LogNormalFitter().fit(T, E, label='LogNormalFitter')
llf = LogLogisticFitter().fit(T, E, label='LogLogisticFitter')

# One survival-function plot per model, one subplot each.
wbf.plot_survival_function(ax=axes[0][0])
ef.plot_survival_function(ax=axes[0][1])
lnf.plot_survival_function(ax=axes[1][0])
llf.plot_survival_function(ax=axes[1][1])

plt.suptitle(
    'Implementation of  Paramteric Models to create survival functions on the teleco dataset'
)

# Shared axis captions placed in figure coordinates.
fig.text(0.5, 0.04, 'Timeline', ha='center')
fig.text(0.04, 0.5, 'Probability', va='center', rotation='vertical')
plt.savefig('Images/SurvivalFunctions.jpeg')
Ejemplo n.º 17
0
def test_find_best_parametric_model_can_accept_other_models():
    """Smoke test: ``additional_models`` is accepted without raising.

    The call must complete and return a pair; the returned values
    themselves are not inspected.
    """
    durations = np.random.exponential(2, 1000)
    extra_models = [ExponentialFitter(), ExponentialFitter()]
    best_model, best_score = utils.find_best_parametric_model(
        durations, additional_models=extra_models)
    assert True
Ejemplo n.º 18
0
def churn_by_partner(model, axes):
    """Fit and plot ``model`` separately for customers without/with a partner.

    Reads the module-level ``data``, ``T`` and ``E``. The same model object
    is refitted for each cohort and plotted onto ``axes``, which is passed
    as the ``ax`` argument of ``model.plot`` (a single axes object).
    """
    partner = data['Partner']
    without_partner = partner == 'No'
    with_partner = partner == 'Yes'

    cohorts = (
        (without_partner, 'Do not have a partner'),
        (with_partner, 'Have a partner'),
    )
    for cohort, label in cohorts:
        model.fit(T[cohort], E[cohort], label=label)
        model.plot(ax=axes)


# Draw the partner comparison once per parametric family, in the order
# lognormal, Weibull, log-logistic, exponential, one subplot each.
for fitter_cls, panel in (
        (LogNormalFitter, axes[0][0]),
        (WeibullFitter, axes[0][1]),
        (LogLogisticFitter, axes[1][0]),
        (ExponentialFitter, axes[1][1]),
):
    churn_by_partner(fitter_cls(), panel)

# Function for adding subtitles and labels
plot_details('Partner', axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1], fig)