Example #1
0
    def test_marginal_kaplan_meier_curves(self):
        """Check MarginalSurvival curves against plain lifelines KM fits.

        Three sets of curves are built from the fixture data (``self.X``,
        ``self.a``, ``self.t``, ``self.y``) and asserted to be equal:
        MarginalSurvival with ``survival_model=None``, MarginalSurvival with a
        ``lifelines.KaplanMeierFitter``, and lifelines fitted directly on each
        treatment arm (``a == 0`` and ``a == 1``).
        """

        def population_curves(survival_model):
            # Fit a MarginalSurvival estimator and return its population curves.
            estimator = MarginalSurvival(survival_model=survival_model)
            estimator.fit(self.X, self.a)
            return estimator.estimate_population_outcome(
                self.X, self.a, self.t, self.y)

        curves_default = population_curves(None)
        curves_with_km = population_curves(lifelines.KaplanMeierFitter())

        # Reference curves: one independent lifelines fit per treatment arm.
        fitter_by_arm = {}
        for arm in (0, 1):
            in_arm = self.a == arm
            fitter = lifelines.KaplanMeierFitter()
            fitter.fit(durations=self.t[in_arm],
                       event_observed=self.y[in_arm])
            fitter_by_arm[arm] = fitter

        # Evaluate both arms on the sorted unique observed times.
        timeline = sorted(self.t.unique())
        expected = pd.DataFrame({
            arm: fitter.predict(timeline)
            for arm, fitter in fitter_by_arm.items()
        })
        expected.columns.name = 'a'
        expected.index.name = 't'

        pd.testing.assert_frame_equal(curves_default, curves_with_km)
        pd.testing.assert_frame_equal(curves_default, expected)
Example #2
0
    def test_kmf_add_at_risk_counts_with_custom_subplot(self, block, kmf):
        """Plot KM curves with at-risk tables on axes taken from a GridSpec.

        Scenario taken from
        https://github.com/CamDavidsonPilon/lifelines/issues/991#issuecomment-614427882

        ``block`` is forwarded to ``plt.show``; ``kmf`` is accepted but not
        used in this body.
        """
        import lifelines
        import matplotlib as mpl
        from lifelines.datasets import load_waltons

        plt = self.plt
        waltons = load_waltons()
        is_control = waltons["group"] == "control"
        control_rows = waltons.loc[is_control]
        exp_rows = waltons.loc[~is_control]

        n_rows = 3
        fig_height = 4 * n_rows
        _fig = plt.figure(figsize=(6, fig_height), dpi=100)
        grid = mpl.gridspec.GridSpec(n_rows, 1)

        # Every grid row gets the same pair of curves plus an at-risk table.
        for row in range(n_rows):
            axis = plt.subplot(grid[row, 0])

            control_fitter = lifelines.KaplanMeierFitter()
            control_fitter.fit(control_rows["T"], control_rows["E"], label="control")
            axis = control_fitter.plot(ax=axis)

            exp_fitter = lifelines.KaplanMeierFitter()
            exp_fitter.fit(exp_rows["T"], exp_rows["E"], label="exp")
            axis = exp_fitter.plot(ax=axis)

            axis = lifelines.plotting.add_at_risk_counts(
                exp_fitter, control_fitter, ax=axis)

        plt.subplots_adjust(hspace=0.6)
        plt.title("test_kmf_add_at_risk_counts_with_custom_subplot")
        plt.show(block=block)
Example #3
0
def _km_survival_at(fitter, frame, dur_var, orig_var, time_val):
    """Fit ``fitter`` on ``frame`` and return its survival estimate at ``time_val``."""
    fitter.fit(frame[dur_var], frame[orig_var])
    return list(fitter.survival_function_at_times(times=[time_val]))[0]


def censored_roc(data, pred_var, time_var, orig_var, dur_var, time_val):
    """Build a ROC curve at one time point from Kaplan-Meier estimates.

    Rows of ``data`` whose ``time_var`` equals ``time_val`` are selected. For
    each candidate threshold on ``pred_var`` (unique values rounded to three
    decimals, taken in descending order, excluding the largest and smallest),
    Kaplan-Meier survival at ``time_val`` is estimated separately for rows
    above and not above the threshold, and combined with the overall survival
    into a sensitivity/specificity pair.

    Parameters
    ----------
    data : pd.DataFrame
        Input rows.
    pred_var : str
        Column holding the prediction score that is thresholded.
    time_var : str
        Column compared against ``time_val`` to select rows.
    orig_var : str
        Column passed to the fitter as the event-observed indicator.
    dur_var : str
        Column passed to the fitter as the duration.
    time_val :
        Time at which survival is evaluated; also stored in the ``Time``
        column of the result.

    Returns
    -------
    (pd.DataFrame, float)
        The ROC table (columns ``FPR``, ``TPR``, ``THRESH``, ``Time``) after
        the rows flagged by ``check_min`` have been dropped, and the area
        under it. The area is ``-1`` when ``metrics.auc`` raises.
    """
    subset = data[data[time_var] == time_val]

    # KM for the full sample
    sf_full = _km_survival_at(lifelines.KaplanMeierFitter(), subset,
                              dur_var, orig_var, time_val)

    # Reduced set of potential thresholds, largest first
    thresh = pd.unique(subset[pred_var].round(3))
    thresh.sort()
    thresh = np.flip(thresh)

    # Curve end points (0, 0) and (1, 1) are fixed; only interior thresholds
    # are estimated.
    tpr = [0.0]
    fpr = [0.0]
    km_above = lifelines.KaplanMeierFitter()
    km_below = lifelines.KaplanMeierFitter()
    for tv in thresh[1:-1]:
        above_test = (subset[pred_var] > tv)
        sf_above = _km_survival_at(km_above, subset[above_test],
                                   dur_var, orig_var, time_val)
        sf_below = _km_survival_at(km_below, subset[~above_test],
                                   dur_var, orig_var, time_val)
        # Sensitivity / specificity from the survival estimates
        prop_above = above_test.mean()
        sens = ((1 - sf_above) * prop_above) / (1 - sf_full)
        spec = (sf_below * (1 - prop_above)) / (sf_full)
        tpr.append(sens)
        fpr.append(1 - spec)
    tpr.append(1.0)
    fpr.append(1.0)
    roc_dat = pd.DataFrame(zip(fpr, tpr, thresh),
                           columns=['FPR', 'TPR', 'THRESH'])

    # Drop rows flagged by check_min until none remain.
    # NOTE(review): check_min is defined elsewhere; it is used here as
    # returning (number of offending rows, boolean row mask) -- confirm.
    roc_new = roc_dat
    roc_new['FPR'] = roc_new['FPR'].round(3)
    roc_new['TPR'] = roc_new['TPR'].round(3)
    nonN, any_min = check_min(roc_new)
    while nonN > 0:
        roc_new = roc_new[~any_min].copy()
        nonN, any_min = check_min(roc_new)
    roc_new['Time'] = time_val

    # Best effort: a failed AUC computation is reported as -1. Only ordinary
    # exceptions are caught so Ctrl-C / SystemExit still propagate.
    try:
        auc_stat = metrics.auc(roc_new['FPR'], roc_new['TPR'])
    except Exception:
        auc_stat = -1
    return roc_new, auc_stat
Example #4
0
def SurvivalPlot(surv_list,
                 event_list,
                 duration_list,
                 name_list,
                 legend_list,
                 fig=None,
                 is_show_KM=False,
                 store_folder=None):
    """Draw mean predicted survival curves, optionally with KM references.

    For each entry, the row-wise mean of ``surv_df`` is drawn as a step curve
    against ``surv_df.index``. When ``is_show_KM`` is true, a Kaplan-Meier
    curve fitted on the matching durations/events (evaluated on the same
    index) is drawn dashed in the same colour.

    ``fig`` is cleared and reused when given, otherwise a new figure is
    created. Colours come from the module-level ``color_list``.
    ``store_folder`` is accepted but not used in this body. Nothing is
    returned.
    """
    assert len(surv_list) == len(event_list) == len(duration_list)
    fig = plt.figure() if fig is None else fig

    km_fitter = lifelines.KaplanMeierFitter()
    fig.clear()
    axes = fig.add_subplot(111)

    entries = zip(surv_list, event_list, duration_list, name_list, legend_list)
    for idx, (surv_df, event, duration, name, legend) in enumerate(entries):
        if is_show_KM:
            # Non-parametric reference on the same time grid as the prediction
            km_fitter.fit(duration, event, timeline=surv_df.index)
            km_fitter.plot_survival_function(ax=axes,
                                             color=color_list[idx],
                                             linestyle='--',
                                             ci_show=False,
                                             label='{}-KM'.format(name))
        axes.step(surv_df.index,
                  surv_df.values.mean(axis=1),
                  label=legend,
                  color=color_list[idx])

    axes.legend()
    axes.set_ylabel('Survival Function')
    axes.set_xlabel('Time')
Example #5
0
 def all_source_plot(self, **kwargs):
     """
     Fit and plot Kaplan-Meier curves for new and old visitors, using the
     baidutongji all_source dataframe held in ``self.data_frame`` as input.

     Only rows whose ``avg_visit_time`` string consists solely of digits are
     kept. Both curves are drawn on one axes and the figure is saved.

     :param kwargs: must contain ``title`` (plot title) and ``path``
         (destination passed to ``plt.savefig``)
     :return: None
     """
     frame = self.data_frame
     title = kwargs['title']
     path = kwargs['path']

     def digit_rows(visitor_kind):
         # Rows of one visitor kind whose avg_visit_time is all digits.
         rows = frame[frame['visitor'] == visitor_kind]
         keep = rows.loc[:, 'avg_visit_time'].str.isdigit()
         return rows[keep].copy()

     old_cleaned = digit_rows('old')
     new_cleaned = digit_rows('new')

     fitter = lifelines.KaplanMeierFitter()
     fig, ax = plt.subplots()
     # The same fitter is refitted for each curve; new visitors are drawn first.
     for rows, curve_label in ((new_cleaned, "New Visitors"),
                               (old_cleaned, "Old Visitors")):
         fitter.fit(rows['avg_visit_time'], label=curve_label)
         fitter.plot(ax=ax, show_censors=True)

     plt.ylim(0, 1)
     plt.title(title)
     plt.tight_layout()
     plt.savefig(path)
     plt.close('all')
Example #6
0
def plot_detect(filename, name, event_id, md):
    """
    Plot the distribution of times at which infection is first detected.

    Detection times for ``event_id`` are read from ``filename`` via
    ``dataformat.first_of_event``. Runs in which the event was not detected
    are added as censored observations placed one time unit after the
    latest detection. The Kaplan-Meier curve is saved as
    ``"<name>_survival.pdf"`` through ``SaveFig``.

    If the event never happened the process exits with status 0.
    """
    detection_times, none_detected = dataformat.first_of_event(
        filename, event_id)
    logger.info("Detected {0} times out of {1}".format(
        len(detection_times),
        len(detection_times) + none_detected))
    # Compare by value: `is 0` tests object identity and only works by
    # accident of small-integer caching.
    if len(detection_times) == 0:
        logger.info("The event {0} did not happen.".format(event_id))
        sys.exit(0)
    kmf = lifelines.KaplanMeierFitter()
    # Undetected runs are censored just after the last observed detection.
    last = max(detection_times) + 1
    detection = np.hstack([
        np.array(detection_times), last * np.ones(
            (none_detected, ), dtype=np.double)
    ])
    # 1 = detection observed, 0 = censored
    P = [1] * len(detection_times) + [0] * none_detected
    kmf.fit(detection, P, label=name)
    ax = kmf.plot()
    ax.set_title(name)
    ax.set_xlabel("Days")
    ax.set_ylabel("Survival")
    SaveFig("{0}_survival.pdf".format(name), md)
    plt.clf()
    plt.close()
Example #7
0
def estimate_kaplan_meier(y, survival,
    duration_column='duration', observed_column='observed'):
    """Estimate one Kaplan-Meier survival curve per group in ``y``.

    Parameters
    ----------
    y:                  pd.Series of group labels (clusters, subtypes); the
                        index holds the sample names
    survival:           pd.DataFrame indexed by sample name, with a duration
                        column (survival time for each patient) and a column
                        saying whether the death was observed. If the death
                        was not observed (censored), the duration is the time
                        of the last followup.
    duration_column:    name of the duration column in ``survival``
    observed_column:    name of the column in ``survival`` with True/False
                        values for whether death was observed or not

    Returns
    -------
    km_estimates:       pd.DataFrame, index is the timeline, one column per
                        group (in sorted group order) holding that group's
                        Kaplan-Meier survival function. Gaps left by joining
                        the per-group timelines are filled by interpolation.

    Raises
    ------
    ImportError         if ``lifelines`` is not installed
    """
    try:
        import lifelines
    except ImportError:
        raise ImportError('The module ``lifelines`` was not found. It is required for this functionality. You may install it using `pip install lifelines`.')

    fitter = lifelines.KaplanMeierFitter()
    known_samples = set(survival.index)
    curves = {}
    for label in y.unique():
        # Only samples present in both y and survival can be used.
        members = list(set(y[y == label].index) & known_samples)
        rows = survival.loc[members]
        fitter.fit(rows[duration_column], rows[observed_column], label=label)
        curves[label] = fitter.survival_function_

    ordered = [curves[label] for label in sorted(y.unique())]
    return pd.concat(ordered, axis=1).interpolate()
Example #8
0
    def run_survival(data, gene_name):
        """Plot KM curves per (gender, expression group) and save the figure.

        ``data`` must provide the columns ``gender``, ``time``,
        ``death_status`` and ``<gene_name>_expression``. One curve is drawn
        for every (gender, expression group) combination present in the data;
        missing combinations are skipped. A multivariate log-rank test across
        the combined expression+gender label is run, and the figure is saved
        as ``<gene_name>_<is_significant>.png`` inside
        ``CD/cohort/results`` (created if missing).

        Side effects: adds a ``stat_col`` column to ``data`` and changes the
        process working directory.
        """
        ay = plt.subplot(111)
        ay.set_title(gene_name)

        gene = gene_name + '_expression'
        genders = ['male', 'female']
        gene_groups = ['Underexpressed', 'Overexpressed', 'Normal_expression']

        kmf = lifelines.KaplanMeierFitter()

        grouped_data = data.groupby(['gender', gene])

        for gene_group in gene_groups:
            for gender in genders:
                try:
                    d = grouped_data.get_group((gender, gene_group))
                except KeyError:
                    # This gender/expression combination has no patients.
                    continue

                kaplan_meier_time = pd.to_numeric(d['time'])
                kaplan_meier_event = d['death_status']

                # Legend entry: [gender, group, [number of patients]]
                label = str([gender, gene_group, [len(d)]])

                kmf.fit(kaplan_meier_time, kaplan_meier_event, label=label)
                kmf.plot(ax=ay, show_censors=True, ci_show=False)

        # DataFrame.as_matrix was removed in pandas 1.0; selecting with a
        # list of columns and taking .values yields the same (n, 1) arrays.
        event_durations = data[['time']].values

        data['stat_col'] = data[gene] + data['gender']
        group_labels = data[['stat_col']].values

        event = numpy.array(data[['death_status']].values)

        result = multivariate_logrank_test(event_durations, group_labels,
                                           event, 0.85)

        os.chdir(str(CD + '/' + cohort))
        if not os.path.exists(CD + '/' + cohort + '/results'):
            os.makedirs('results')
        os.chdir(str(CD + '/' + cohort + '/results'))

        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
        plt.savefig(str(gene_name + '_' + str(result.is_significant) + '.png'),
                    bbox_inches='tight')

        plt.close()
Example #9
0
def usage_plot(cohorts):
    """
    Plot one Kaplan-Meier curve per cohort and save it as ``test_kmf.png``.

    For every cohort, each non-null entry in one of the survey columns
    contributes one observed event whose time is the row's index value.
    All cohorts are drawn on the same axes.
    """
    fitter = lifelines.KaplanMeierFitter()
    survey_columns = ['Anxiety', 'Mood', 'Psychosis', 'Sleep', 'Social', 'Medication']
    ax = None
    for cohort in cohorts:
        times = []
        observed = []
        for _, day_frame in cohort.groupby(cohort.index):
            for column in survey_columns:
                answered = day_frame[day_frame[column].notnull()][column].index
                times.extend(answered.tolist())
                observed.extend(len(answered) * [1])

        fitter.fit(times, observed)
        # The first curve creates the axes; later curves reuse them.
        if ax is None:
            ax = fitter.plot()
        else:
            ax = fitter.plot(ax=ax)

    plt.xlabel('Day')
    plt.ylabel('Percentage of surveys remaining')
    plt.savefig('test_kmf.png')
Example #10
0
def plot_km_curve(df_tune, df_test):
    """Plot a KM curve for each risk group of `df_test` and return the figure.

    Risk groups are assigned by `discretize`, which receives the risk scores
    of `df_tune` and of `df_test`. The assignment is stored in a new
    `risk_group` column on `df_test` (the frame is modified in place).
    Groups with no rows are skipped. At-risk counts for the plotted groups
    are added below the axes.

    Args:
      df_tune: a pd.DataFrame of tune set data.
      df_test: a pd.DataFrame of test set data.

    Returns:
      The matplotlib figure holding the curves.
    """
    # Compute risk groups
    df_test['risk_group'] = discretize(df_tune[RISK_SCORE],
                                       df_test[RISK_SCORE])

    # One fitter per non-empty group, all drawn on the same axes
    figure, axes = plt.subplots()
    fitted = []
    for group in ('Low Risk', 'Medium Risk', 'High Risk'):
        members = df_test.query(f"risk_group=='{group}'")
        if not members.empty:
            fitter = lifelines.KaplanMeierFitter()
            fitter.fit(members[TIME],
                       event_observed=members[OBSERVED],
                       label=group)
            fitter.plot(ax=axes)
            fitted.append(fitter)
    lifelines.plotting.add_at_risk_counts(*fitted, ax=axes)
    return figure
Example #11
0
def event_table(player_name, style):
    """Return the Kaplan-Meier event table of a batsman's innings.

    The innings come from ``runs_df(player_name, style)``; ``Runs`` is used
    as the duration and ``Out`` as the event indicator. A ``Name`` column
    holding ``player_name`` is added to the table.
    """
    innings = runs_df(player_name, style)
    fitter = lifelines.KaplanMeierFitter()
    fitter.fit(innings.Runs, innings.Out)
    table = fitter.event_table
    table['Name'] = player_name
    return table
	def __init__(self, sdataMatrix):
		"""Fit a Kaplan-Meier curve and cache it as a two-column array.

		Column 2 of ``sdataMatrix`` is passed as the durations and column 1
		as the event indicator. ``self._kmf`` stores the timeline in its
		first column and the survival estimate in its second.
		"""
		KM = ll.KaplanMeierFitter()
		kmf = KM.fit(sdataMatrix[:,2], event_observed=sdataMatrix[:,1]).survival_function_
		self._kmf = np.zeros((len(kmf), 2))
		self._kmf[:,0] = kmf.index.values
		# survival_function_ is a one-column frame; flatten it explicitly
		# instead of relying on NumPy to squeeze an (n, 1) value into a column.
		self._kmf[:,1] = kmf.values.ravel()
		self._predict_event = None
Example #13
0
 def fit(self, X, B, T):
     """Fit a Kaplan-Meier curve and store it as event probabilities.

     ``T`` is passed as the durations and ``B`` as the event indicator;
     ``X`` is not used. After the call ``self.ts`` holds the timeline,
     ``self.ps`` holds one minus the survival estimate, and ``self.ps_hi`` /
     ``self.ps_lo`` hold one minus the lower / upper 95% confidence bound.
     """
     km = lifelines.KaplanMeierFitter()
     km.fit(T, event_observed=B)

     survival = km.survival_function_
     bounds = km.confidence_interval_

     self.ts = survival.index.values
     self.ps = 1.0 - survival['KM_estimate'].values
     # Complementing swaps the bounds: the lower survival bound becomes the
     # upper bound of the complement.
     self.ps_hi = 1.0 - bounds['KM_estimate_lower_0.95'].values
     self.ps_lo = 1.0 - bounds['KM_estimate_upper_0.95'].values
Example #14
0
def compare_unit_survival(infect0, infect1, unit, traj_cnt0, traj_cnt1,
                          when_max, md):
    """Overlay the unit survival curves of two runs and save the figure.

    The first series is labelled "Continuous" and the second "NAADSM". Both
    are drawn through ``plot_unit_survival`` with the same fitter, the second
    on the axes returned for the first. The result is saved as
    ``"unit_survival<unit>.pdf"`` via ``SaveFig`` and the figure is closed.
    """
    fitter = lifelines.KaplanMeierFitter()
    series = (
        (infect0, traj_cnt0, "Continuous"),
        (infect1, traj_cnt1, "NAADSM"),
    )
    axes = None
    for position, (infections, trajectory_count, series_label) in enumerate(series):
        drawn = plot_unit_survival(fitter, axes, infections, trajectory_count,
                                   when_max, series_label)
        if position == 0:
            # Later series are drawn on whatever the first call returned.
            axes = drawn
    SaveFig("unit_survival{0}.pdf".format(unit), md)
    plt.clf()
    plt.close()
Example #15
0
def categorical_km_curves(feature,
                          t='hour',
                          event='survive',
                          df=data,
                          ax=None):
    """Plot one Kaplan-Meier curve per category of ``feature``.

    Parameters
    ----------
    feature : str
        Column of ``df`` whose unique values define the groups. Groups are
        plotted in descending sorted order.
    t : str
        Column holding the durations.
    event : str
        Column from which the event indicator is derived: a row counts as an
        observed event when this column equals 0.
    df : DataFrame
        Data to plot. Defaults to the module-level ``data`` as bound when
        this function was defined.
    ax : matplotlib axes, optional
        Axes passed to every ``plot`` call.

    Each curve is coloured with ``colours[category]`` from the module-level
    ``colours`` mapping. Nothing is returned.
    """
    # Use the frame that was passed in; previously the global `data` was read
    # regardless of the `df` argument.
    for cat in sorted(df[feature].unique(), reverse=True):
        idx = df[feature] == cat
        kmf = lifelines.KaplanMeierFitter()
        kmf.fit(df[idx][t], event_observed=df[idx][event] == 0, label=cat)
        kmf.plot(ax=ax, label=cat, ci_show=False, c=colours[cat])
Example #16
0
def km_median(values, censored, censorship='upper'):
    """Return the Kaplan-Meier median of ``values``.

    ``censorship='upper'`` fits with ``fit_left_censoring``;
    ``censorship='lower'`` fits with the regular ``fit``. In both cases
    ``censored`` is passed as the fitter's second positional argument.
    NOTE(review): that argument is lifelines' event-observed indicator --
    confirm the caller's flag convention matches.

    Any other ``censorship`` value prints ``error`` and returns ``None``.
    """
    fitter = lifelines.KaplanMeierFitter()
    if censorship not in ('upper', 'lower'):
        print('error')
        return None

    if censorship == 'upper':
        fitter.fit_left_censoring(values, censored)
    else:
        fitter.fit(values, censored)
    return fitter.median_survival_time_
Example #17
0
def test_kaplan_meier_against_lifelines():
    """Compare surpyval's Kaplan-Meier estimate with lifelines on random data.

    One hundred random Weibull samples (random size and parameters, with a
    random constant integer weight per sample) are fitted with both
    libraries, and the survival estimates at 100 random points must agree
    to a relative tolerance of 1e-15.
    """
    kmf = lifelines.KaplanMeierFitter()
    param_bounds = ((1, 100), (0.5, 20))
    for _ in range(100):
        test_params = np.array(
            [np.random.uniform(*bounds) for bounds in param_bounds])
        # Draw a scalar: int() on a size-1 array is deprecated in NumPy.
        sample_size = int(np.random.uniform(2, 1000))
        x = surpyval.Weibull.random(sample_size, *test_params)
        n = np.ones_like(x) * int(np.random.uniform(1, 5))
        x_test = np.random.uniform(x.min()/2, x.max()*2, 100)
        ll_est = kmf.fit(x, weights=n).predict(x_test).values
        surp_est = surpyval.KaplanMeier.fit(x, n=n).sf(x_test)
        # The third positional argument of allclose is rtol; name it.
        if not np.allclose(ll_est, surp_est, rtol=1e-15):
            raise AssertionError('Kaplan-Meier different to lifelines?!')
Example #18
0
def kaplan_plot(dataframe, group_col=None, event_col='TTE',
                observed_col='OBS', xlim=None, ax=None):
    """
    Creates a Kaplan-Meier plot for each group in `group_col`

    Parameters
    ----------
    dataframe : DataFrame
        Data to use for plots
    group_col : str, optional
        Column whose unique values define the groups to plot. When None, a
        single curve is drawn for the whole dataframe.
    event_col : str, optional
        Name of the time to event column
    observed_col : str, optional
        Name of the event censoring column. 1 = event observed, 0 otherwise
    xlim : int, optional
        Right limit of the x-axis; the left limit is set to 0
    ax : axis, optional
        If adding to an existing plot, set this to the existing ax value.
        Honoured both with and without `group_col`.

    Returns
    -------
    None
        The Kaplan-Meier estimated survival curve(s) are drawn and the
        current axes get the title 'Estimated Survival Curve[ by <group>]'
    """

    kmf = lifelines.KaplanMeierFitter()

    def draw(target_ax):
        # Plot the most recent fit, creating axes only when none exist yet.
        if target_ax is None:
            return kmf.plot()
        return kmf.plot(ax=target_ax)

    if group_col is not None:
        add = ' by ' + group_col
        for group in dataframe[group_col].unique():
            grp = (dataframe[group_col] == group)
            kmf.fit(dataframe.loc[grp, event_col],
                    event_observed=dataframe.loc[grp, observed_col],
                    label=group)
            ax = draw(ax)
    else:
        add = ''
        kmf.fit(dataframe[event_col], event_observed=dataframe[observed_col])
        # Previously a caller-supplied `ax` was ignored in this branch.
        ax = draw(ax)

    if xlim is not None:
        ax.set_xlim(left=0, right=xlim)

    plt.title('Estimated Survival Curve' + add)
Example #19
0
def basic_survival(
        df,
        save_path='/Users/ian/Documents/exploratory/bridges/reports/figures/basic_survival.png'
):
    """Plot the pooled Kaplan-Meier survival function and save it.

    Parameters
    ----------
    df : DataFrame
        Must provide a ``duration`` column (durations) and a
        ``degraded_obs`` column (event-observed indicator).
    save_path : str, optional
        Where the figure is written. Defaults to the location this function
        originally hard-coded, so existing callers are unaffected.

    The figure is saved, shown with ``plt.show()`` and then closed.
    """
    T = df["duration"]
    E = df["degraded_obs"]
    kmf = ll.KaplanMeierFitter()
    model = kmf.fit(durations=T, event_observed=E)
    model.plot(figsize=(9, 8))
    plt.title(
        'Survival Function of Bridges over Time: Pooled Data Across all Bridges',
        fontsize=18)
    plt.savefig(save_path)
    plt.show()
    plt.clf()
    plt.close()
Example #20
0
    def sf_KM(self, t_point):
        """Return the Kaplan-Meier survival estimate at time ``t_point``.

        The curve is refitted on every call from ``self._sortedMatrix``
        (column 2 as durations, column 1 as event indicator) and cached in
        ``self._kmf`` as ``[time, survival]`` rows.

        The estimate is a right-continuous step function: the value at the
        latest timeline point not after ``t_point`` is returned. Before the
        first timeline point the result is 1.0; after the last point it is
        the final estimate.
        """
        KM = ll.KaplanMeierFitter()
        kmf = KM.fit(self._sortedMatrix[:,2], event_observed=self._sortedMatrix[:,1]).survival_function_
        self._kmf = np.zeros((len(kmf), 2))
        self._kmf[:, 0] = kmf.index.values
        # survival_function_ is a one-column frame; flatten it explicitly.
        self._kmf[:, 1] = kmf.values.ravel()

        # Number of timeline points that are <= t_point (timeline is sorted).
        reached = int(np.searchsorted(self._kmf[:, 0], t_point, side='right'))
        if reached == 0:
            # Nothing has happened yet (or the table is empty).
            return 1.0
        return self._kmf[reached - 1, 1]

        '''      
Example #21
0
    def test_weighted_kaplan_meier_curves(self):
        """Check WeightedSurvival's default curves against a lifelines KM model.

        Two WeightedSurvival estimators with identically configured
        stabilized IPW weight models are fitted on the fixture data, one with
        ``survival_model=None`` and one with a
        ``lifelines.KaplanMeierFitter``. Their population curves must agree
        to 8 decimals.
        """

        def population_curves(survival_model):
            # Each estimator gets its own freshly built weight model.
            weight_model = IPW(LogisticRegression(max_iter=10000, C=10),
                               use_stabilized=True)
            estimator = WeightedSurvival(weight_model=weight_model,
                                         survival_model=survival_model)
            estimator.fit(self.X, self.a)
            return estimator.estimate_population_outcome(
                self.X, self.a, self.t, self.y)

        curves_default = population_curves(None)
        curves_with_km = population_curves(lifelines.KaplanMeierFitter())

        np.testing.assert_array_almost_equal(curves_default,
                                             curves_with_km,
                                             decimal=8)
Example #22
0
def disease_comparison(times0, times1, name, md):
    """Overlay Kaplan-Meier curves of two sets of times and save the figure.

    ``times0`` is labelled "Continuous" and ``times1`` "NAADSM"; every time
    is treated as an observed event. Entries of ``times0`` greater than 50
    are overwritten with 50 **in place**, so the caller's sequence is
    modified; ``times1`` is left as given. The figure is saved as
    ``"disease_comparison<name>.pdf"`` via ``SaveFig``.
    """
    logger.debug("times0 len {0} times1 len {1}".format(
        len(times0), len(times1)))
    plt.clf()
    figure = plt.figure(1, figsize=(4, 3))
    axes = figure.add_subplot(111)
    fitter = lifelines.KaplanMeierFitter()

    logger.info("Truncating times at 50.")
    for position in range(len(times0)):
        if times0[position] > 50:
            times0[position] = 50

    # All events observed: indicator of ones for each series
    fitter.fit(times0, [1] * len(times0), label="Continuous")
    axes = fitter.plot(ax=axes)
    axes.set_title(name)

    fitter.fit(times1, [1] * len(times1), label="NAADSM")
    fitter.plot(ax=axes)

    plt.tight_layout()
    SaveFig("disease_comparison{0}.pdf".format(name), md)
def stratifiedSurvival(t,
                       eventTime,
                       eventIndicator=None,
                       followupTime=None,
                       group=None):
    """Plot Kaplan-Meier curves, one per group, with at-risk counts.

    Parameters
    ----------
    t : pd.DataFrame
        Input table.
    eventTime : str
        Column holding the time of the event.
    eventIndicator : str, optional
        Column holding the event-observed indicator. When omitted, an event
        counts as observed wherever ``eventTime`` is not null, and missing
        event times are replaced by the value in ``followupTime``.
    followupTime : str, optional
        Column holding the follow-up time; read only when ``eventIndicator``
        is omitted.
    group : str, optional
        Column defining the strata. When omitted, all rows form a single
        stratum labelled 'Population'.

    The curves are drawn on a new figure; nothing is returned.
    """
    import matplotlib.pyplot as plt
    import lifelines as lf
    from lifelines.plotting import add_at_risk_counts
    import pandas as pd

    tm = t[eventTime].copy()

    if group is None:
        grp = pd.Series('Population', index=t.index)
    else:
        grp = t[group]

    if eventIndicator is None:
        # Derive the indicator: a missing event time means censored at
        # follow-up.
        ev = ~t[eventTime].isnull()
        tm[tm.isnull()] = t.loc[tm.isnull(), followupTime]
    else:
        # Previously `ev` was never assigned on this path (NameError below).
        ev = t[eventIndicator]

    # Kaplan-Meier curves stratified by group, in order of first appearance
    # so the curve order is stable between runs.
    fitters = []
    fig, ax = plt.subplots()
    for g in grp.unique():
        in_group = grp == g
        # One fitter per stratum: add_at_risk_counts needs each fit kept.
        kmf = lf.KaplanMeierFitter()
        kmf.fit(tm[in_group], ev[in_group], label=g)
        kmf.plot(ax=ax)
        fitters.append(kmf)

    add_at_risk_counts(*fitters, ax=ax)

    plt.legend(loc='lower left')
    plt.ylim([0, 1])
    plt.xlabel('Time (years)')
    plt.ylabel('Survival')
    plt.title('Kaplan-Meier survival curve')
Example #24
0
def make_km(tv_data, label='Untitled', endpoint=700):
    """Build a fitted Kaplan-Meier estimator from tumour volume measurements.

    The measurements are first converted by ``volume_to_survival``; the
    ``Time`` and ``Observed`` columns of its result are then fitted.

    Arguments:

        tv_data  - a pandas data frame of volume measurements
                   with individuals in columns and timepoints
                   as rows.  Individuals are removed from study
                   at the first NaN timepoint

        label    - a title for this grouping

        endpoint - the volume at which the endpoint is reached
                   Default: 700

    Returns:

        a fitted lifelines KaplanMeierFitter object
    """
    survival_table = volume_to_survival(tv_data, endpoint=endpoint)
    fitter = lifelines.KaplanMeierFitter()
    return fitter.fit(survival_table['Time'],
                      event_observed=survival_table['Observed'],
                      label=label)
Example #25
0
    # Add the time-until-refactor column: for every row, the gap between its
    # timestamp and the next refactor of the same file.
    # NOTE(review): `fch` and `threshold` are defined outside this fragment.
    fch['time_until_refactor'] = 0
    for idx, row in fch.iterrows():
        ts = None
        # Later rows for the same filename that are flagged as a refactor
        chunk = fch[(fch['timestamp'] > row.timestamp) & (fch['refactor'] == 1)
                    & (fch['filename'] == row.filename)]
        if chunk.shape[0] > 0:
            # A refactor follows: the event is observed at the earliest one
            ts = chunk['timestamp'].min()
            # NOTE(review): DataFrame.set_value was deprecated in pandas 0.21
            # and removed in 1.0; `fch.at[idx, col] = value` is the
            # replacement.
            fch.set_value(idx, 'observed', True)
        else:
            # No later refactor: use the latest timestamp in the data
            # (the row is left without 'observed' being set to True)
            ts = fch['timestamp'].max()
        fch.set_value(idx, 'time_until_refactor', ts - row.timestamp)

    # Plot one survival curve per file owner that has more than 20 rows
    fig = plt.figure()
    ax = plt.subplot(111)
    for filename in set(fch['file_owner'].values):
        sample = fch[fch['file_owner'] == filename]
        if sample.shape[0] > 20:
            print('Evaluating %s' % (filename, ))
            kmf = lifelines.KaplanMeierFitter()
            # Evaluate every curve on the same 0..364 timeline
            # (presumably days, per the x-axis label -- confirm the unit of
            # 'time_until_refactor')
            kmf.fit(sample['time_until_refactor'].values,
                    event_observed=sample['observed'],
                    timeline=list(range(365)),
                    label=filename)
            ax = kmf.survival_function_.plot(ax=ax)

    plt.title('Survival function of file owners (thres=%s)' % (threshold, ))
    plt.xlabel('Lifetime (days)')
    plt.show()
Example #26
0
    def hazard2KMCurve(data, subtype):
        """Plot ground-truth and predicted-grade KM curves and return the figure.

        Predicted grades are derived from the ``Hazard`` column using its
        33rd and 66th percentiles (via ``hazard2grade``) and inserted into
        ``data`` as a new first column ``grade_pred`` (``data`` is modified in
        place). For each grade, the ground-truth curve (``Grade`` column,
        dashed) and the predicted curve (``grade_pred`` column, solid) are
        drawn in the same colour. The third grade is skipped when ``subtype``
        is 'ODG'. The legend is kept only when ``subtype`` is 'idhwt_ATC'.

        The predicted-curve labels use the enclosing scope's ``model`` name.
        """
        cutoffs = np.percentile(data['Hazard'], [33, 66])
        if cutoffs[0] == cutoffs[1]:
            cutoffs[0] = 2.99997
        data.insert(0, 'grade_pred',
                    [hazard2grade(hazard, cutoffs) for hazard in data['Hazard']])
        kmf_pred = lifelines.KaplanMeierFitter()
        kmf_gt = lifelines.KaplanMeierFitter()

        def get_name(model):
            # Map a model identifier to its display name by substring match;
            # the first matching key (in insertion order) wins.
            mode2name = {
                'pathgraphomic': 'Pathomic F.',
                'pathomic': 'Pathomic F.',
                'graphomic': 'Pathomic F.',
                'path': 'Histology CNN',
                'graph': 'Histology GCN',
                'omic': 'Genomic SNN'
            }
            for mode, display in mode2name.items():
                if mode in model:
                    return display
            return 'N/A'

        fig = plt.figure(figsize=(10, 10), dpi=600)
        ax = plt.subplot()
        censor_style = {'ms': 20, 'marker': '+'}
        shared = {
            'ax': ax,
            'show_censors': True,
            'ci_show': False,
            'censor_styles': censor_style,
        }

        # (grade, ground-truth label, predicted label template, colour,
        #  extra plot keywords). Only the first grade sets markerfacecolor.
        curve_specs = [
            (0, "Grade II", "%s (Low)", 'g', {'markerfacecolor': 'black'}),
            (1, "Grade III", "%s (Mid)", 'b', {}),
        ]
        if subtype != 'ODG':
            curve_specs.append((2, "Grade IV", "%s (High)", 'r', {}))

        for grade, gt_label, pred_template, colour, extra in curve_specs:
            # Ground truth: thinner dashed line
            rows = data[data['Grade'] == grade]
            kmf_gt.fit(rows['Survival months'] / 365,
                       rows['censored'],
                       label=gt_label)
            kmf_gt.plot(c=colour, linewidth=3, ls='--', **shared, **extra)

            # Prediction: thicker solid line
            rows = data[data['grade_pred'] == grade]
            kmf_pred.fit(rows['Survival months'] / 365,
                         rows['censored'],
                         label=pred_template % get_name(model))
            kmf_pred.plot(c=colour, linewidth=4, ls='-', **shared, **extra)

        ax.set_xlabel('')
        ax.set_ylim(0, 1)
        ax.set_yticks(np.arange(0, 1.001, 0.5))

        ax.tick_params(axis='both', which='major', labelsize=40)
        legend_font = font_manager.FontProperties(family='Arial',
                                                  style='normal',
                                                  size=32)
        plt.legend(fontsize=32, prop=legend_font)
        if subtype != 'idhwt_ATC':
            ax.get_legend().remove()
        return fig
Example #27
0
def compare_interior_kaplan(obs,
                            var_pair,
                            rescale_kaplan=False,
                            rescale_interior=False):
    """
    Interior vs kaplan est for `multi_locus_analysis.finite_window.ab_window`.

    For each variable in `var_pair`, draw on its own axes: the Kaplan-Meier
    cumulative density (with its confidence band) fitted on the waiting times
    of the matching state, the variable's analytical CDF, and the empirical
    CDF (eCDF) of the "interior" times. Waiting times whose `wait_type` is
    not 'interior' are treated as censored.

    `rescale_kaplan` divides the Kaplan-Meier curve so that its last value
    equals the analytical CDF at the largest window size; `rescale_interior`
    multiplies the eCDF by the analytical CDF at its last x value.

    Returns the figure produced by `_get_axes`.
    """
    fitters = {
        state_name: lifelines.KaplanMeierFitter().fit(
            group['wait_time'].values,
            event_observed=(group['wait_type'] == 'interior').values,
            label=r'Meier-Kaplan Estimator, $\pm$95% conf int')
        for state_name, group in obs.groupby('state')
    }

    fig, axs = _get_axes(var_pair,
                         name='two-by-half column, four legend entries above')
    window = obs.window_size.max()
    for var in var_pair:
        ax = axs[var.name]
        fitter = fitters[var.name]

        # Kaplan-Meier CDF and its confidence band
        km_times = fitter.cumulative_density_.index.values
        km_values = fitter.cumulative_density_.values
        lower, upper = fitter.confidence_interval_cumulative_density_.values.T
        km_scale = km_values[-1] / var.cdf(window) if rescale_kaplan else 1
        km_line = ax.plot(km_times,
                          km_values / km_scale,
                          color=km_color,
                          label='Kaplan-Meier')[0]
        ax.fill_between(km_times,
                        lower / km_scale,
                        upper / km_scale,
                        color=km_color,
                        alpha=0.4)

        # Analytical distribution on a regular grid over the window
        grid = np.linspace(0, window, 101)
        actual_line, = ax.plot(grid,
                               var.cdf(grid),
                               color='k',
                               label='Actual CDF')

        # Empirical distribution of the "interior" times
        interior, _ = _int_win_from_obs(obs, var.name)
        ecdf_x, ecdf_y = fw.ecdf(interior, pad_left_at_x=0)
        interior_scale = 1 / var.cdf(ecdf_x[-1]) if rescale_interior else 1
        interior_line, = ax.plot(ecdf_x,
                                 ecdf_y / interior_scale,
                                 c=var.color,
                                 ls=interior_linestyle,
                                 label='"Interior" eCDF')

        # Axis cosmetics
        ax.set_xlim([0, window])
        ax.set_ylim([0, 1])
        ax.set_xlabel('time')
        ax.set_ylabel(r'Cumulative probability')

        # Legend sits just above the axes (2% of the axes height) and spans
        # the full axes width.
        ax.legend(
            title=var.pretty_name,
            handles=[interior_line, km_line, actual_line],
            bbox_to_anchor=(0., 1.02, 1., .102),
            loc='lower left',
            ncol=1,
            mode="expand",
            borderaxespad=0.)
    return fig
Example #28
0
def plot_km_recs_antirecs(T, E, recommendation_idx, fig=None, ax=None, xlim=None, ylim=None, show_risk=False):
    """
    Plot KM curves for (anti)recommendation patients.

    One Kaplan-Meier curve is fitted and drawn for the rows selected by
    ``recommendation_idx`` and one for the remaining rows. A log-rank test
    between the two groups is run, its summary is printed, and the p-value
    is written onto the axes.

    Parameters
    ----------
    T: pandas DataFrame
        It needs to have column 'T'
    E: pandas DataFrame
        It needs to have column 'E'
    recommendation_idx: boolean array
        Array as given by get_recs_antirecs_index. It is True for
        recommendation patients. It must support ``~`` (element-wise
        negation), which is used to select the anti-recommendation group.
    fig: figure handle (optional)
    ax: axes handle (optional)
    xlim: list or array (two elements, optional)
        x-axis boundaries.
    ylim: list or array (two elements, optional)
        y-axis boundaries. If left as None, defaults to [0, 1]
    show_risk: boolean (optional)
        Indicate if the number of patients at risk should be included below
        the axis (True) or not (False, default).

    Returns
    -------
    tuple
        The first element corresponds to the figure handle.
        The second element corresponds to the axes handle.
    """

    # Create figure (if necessary), or recover the missing handle from the
    # one that was supplied.
    if (fig is None) and (ax is None):
        fig, ax = plt.subplots(1, 1, figsize=[12, 6])
    elif fig is None:
        fig = ax.get_figure()
    elif ax is None:
        ax = fig.gca()

    # Each group is described by its legend label and its row selector.
    groups = [
        ('recommendation', recommendation_idx),
        ('anti-recommendation', ~recommendation_idx),
    ]

    kmf_list = []
    T_list = []
    E_list = []

    # For each group, fit a Kaplan-Meier estimator and plot its curve.
    for label, mask in groups:
        T_curr = T.loc[mask, :]
        E_curr = E.loc[mask, :]

        kmf = lifelines.KaplanMeierFitter()
        kmf.fit(T_curr, E_curr, label=label.capitalize())

        ax = kmf.plot(ax=ax, linewidth=5, legend=True)
        ax.legend(loc='best', frameon=False, fontsize='small')

        kmf_list.append(kmf)
        T_list.append(T_curr)
        E_list.append(E_curr)

    # Perform statistical analysis (log-rank test).
    results = lifelines.statistics.logrank_test(
        T_list[0], T_list[1], E_list[0], E_list[1], alpha=0.95)
    results.print_summary(style='ascii', decimals=4)

    # Calculate p-value text position and display.
    # NOTE: identity checks against None are required here; `== None` on a
    # numpy array compares element-wise and cannot be used as a condition.
    if ylim is None:
        y_pos = 0.1
    else:
        y_pos = 0.1 + min(ylim) + ((max(ylim) - min(ylim))*0.1)

    if results.p_value < 0.001:
        p_value_text = "$p$ < 0.001"
    else:
        p_value_text = f"$p$ = {results.p_value:.4f}"
    ax.text(T['T'].min()*10, y_pos, p_value_text, fontsize='small')

    # Apply axis limits.
    if xlim is not None:
        ax.set_xlim(np.array(xlim))
    if ylim is not None:
        ax.set_ylim(ylim)
    else:
        ax.set_ylim([0, 1])
    ax.set_ylabel("Survival probability", weight='bold')

    # Add risk counts.
    if show_risk:
        lifelines.plotting.add_at_risk_counts(kmf_list[0], kmf_list[1], ax=ax)

    # X-axis label is set here to be sure it is shown correctly even if
    # patients at risk will be shown.
    ax.set_xlabel("Time", weight='bold')

    return fig, ax
def _example_pareto_alpha(V_T_N):
    """Run one windowed two-state Pareto simulation and estimate alpha.

    Parameters
    ----------
    V_T_N : tuple
        ``((betas, xmin), T, N_traj)``: the Pareto shape parameters (one per
        state) with their shared scale ``xmin``, the observation window size
        ``T`` and the number of replicate trajectories ``N_traj``.

    Returns
    -------
    pandas.DataFrame
        One row per variable name, with columns ``'true'``,
        ``'mle-interior'``, ``'mle-uncensored'``, ``'fit-interior'``,
        ``'fit-corrected'``, ``'fit-kaplan'`` and ``'fit-uncensored'``.
        Any estimate whose computation fails is reported as NaN.
    """
    import multi_locus_analysis.finite_window as fw
    import multi_locus_analysis.plotting.finite_window as fplt

    def _require(*values):
        """Raise if an upstream step failed to produce one of *values*."""
        if any(value is None for value in values):
            raise ValueError('a required intermediate result is unavailable')

    # unpack parameters first
    (betas, xmin), T, N_traj = V_T_N
    var_pair = [
        fplt.Variable(scipy.stats.pareto(beta, scale=xmin),
                      name=f'Pareto({beta:0.3g})') for beta in betas
    ]
    # run one simulation
    sim = fw.ab_window([var.rvs for var in var_pair],
                       offset=-100 * np.sum([var.mean() for var in var_pair]),
                       window_size=T,
                       num_replicates=N_traj,
                       states=[var.name for var in var_pair])
    obs = fw.sim_to_obs(sim)

    # now extract alpha several different ways
    true_alpha = {var.name: var.args[0] + 1 for var in var_pair}
    mle_interior_est = {}
    mle_uncensored_baseline = {}
    fit_interior = {}
    fit_corrected = {}
    fit_kaplan = {}
    fit_uncensored_baseline = {}
    for var in var_pair:
        # Intermediate results are reset for every variable so that a failed
        # step yields NaN downstream instead of silently reusing the data
        # extracted for the previous variable.
        interior = windows = exterior = uncensored_obs = None
        num_obs = None

        # mle, interior
        try:
            interior, windows = fplt._int_win_from_obs(obs, var.name)
            num_obs = len(interior)
            mle_interior_est[var.name] = _mla_stats.power_law_slope_mle(
                interior, xmin, num_obs)
        except Exception:
            mle_interior_est[var.name] = np.nan
        # fit, interior
        try:
            _require(interior, windows)
            x_int, cdf_int = fw.ecdf_windowed(interior, windows)
            fit_interior[var.name] = _alpha_from_cdf(x_int, cdf_int, xmin)
        except Exception:
            fit_interior[var.name] = np.nan
        # fit, corrected
        try:
            exterior = fplt._ext_from_obs(obs, var.name)
            _require(interior)
            bin_centers, final_cdf = fw.ecdf_combined(exterior, interior, T)
            fit_corrected[var.name] = _alpha_from_cdf(bin_centers, final_cdf,
                                                      xmin)
        except Exception:
            fit_corrected[var.name] = np.nan
        # fit, kaplan: interior times are observed events, exterior times
        # are passed as censored observations
        try:
            _require(interior, exterior)
            times = np.concatenate([interior, exterior])
            is_interior = np.concatenate(
                [np.ones_like(interior),
                 np.zeros_like(exterior)]).astype(bool)
            kmf = lifelines.KaplanMeierFitter() \
                .fit(times, event_observed=is_interior)
            x_kap = kmf.cumulative_density_.index.values
            cdf_kap = kmf.cumulative_density_.values.flatten()
            fit_kaplan[var.name] = _alpha_from_cdf(x_kap, cdf_kap, xmin)
        except Exception:
            fit_kaplan[var.name] = np.nan
        # mle, uncensored baseline: same sample size as the interior times,
        # drawn directly from the variable
        try:
            _require(num_obs)
            uncensored_obs = var.rvs(size=(num_obs, ))
            mle_uncensored_baseline[var.name] = _mla_stats.power_law_slope_mle(
                uncensored_obs, xmin, num_obs)
        except Exception:
            mle_uncensored_baseline[var.name] = np.nan
        # fit, uncensored baseline
        try:
            _require(uncensored_obs)
            x_unc, cdf_unc = _mla_stats.ecdf(uncensored_obs, pad_left_at_x=0)
            fit_uncensored_baseline[var.name] = \
                _alpha_from_cdf(x_unc, cdf_unc, xmin)
        except Exception:
            fit_uncensored_baseline[var.name] = np.nan
    df = pd.concat(map(pd.Series, [
        true_alpha, mle_interior_est, mle_uncensored_baseline, fit_interior,
        fit_corrected, fit_kaplan, fit_uncensored_baseline
    ]),
                   axis=1)
    df.columns = [
        'true', 'mle-interior', 'mle-uncensored', 'fit-interior',
        'fit-corrected', 'fit-kaplan', 'fit-uncensored'
    ]
    return df
def _example_lambda_fit(V_T_N):
    """Run one windowed two-state exponential simulation and estimate means.

    Parameters
    ----------
    V_T_N : tuple
        ``(lambdas, T, N_traj)``: the ``scale`` passed to ``expon`` for each
        state, the observation window size ``T`` and the number of replicate
        trajectories ``N_traj``.

    Returns
    -------
    pandas.DataFrame
        One row per variable name, with columns ``'true'``, ``'corrected'``,
        ``'naive'``, ``'count-based'``, ``'kaplan'`` and ``'uncensored'``.
        A CDF-based estimate whose computation fails is reported as NaN.
    """
    import multi_locus_analysis.finite_window as fw
    import multi_locus_analysis.plotting.finite_window as fplt
    lambdas, T, N_traj = V_T_N
    var_pair = [
        fplt.Variable(expon(scale=lam), name=f"Exp({lam})") for lam in lambdas
    ]
    sim = fw.ab_window([var.rvs for var in var_pair],
                       offset=-100 * np.sum([var.mean() for var in var_pair]),
                       window_size=T,
                       num_replicates=N_traj,
                       states=[var.name for var in var_pair])
    obs = fw.sim_to_obs(sim)

    # presumably keyed by state name like the dicts below -- TODO confirm
    mean_est = fw.average_lifetime(obs)
    true_mean = {var.name: var.mean() for var in var_pair}
    naive_slope_est = {}
    correct_slope_est = {}
    kaplan_slope_est = {}
    uncensored_baseline = {}
    for var in var_pair:
        # naive
        interior, windows = fplt._int_win_from_obs(obs, var.name)
        try:
            x_int, cdf_int = fw.ecdf_windowed(interior, windows)
            naive_slope_est[var.name] = _mean_from_exp_cdf(x_int, cdf_int)
        except Exception:
            naive_slope_est[var.name] = np.nan
        # corrected
        exterior = fplt._ext_from_obs(obs, var.name)
        try:
            bin_centers, final_cdf = fw.ecdf_combined(exterior, interior, T)
            correct_slope_est[var.name] = _mean_from_exp_cdf(
                bin_centers, final_cdf)
        except Exception:
            correct_slope_est[var.name] = np.nan
        # kaplan: interior times are observed events, exterior times are
        # passed as censored observations
        times = np.concatenate([interior, exterior])
        is_interior = np.concatenate(
            [np.ones_like(interior),
             np.zeros_like(exterior)]).astype(bool)
        try:
            kmf = lifelines.KaplanMeierFitter() \
                    .fit(times, event_observed=is_interior)
            x_kap = kmf.cumulative_density_.index.values
            cdf_kap = kmf.cumulative_density_.values.flatten()
            kaplan_slope_est[var.name] = _mean_from_exp_cdf(x_kap, cdf_kap)
        except Exception:
            kaplan_slope_est[var.name] = np.nan
        # uncensored baseline: same sample size as the interior times, drawn
        # directly from the variable
        num_obs = len(interior)
        try:
            x_unc, cdf_unc = _mla_stats.ecdf(var.rvs(size=(num_obs, )),
                                             pad_left_at_x=0)
            uncensored_baseline[var.name] = _mean_from_exp_cdf(x_unc, cdf_unc)
        except Exception:
            uncensored_baseline[var.name] = np.nan
    df = pd.concat(map(pd.Series, [
        true_mean, correct_slope_est, naive_slope_est, mean_est,
        kaplan_slope_est, uncensored_baseline
    ]),
                   axis=1)
    df.columns = [
        'true', 'corrected', 'naive', 'count-based', 'kaplan', 'uncensored'
    ]
    return df