Esempio n. 1
0
from lifelines import KaplanMeierFitter

# Score the held-out set with the fitted Cox model.  The partial hazards are
# computed but not stored, exactly as in the original workflow.
cph.predict_partial_hazard(test_X)
survival_result = cph.predict_survival_function(test_X)

# Keep only survival probabilities at or below 0.5; everything else becomes NaN.
survival_result = survival_result[survival_result <= 0.5]

# For every subject (one column per subject) take the index label of the
# largest remaining probability and use it as the predicted length of stay.
# A curve that never reaches 0.5 has no such label, so NaN is recorded
# explicitly instead of relying on how idxmax treats an all-NaN column.
predicted_los = [
    survival_result[c].idxmax() if survival_result[c].notna().any() else np.nan
    for c in survival_result.columns
]

# The frame is sized from the data instead of a hard-coded 677 rows, so no
# placeholder rows can leak into the merge below and larger test sets work.
LOSResult = pd.DataFrame({
    'Id': np.arange(len(predicted_los)),
    'LOS': predicted_los,
})

# NOTE(review): 'Id' above is the positional column number of the prediction,
# while test['Id'] is test.index -- this assumes test.index is 0..n-1 in the
# same order as the rows of test_X; confirm.
test['Id'] = test.index
test1 = pd.merge(test, LOSResult, on='Id')

# Compare the observed and the predicted stay with two Kaplan-Meier curves
# drawn on the same axes.
fig, ax = plt.subplots(figsize=(12, 12))
kmf_control = KaplanMeierFitter()
ax = kmf_control.fit(test1['术后住院时间'], label='Real').plot(ax=ax,
                                                         color='#C32B4A')
kmf_exp = KaplanMeierFitter()
ax = kmf_exp.fit(test1['LOS'], label='Predicted').plot(ax=ax, color='#3F76B4')

font2 = {
    'family': 'Times New Roman',
    'weight': 'normal',
    'size': 28,
}
plt.xlabel('Postoperative hospital stay (days)', font2)
plt.ylabel('Percent hospitalized', font2)

# Hide the top/right frame and nudge the remaining spines slightly outward.
ax.spines['left'].set_position(('outward', 0.2))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_position(('outward', 0.2))
Esempio n. 2
0
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Draw a quantile-quantile plot comparing the empirical (Kaplan-Meier)
    quantiles of the data with the quantiles of the fitted parametric
    distribution. Points far away from the dotted line y=x suggest a poor
    fit (some deviation in the tails is expected).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax:
        The axes to draw on. When ``None`` the current matplotlib axes are used.
    plot_kwargs:
        Accepted for interface compatibility; not used by this implementation.

    Returns
    --------
    ax:
        The axes which was used.

    Raises
    -------
    NotImplementedError
        If the model was fitted on interval-censored data.

    Examples
    ---------

    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)


    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    ax = plt.gca() if ax is None else ax

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    COL_EMP = "empirical quantiles"
    COL_THEO = "fitted %s quantiles" % dist

    # Pick the non-parametric fitting routine that matches the model's censoring.
    if CensoringType.is_left_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_left_censoring
    elif CensoringType.is_right_censoring(model):
        fit_empirical = KaplanMeierFitter().fit_right_censoring
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")
    kmf = fit_empirical(model.durations, model.event_observed, label=COL_EMP)

    # Distinct values of the empirical CDF are the probability levels compared.
    cdf_levels = np.unique(kmf.cumulative_density_.values[:, 0])
    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
    quantiles = qth_survival_times(1 - cdf_levels, kmf.survival_function_)
    quantiles[COL_THEO] = dist_object.ppf(cdf_levels)
    # Rows holding 0 or +/-inf in either column are dropped.
    quantiles = quantiles.replace([-np.inf, 0, np.inf], np.nan).dropna()

    empirical = quantiles[COL_EMP]
    upper, lower = empirical.max(), empirical.min()
    diagonal = [lower, upper]

    quantiles.plot.scatter(COL_THEO, COL_EMP, c="none", edgecolor="k", lw=0.5, ax=ax)
    ax.plot(diagonal, diagonal, c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)

    return ax
Esempio n. 3
0
def survival_difference_at_fixed_point_in_time_test(
    point_in_time,
    durations_A,
    durations_B,
    event_observed_A=None,
    event_observed_B=None,
    **kwargs
) -> StatisticalResult:
    """
    Test whether two groups have the same survival probability at one fixed time.

    Analysts are frequently interested in survival at a specific time (for example 5-year survival)
    rather than in the whole survival curve. Comparing the raw Kaplan-Meier estimates at that time
    has reduced power (see [1]); the comparison is therefore carried out on the log(-log)
    transformed Kaplan-Meier estimates, which recovers power.


    Parameters
    ----------
    point_in_time: float,
        the point in time to analyze the survival curves at.

    durations_A: iterable
        a (n,) list-like of event durations (birth to death,...) for the first population.

    durations_B: iterable
        a (n,) list-like of event durations (birth to death,...) for the second population.

    event_observed_A: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the first population.
        Default assumes all observed.

    event_observed_B: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the second population.
        Default assumes all observed.

    kwargs:
        add keywords and meta-data to the experiment summary


    Returns
    -------

    StatisticalResult
      a StatisticalResult object with properties ``p_value``, ``summary``, ``test_statistic``, ``print_summary``

    Examples
    --------
    .. code:: python

        T1 = [1, 4, 10, 12, 12, 3, 5.4]
        E1 = [1, 0, 1,  0,  1,  1, 1]

        T2 = [4, 5, 7, 11, 14, 20, 8, 8]
        E2 = [1, 1, 1, 1,  1,  1,  1, 1]

        from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
        results = survival_difference_at_fixed_point_in_time_test(12, T1, T2, event_observed_A=E1, event_observed_B=E2)

        results.print_summary()
        print(results.p_value)        # 0.893
        print(results.test_statistic) # 0.017

    Notes
    -----
    Other transformations are possible, but Klein et al. [1] showed that the log(-log(c)) transform has the most desirable
    statistical properties.

    References
    -----------

    [1] Klein, J. P., Logan, B. , Harhoff, M. and Andersen, P. K. (2007), Analyzing survival curves at a fixed point in time. Statist. Med., 26: 4505-4519. doi:10.1002/sim.2864

    """

    fitter_a = KaplanMeierFitter().fit(durations_A, event_observed=event_observed_A)
    fitter_b = KaplanMeierFitter().fit(durations_B, event_observed=event_observed_B)

    # Survival estimates of both groups at the requested time.
    surv_a = fitter_a.predict(point_in_time)
    surv_b = fitter_b.predict(point_in_time)

    # this is doing a prediction/interpolation between the kmf's index.
    var_a = interpolate_at_times_and_return_pandas(fitter_a._cumulative_sq_, point_in_time)
    var_b = interpolate_at_times_and_return_pandas(fitter_b._cumulative_sq_, point_in_time)

    def complementary_log_log(s):
        # log(-log(s)) transform of a survival probability
        return np.log(-np.log(s))

    # Squared difference of the transformed estimates, scaled by the sum of
    # each group's variance term divided by its squared log-survival.
    difference = complementary_log_log(surv_a) - complementary_log_log(surv_b)
    scale = var_a / np.log(surv_a) ** 2 + var_b / np.log(surv_b) ** 2
    statistic = difference ** 2 / scale

    p_value = _chisq_test_p_value(statistic, 1)

    return StatisticalResult(
        p_value,
        statistic,
        null_distribution="chi squared",
        degrees_of_freedom=1,
        point_in_time=point_in_time,
        test_name="survival_difference_at_fixed_point_in_time_test",
        **kwargs
    )
Esempio n. 4
0
def plot3(df):
    """Run pairwise log-rank tests and plot overall and per-group Kaplan-Meier curves.

    The frame is read in two ways:

    * columns ``time``, ``group`` and ``event`` feed ``pairwise_logrank_test``;
    * indexed by ``time``, columns ``death`` and ``censored`` are treated as a
      counts table and expanded with ``survival_events_from_table``.

    One curve for the whole table (with confidence band) and one curve per
    value of ``group`` (without band) are drawn on the same axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``time``, ``group``, ``event``, ``death``
        and ``censored``.

    Returns
    -------
    Whatever ``fig_to_uri(plt)`` returns for the finished figure.
    """
    import matplotlib.pyplot as plt

    from lifelines import KaplanMeierFitter
    from lifelines.statistics import pairwise_logrank_test
    from lifelines.utils import survival_events_from_table

    # The caller's frame is never rebound, so 'time' stays available as a column.
    results = pairwise_logrank_test(df['time'], df['group'], df['event'])
    results.print_summary()

    ax = plt.subplot(111)

    # survival_events_from_table expects the time as index; index once only
    # (indexing a second time would fail because 'time' is no longer a column).
    table = df.set_index('time')

    # Overall curve built from the death / censored counts.
    T, E, W = survival_events_from_table(table,
                                         observed_deaths_col='death',
                                         censored_col='censored')
    kmf = KaplanMeierFitter()
    kmf.fit(T, E, weights=W)
    kmf.plot(ax=ax, ci_show=True, marker='o')

    print(E)

    # One curve per group: expand each group's own counts table and fit it.
    for name, group_table in table.groupby('group'):
        T_group, E_group, W_group = survival_events_from_table(
            group_table,
            observed_deaths_col='death',
            censored_col='censored')
        group_kmf = KaplanMeierFitter()
        group_kmf.fit(T_group, E_group, weights=W_group, label=name)
        group_kmf.plot(ax=ax, ci_show=False, marker='o')

    plt.xlabel("days")
    plt.ylabel("survival %")
    plt.ylim(0.4, 1.05)

    return fig_to_uri(plt)
Esempio n. 5
0
def show_survival_curve(df,
                        t_col,
                        y_col,
                        max_time=None,
                        weight=None,
                        save_file=None):
    """Plot one Kaplan-Meier curve per treatment group and return their difference.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the treatment column and the duration column.
    t_col : str
        Name of the treatment column; its values are cast to ``int`` to find
        the distinct groups.
    y_col : str
        Name of the duration column.
    max_time : number, optional
        Durations strictly below this value count as observed events, all
        others as censored. Defaults to the largest duration in ``y_col``.
    weight : array-like, optional
        Per-row weights, indexed with the same boolean mask as the rows and
        passed to ``KaplanMeierFitter.fit``.
    save_file : str, optional
        When given, the figure is saved to this path before it is shown.

    Returns
    -------
    numpy.ndarray
        Flattened difference (second group minus first group, in sorted
        treatment order) of the survival curves evaluated at the days
        ``0 .. int(max_time) - 1``.

    Raises
    ------
    IndexError
        If fewer than two treatment groups are present, because the returned
        difference needs two curves.
    """
    plt.figure(figsize=(8, 6))
    plt.rcParams["font.size"] = 14
    colors = ['blue', 'red', 'magenta']

    tr_uniq = np.sort(df[t_col].astype(int).unique())
    max_time = df[y_col].max() if max_time is None else max_time
    time = df[y_col].values
    # Rows that reach max_time are treated as censored.
    event = np.where(df[y_col] < max_time, 1, 0)

    # Four days (start, one third, two thirds, end) annotated with the
    # survival probability of every group.
    verbose_days = [
        0,
        int((max_time - 1) / 3),
        int((max_time - 1) * 2 / 3),
        int(max_time) - 1
    ]

    for d in verbose_days:
        plt.text(d,
                 0.6,
                 f'RR({d}day)',
                 horizontalalignment='center',
                 verticalalignment='center')

    curve_list = []
    elapsed_days = np.arange(int(max_time))
    kmf = KaplanMeierFitter()
    for i, tr in enumerate(tr_uniq):
        # Cycle through the palette so more than three groups do not fail.
        color = colors[i % len(colors)]
        t_idx = (df[t_col] == tr)
        if weight is None:
            kmf.fit(time[t_idx], event[t_idx], label=f'tr={tr}')
        else:
            kmf.fit(time[t_idx],
                    event[t_idx],
                    label=f'tr={tr}',
                    weights=weight[t_idx])
        curve_list.append(kmf.survival_function_at_times(elapsed_days))
        kmf.plot(c=color)
        for d in verbose_days:
            surv_prob = kmf.survival_function_at_times(d).values[0]
            plt.scatter(d, surv_prob, marker='o', c=color)
            # Stack the numeric labels of the groups below the 'RR(..)' header.
            plt.text(d,
                     0.6 - 0.02 * (i + 1),
                     f'{surv_prob :.3f}',
                     c=color,
                     horizontalalignment='center',
                     verticalalignment='center')

    plt.xlim(-3, int(max_time) + 3)
    plt.ylim(0.5, 1.05)
    plt.xlabel('Followed days (elapsed days)')
    plt.ylabel('Survival probability (retention rate)')
    plt.legend(loc='best')
    plt.grid()
    plt.tight_layout()
    if save_file is not None:
        plt.savefig(save_file)
    plt.show()

    return (np.array(curve_list[1]) - np.array(curve_list[0])).reshape(-1)
Esempio n. 6
0
def categorical_km_curves(feature, t='Tenure', event='Churn', df=df, ax=None):
    """Plot one Kaplan-Meier curve (with confidence band) per category of ``feature``.

    Categories are processed in descending sorted order. Each curve uses the
    rows of ``df`` belonging to that category, with column ``t`` as duration
    and column ``event`` as event indicator, and is coloured via the
    module-level ``colours`` mapping.
    """
    categories = sorted(df[feature].unique(), reverse=True)
    for cat in categories:
        subset = df[df[feature] == cat]
        fitter = KaplanMeierFitter()
        fitter.fit(subset[t], event_observed=subset[event], label=cat)
        fitter.plot(ax=ax, label=cat, ci_show=True, c=colours[cat])
    def fit(
        self,
        durations,
        event_observed,
        event_of_interest,
        timeline=None,
        entry=None,
        label="AJ_estimate",
        alpha=None,
        ci_labels=None,
        weights=None,
    ):  # pylint: disable=too-many-arguments,too-many-locals
        """
        Fit the estimator and store the cumulative incidence of ``event_of_interest``
        in ``self.cumulative_density_``.

        Parameters
        ----------
          durations: an array or pd.Series of length n -- duration of subject was observed for
          event_observed: an array, or pd.Series, of length n. Integer indicator of distinct events. Must be
             only positive integers, where 0 indicates censoring.
          event_of_interest: integer -- indicator for event of interest. All other integers are considered competing events
             Ex) event_observed contains 0, 1, 2 where 0:censored, 1:lung cancer, and 2:death. If event_of_interest=1, then death (2)
             is considered a competing event. The returned cumulative incidence function corresponds to risk of lung cancer
          timeline: return the best estimate at the values in timelines (positively increasing)
          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
             useful for left-truncated (not left-censored) observations. If None, all members of the population
             were born at time 0.
          label: a string stored as ``self._label``. Note that the estimate column itself is
             named ``CIF_<event_of_interest>``.
          alpha: the alpha value in the confidence intervals. Overrides the initializing
             alpha for this call to fit only. A falsy value (``None`` or 0) falls back to ``self.alpha``.
          ci_labels: add custom column names to the generated confidence intervals
                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
          weights: an array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
              of providing every subject as a single element of `durations` and `event_observed`, one could
              weigh subject differently.

        Returns
        -------
        self : AalenJohansenFitter
          self, with new properties like ``cumulative_incidence_``.

        Warns
        -----
        Warning
          when tied event times are detected; the durations are then jittered before fitting.
        """
        # Checking for tied event times
        ties = self._check_for_duplicates(durations=durations,
                                          events=event_observed)

        if ties:
            warnings.warn(
                dedent(
                    """Tied event times were detected. The Aalen-Johansen estimator cannot handle tied event times.
                To resolve ties, data is randomly jittered."""),
                Warning,
            )
            # From here on the jittered durations replace the caller's durations.
            durations = self._jitter(
                durations=pd.Series(durations),
                event=pd.Series(event_observed),
                jitter_level=self._jitter_level,
                seed=self._seed,
            )

        # A falsy alpha (None, 0) falls back to the value stored on the instance.
        alpha = alpha if alpha else self.alpha

        # Creating label for event of interest & indicator for that event
        event_of_interest = int(event_of_interest)
        cmprisk_label = "CIF_" + str(event_of_interest)
        self.label_cmprisk = "observed_" + str(event_of_interest)

        # Fitting Kaplan-Meier for either event of interest OR competing risk
        # (the raw multi-valued event indicator is passed straight through).
        km = KaplanMeierFitter().fit(durations,
                                     event_observed=event_observed,
                                     timeline=timeline,
                                     entry=entry,
                                     weights=weights)
        aj = km.event_table
        aj["overall_survival"] = km.survival_function_
        # shift() moves the column down one row, so every row carries the
        # overall survival of the preceding row.
        aj["lagged_overall_survival"] = aj["overall_survival"].shift()

        # Setting up table for calculations and to return to user
        # event_spec is True only for rows whose event equals the event of interest.
        event_spec = pd.Series(event_observed) == event_of_interest
        self.durations, self.event_observed, *_, event_table, weights = _preprocess_inputs(
            durations=durations,
            event_observed=event_spec,
            timeline=timeline,
            entry=entry,
            weights=weights)
        # Attach the "observed" counts of the event of interest as an extra
        # column named "observed_<event_of_interest>".
        event_spec_times = event_table["observed"]
        event_spec_times = event_spec_times.rename(self.label_cmprisk)
        aj = pd.concat([aj, event_spec_times], axis=1).reset_index()

        # Estimator of Cumulative Incidence (Density) Function:
        # cumulative sum of (events of interest / at risk) * lagged overall survival.
        aj[cmprisk_label] = (aj[self.label_cmprisk] / aj["at_risk"] *
                             aj["lagged_overall_survival"]).cumsum()
        aj.loc[0, cmprisk_label] = 0  # Setting initial CIF to be zero
        aj = aj.set_index("event_at")

        # Setting attributes
        self._estimation_method = "cumulative_density_"
        self._estimate_name = "cumulative_density_"
        self.timeline = km.timeline
        self._update_docstrings()

        self._label = label
        # Technically, cumulative incidence, but named consistently with KaplanMeierFitter
        self.cumulative_density_ = pd.DataFrame(aj[cmprisk_label])

        # Event table exposed to the user, including the event-of-interest counts
        self.event_table = aj[[
            "removed", "observed", self.label_cmprisk, "censored", "entrance",
            "at_risk"
        ]]  # Event table

        # Variance and confidence interval are only computed when enabled on the instance.
        if self._calc_var:
            self.variance_, self.confidence_interval_ = self._bounds(
                aj["lagged_overall_survival"],
                alpha=alpha,
                ci_labels=ci_labels)
        else:
            self.variance_, self.confidence_interval_ = None, None

        # Alias kept under the cumulative-density name.
        self.confidence_interval_cumulative_density_ = self.confidence_interval_
        return self
# Detection limits taken from percentiles of the true durations.
MIN_2 = np.percentile(T_actual, 30)
MIN_3 = np.percentile(T_actual, 50)

# Every observation is randomly assigned to one of four detection limits.
ix = np.random.randint(4, size=N)
T = T_actual.copy()

# A value below its group's limit is replaced by that limit (left censoring).
for group, floor in enumerate((MIN_0, MIN_1, MIN_2, MIN_3)):
    T = np.where(ix == group, np.maximum(T, floor), T)

# An observation counts as observed only when it was left unchanged.
E = T_actual == T

fig, axes = plt.subplots(2, 2, figsize=(9, 5))
axes = axes.ravel()

# Fit every model on the left-censored data and draw its CDF in its own panel.
models = [WeibullFitter(), KaplanMeierFitter(), LogNormalFitter(), LogLogisticFitter()]
for i, model in enumerate(models):
    model.fit(T, E, left_censorship=True, label=type(model).__name__)
    model.plot_cumulative_density(ax=axes[i])
plt.tight_layout()

# For each parametric model: a fresh two-panel figure with the CDF
# comparison on top and the Q-Q plot below.
parametric_models = [WeibullFitter(), LogNormalFitter(), LogLogisticFitter()]
for i, model in enumerate(parametric_models):
    model.fit(T, E, left_censorship=True)
    fig, axes = plt.subplots(2, 1, figsize=(8, 6))

    left_censorship_cdf_plot(model, ax=axes[0])
    qq_plot(model, ax=axes[1])
Esempio n. 9
0
# Maps the requested reference name to (column-name prefix, reference database name).
_REFERENCE_SOURCES = {
    "EPIC": ("EPIC_cellFractions.", "ref"),
    "LM": ("LM_", "LM_ref"),
    "QS": ("QS_", "QS_ref"),
}

# Event codes used for the survival fits: only "Dead" is an observed event.
_EVENT_CODES = {"Dead": 1, "Alive": 0, "-": 0}


def _param_value(parts, position):
    """Return the value of the ``key=value`` fragment at ``position``."""
    return parts[position].split("=")[1]


def _round3(values):
    """Round every number in ``values`` to three decimals."""
    return [round(value, 3) for value in values]


def _collect_survival(survival_df, samples, survival_type):
    """Return ``(months, events)`` of the rows whose sample is in ``samples``.

    Rows whose day value is ``"-"`` are dropped; days are converted to months
    by dividing by 30 and rounding to two decimals.
    """
    rows = survival_df.loc[survival_df["sample"].isin(samples), :]
    # NOTE(review): the event flag is read from OSEVENT even when RFSDAY is
    # used as duration -- confirm this is intended for the RFS case.
    events = [_EVENT_CODES[status] for status in rows["OSEVENT"].tolist()]
    day_column = "OSDAY" if survival_type == "OS" else "RFSDAY"
    days = rows[day_column].tolist()

    kept = [(day, event) for day, event in zip(days, events) if day != "-"]
    months = [round(float(day) / 30, 2) for day, _ in kept]
    kept_events = [event for _, event in kept]
    return months, kept_events


def _km_curves(months, events):
    """Fit a Kaplan-Meier model and return its plot series as plain lists."""
    kmf = KaplanMeierFitter()
    kmf.fit(months, events)

    survival = kmf.survival_function_.T
    # Rows 0 and 1 of the transposed confidence-interval table (presumably
    # the lower and upper bound -- confirm against lifelines' column order).
    bounds = kmf.confidence_interval_survival_function_.T.values
    censored_times = [float(month) for month, event in zip(months, events) if event == 0]
    return {
        "x": survival.columns.tolist(),
        "estimate": _round3(survival.values[0].tolist()),
        "ci_row1": _round3(bounds[1].tolist()),
        "ci_row0": _round3(bounds[0].tolist()),
        "censored_x": censored_times,
        "censored_y": _round3(kmf.survival_function_at_times(censored_times).tolist()),
    }


def _line_trace(x, y, color, dash):
    """Build one step-line trace dictionary for the response."""
    return {
        "line": {
            "dash": dash,
            "color": color,
            "shape": "hv",
            "width": 2
        },
        "mode": "lines",
        "name": "",
        "type": "scatter",
        "x": x,
        "y": y,
        "xaxis": "x1",
        "yaxis": "y1",
        "showlegend": False
    }


def _censor_trace(x, y):
    """Build the marker trace that shows the censored observations."""
    return {
        "mode": "markers",
        "name": "",
        "text": "",
        "type": "scatter",
        "x": x,
        "y": y,
        "xaxis": "x1",
        "yaxis": "y1",
        "marker": {
            "size": 10,
            "color": "black",
            "symbol": "cross-thin-open",
            "opacity": 1,
            "sizeref": 1,
            "sizemode": "area"
        },
        "showlegend": False
    }


def index_of_survival(request: HttpRequest, all_parameter: str):
    """Compare the survival of samples with high and low cell-type expression.

    ``all_parameter`` is a string of ``key=value`` fragments joined by ``&``.
    Only the position of a fragment matters, not its key:

    0. survival type: ``OS`` uses ``OSDAY``, anything else uses ``RFSDAY``
    1. comma-separated ``primary_disease`` values (at most eight)
    2. comma-separated cell types
    3. ``up`` percentage (integer)
    4. ``dn`` percentage (integer)
    5. reference name: ``EPIC``, ``LM`` or ``QS``

    Samples are ranked by their summed weighted expression, split into an
    "up" and a "dn" group, and a Kaplan-Meier curve is fitted for each group.

    Parameters
    ----------
    request : HttpRequest
        The incoming request; not read by this function.
    all_parameter : str
        The positional parameter string described above.

    Returns
    -------
    JsonResponse
        ``{"data": [...]}`` holding the log-rank and Cox p-values followed by
        four traces per group (estimate, two confidence bounds, censored
        markers; "up" in red, then "dn" in blue), or ``{"error": ...}`` when
        too many datasets are selected, the reference is unknown, or one of
        the groups has no usable survival data.
    """
    mm = all_parameter.split("&")

    st = _param_value(mm, 0)
    # str.split already yields a one-element list when there is no comma.
    ct = _param_value(mm, 1).split(",")

    # Validate before any database access.
    if len(ct) > 8:
        response = {
            "error":
            "Too many datasets. You can select no more than eight datasets."
        }
        return JsonResponse(response)

    ref_name = _param_value(mm, 5)
    cell = _param_value(mm, 2).split(",")
    up = _param_value(mm, 3)
    dn = _param_value(mm, 4)

    if ref_name not in _REFERENCE_SOURCES:
        response = {"error": "reference error"}
        return JsonResponse(response)
    prefix, reference_db = _REFERENCE_SOURCES[ref_name]
    columns_list = [prefix + name for name in cell]

    # Cell-fraction columns of the selected diseases, one row per cellID.
    obs_df = pd.DataFrame(API.DatabaseAPI("tcga").query_collection_obs())
    select_part = obs_df.loc[obs_df["primary_disease"].isin(ct), :]
    my_df_d = select_part.loc[:, columns_list]
    my_df_d.index = select_part["cellID"].tolist()
    my_df_t = my_df_d.T

    # Mean reference expression per cell type, restricted to the requested ones.
    ref = API.DatabaseAPI(reference_db)
    genes = ref.query_collection_var()["geneSymbol"]
    gg = pd.DataFrame(ref.query_collection_gene_X_var_by_obs(genes))
    gg.columns = ref.query_collection_obs()["celltype"]
    gg_mean = pd.DataFrame(gg.T.mean(axis=1))
    gg_mean = gg_mean.loc[cell, :]

    # Weight the fractions by the mean expression, sum per sample and rank
    # the samples from highest to lowest.
    expression = my_df_t.multiply(gg_mean.values)
    expression_t = expression.T
    expression_t = pd.DataFrame(expression_t.sum(axis=1), columns=["sum"])
    expression_t = expression_t.sort_values(by=["sum"], ascending=False)

    number = expression_t.shape[0]
    number1 = int(number / 100 * (100 - int(up)))
    number2 = int(number / 100 * (100 - int(dn)))

    # Only the first three dot-separated parts of an identifier are kept.
    sample = []
    for each in expression_t.index.tolist():
        names = each.split(".")
        sample.append(names[0] + "." + names[1] + "." + names[2])

    up_sample = sample[:number1]
    # NOTE(review): the "+ 1" skips the sample at position number2 -- looks
    # like a possible off-by-one; left unchanged, confirm the intended split.
    dn_sample = sample[number2 + 1:]

    # The survival table is the same for both groups, so it is fetched once.
    survival_df = pd.DataFrame(API.DatabaseAPI("survival").query_collection_obs())
    T_end_up, E_end_up = _collect_survival(survival_df, up_sample, st)
    T_end_dn, E_end_dn = _collect_survival(survival_df, dn_sample, st)

    # Without data in both groups no curve or test can be computed.
    if not T_end_up or not T_end_dn:
        response = {
            "error": "No survival data available for the selected samples."
        }
        return JsonResponse(response)

    up_curves = _km_curves(T_end_up, E_end_up)
    dn_curves = _km_curves(T_end_dn, E_end_dn)

    results = logrank_test(T_end_up,
                           T_end_dn,
                           event_observed_A=E_end_up,
                           event_observed_B=E_end_dn)
    # Both summaries have a single row; take it explicitly rather than
    # converting a one-element array to float.
    pValues1 = float(results.summary["p"].iloc[0])

    dfA = pd.DataFrame({'E': E_end_up, 'T': T_end_up, 'groupA': 1})
    dfB = pd.DataFrame({'E': E_end_dn, 'T': T_end_dn, 'groupA': 0})
    df = pd.concat([dfA, dfB])
    cph = CoxPHFitter().fit(df, 'T', 'E')
    pValues2 = float(cph.summary["p"].iloc[0])

    traces = [{"pValues1": pValues1, "pValues2": pValues2}]
    for color, curves in (("red", up_curves), ("blue", dn_curves)):
        traces.append(_line_trace(curves["x"], curves["estimate"], color, "solid"))
        traces.append(_line_trace(curves["x"], curves["ci_row1"], color, "dash"))
        traces.append(_line_trace(curves["x"], curves["ci_row0"], color, "dash"))
        traces.append(_censor_trace(curves["censored_x"], curves["censored_y"]))

    return JsonResponse({"data": traces})
def main(data_df):
    """Binarise the thresholded features, build the V-HU risk score and plot KM curves.

    Every column named in the module-level ``th_dict`` is turned into a 0/1
    flag (1 when the value is at least its threshold); ``data_df`` is modified
    in place. The flags ``HU_of_consolidation`` and
    ``Volume_of_total_pneumonia_infection`` are added up to a score of 0, 1
    or 2, and one Kaplan-Meier curve per score is drawn from ``Duration`` and
    ``Death``. Hard-coded hazard ratios and numbers at risk are written onto
    the figure, which is saved as ``km_alldata_V-HU.pdf``.
    """
    for key in th_dict:
        threshold = th_dict[key]
        # Missing values are zero-filled unless "hu" occurs in the key at a
        # position after the first character.
        # NOTE(review): the search is case-sensitive, so a key such as
        # 'HU_of_consolidation' is zero-filled as well -- confirm intent.
        if key.find("hu") <= 0:
            data_df[key] = data_df[key].fillna(0)
        data_df[key] = data_df[key].map(lambda value: 1 if value >= threshold else 0)

    # Combined score: sum of the two binary flags, i.e. 0, 1 or 2.
    key_word = "V-HU"
    score = data_df['HU_of_consolidation'] + data_df['Volume_of_total_pneumonia_infection']
    all_data = pd.concat(
        [data_df["Duration"], data_df["Death"], score.rename(key_word)],
        axis=1)

    T = all_data["Duration"]
    death = all_data['Death']
    risk_labels = ('low risk', 'intermediate risk', 'high risk')
    risk_masks = [all_data[key_word] == level for level in range(len(risk_labels))]

    # One curve per risk level; all of them end up on the same axes.
    kmf = KaplanMeierFitter()
    ax = None
    for mask, risk_label in zip(risk_masks, risk_labels):
        kmf.fit(T[mask], event_observed=death[mask], label=risk_label)
        ax = kmf.plot(ax=ax)

    plt.legend(fontsize=7, loc='lower left')
    plt.ylabel('Survival Probability')
    plt.xlabel('Time since admission to death(days)')

    # Hard-coded summary statistics printed on the figure.
    plt.text(37, 1, "Hazard ratio:", fontsize=8, style='italic')
    plt.text(37, 0.96, "low risk: reference", fontsize=8)
    plt.text(37, 0.92, "intermediate risk: 2,54; 95%CI, 1,44-4,49", fontsize=8)
    plt.text(37, 0.88, "high risk: 4,90; 95%CI, 2,78-8,64", fontsize=8)
    plt.text(12, 1, "p-value < 0,0001", fontsize=8, fontstyle='italic')

    # Hard-coded numbers at risk, one value every 10 units along the x axis.
    low_list = ['69', '61', '56', '53', '53', '53', '53', '53']
    medium_list = ['100', '69', '58', '55', '53', '53', '53', '53']
    high_list = ['69', '37', '27', '26', '21', '21', '20', '20']

    plt.text(-30, 0.005, "Numbers at low risk", fontsize=8)
    for i, count in enumerate(low_list):
        plt.text((i * 10) - 1, 0, count, fontsize=8)

    plt.text(-30, -0.035, "Numbers at intermediate risk", fontsize=8)
    for i, count in enumerate(medium_list):
        plt.text((i * 10) - 1, -0.04, count, fontsize=8)

    plt.text(-30, -0.075, "Numbers at high risk", fontsize=8)
    for i, count in enumerate(high_list):
        # Single-digit counts are not shifted left.
        x_pos = (i * 10) if len(count) == 1 else (i * 10) - 1
        plt.text(x_pos, -0.08, count, fontsize=8)

    plt.savefig("km_alldata_V-HU.pdf", bbox_inches='tight')
Esempio n. 11
0
def mc_gformula_check():
    """Plot Monte Carlo g-formula risk curves next to the observed risk curve.

    Loads the time-varying sample data, derives lagged, polynomial and spline
    terms, and fits a ``MonteCarloGFormula`` twice: first with exposure,
    outcome and covariate models only, then again after adding a censoring
    model. Kaplan-Meier fits of both simulated cohorts and of the observed
    data are drawn as cumulative incidence (``1 - survival``) step curves
    labelled 'Natural', 'Censor' and 'True'.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    df = load_sample_data(timevary=True)

    # Lagged covariates. The first record of each id has no predecessor, so it
    # gets 0 (art) or the baseline value (cd4, dvl) rather than whatever
    # ``shift`` carried over from the preceding id's rows.
    first_record = df.groupby('id').cumcount() == 0
    df['lag_art'] = np.where(first_record, 0, df['art'].shift(1))
    df['lag_cd4'] = np.where(first_record, df['cd40'], df['cd4'].shift(1))
    df['lag_dvl'] = np.where(first_record, df['dvl0'], df['dvl'].shift(1))

    df[['age_rs0', 'age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=4, term=2, restricted=True)  # age spline
    df['cd40_sq'] = df['cd40'] ** 2  # cd4 baseline cubic
    df['cd40_cu'] = df['cd40'] ** 3
    df['cd4_sq'] = df['cd4'] ** 2  # cd4 current cubic
    df['cd4_cu'] = df['cd4'] ** 3
    df['enter_sq'] = df['enter'] ** 2  # entry time cubic
    df['enter_cu'] = df['enter'] ** 3

    g = MonteCarloGFormula(df, idvar='id', exposure='art', outcome='dead', time_in='enter', time_out='out')
    exp_m = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + cd4_sq + 
            cd4_cu + dvl + enter + enter_sq + enter_cu'''
    g.exposure_model(exp_m, restriction="g['lag_art']==0")
    out_m = '''art + male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + 
            cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'''
    g.outcome_model(out_m, restriction="g['drop']==0")
    dvl_m = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
            lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    g.add_covariate_model(label=1, covariate='dvl', model=dvl_m, var_type='binary')
    cd4_m = '''male + age0 + age_rs0 + age_rs1 + age_rs2 +  cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + 
            lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    cd4_recode_scheme = ("g['cd4'] = np.maximum(g['cd4'],1);"
                         "g['cd4_sq'] = g['cd4']**2;"
                         "g['cd4_cu'] = g['cd4']**3")
    g.add_covariate_model(label=2, covariate='cd4', model=cd4_m,recode=cd4_recode_scheme, var_type='continuous')

    def simulate():
        # Both simulations use identical settings; fresh argument objects are
        # built on every call so the two runs cannot share state through them.
        g.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
              lags={'art': 'lag_art',
                    'cd4': 'lag_cd4',
                    'dvl': 'lag_dvl'},
              sample=10000, t_max=None,
              in_recode=("g['enter_sq'] = g['enter']**2;"
                         "g['enter_cu'] = g['enter']**3"))
        return g.predicted_outcomes

    # First simulation: no censoring model.
    gf = simulate()
    kmn = KaplanMeierFitter()
    kmn.fit(durations=gf['out'], event_observed=gf['dead'])

    # Observed data, with each record entering at its own 'enter' time.
    kmo = KaplanMeierFitter()
    kmo.fit(durations=df['out'], event_observed=df['dead'], entry=df['enter'])

    # Second simulation: same settings after adding a censoring model.
    cens_m = """male + age0 + age_rs0 + age_rs1 + age_rs2 +  cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 +
             lag_dvl + lag_art + enter + enter_sq + enter_cu"""
    g.censoring_model(cens_m)
    gf = simulate()
    kmc = KaplanMeierFitter()
    kmc.fit(durations=gf['out'], event_observed=gf['dead'])

    # Every curve is drawn against its own fitter's timeline. The two
    # simulated cohorts need not share event times, so pairing kmc's curve
    # with kmn's index would misalign it or fail on a length mismatch.
    plt.step(kmn.event_table.index, 1 - kmn.survival_function_, c='g', where='post', label='Natural')
    plt.step(kmc.event_table.index, 1 - kmc.survival_function_, c='orange', where='post', label='Censor')
    plt.step(kmo.event_table.index, 1 - kmo.survival_function_, c='k', where='post', label='True')
    plt.legend()
    plt.show()
Esempio n. 12
0
 def kmf(self):
     """Return a new, unfitted ``KaplanMeierFitter`` instance."""
     fitter = KaplanMeierFitter()
     return fitter
Esempio n. 13
0
 def compute_cens_surv(self):
     """Estimate the survival curve of the censoring process and merge it into ``self.df``.

     A Kaplan-Meier estimator is fitted on ``self.df['TIME']`` with the event
     indicator flipped (``1 - EVENT``). The resulting curve is stored in
     ``self.cens_surv`` (index named ``TIME``, column ``CENS_SURV``) and
     left-joined onto ``self.df`` by ``TIME``.
     """
     # Flipping the indicator makes the estimator treat censoring as the event.
     flipped_event = 1 - self.df['EVENT']
     estimator = KaplanMeierFitter()
     estimator.fit(self.df['TIME'], event_observed=flipped_event)

     curve = estimator.survival_function_.rename_axis('TIME')
     self.cens_surv = curve.rename(columns={'KM_estimate': 'CENS_SURV'})

     lookup = self.cens_surv.reset_index()
     self.df = self.df.merge(lookup, how='left', on='TIME')
Esempio n. 14
0
def kaplan_meier(
        file,
        model=None,
        cohorts=["UKDP"],
        event_type="biochemicalRecurrence",
        event_time="bcrTime",
        figsize=(9, 6),
):
    """Plot high- vs low-score Kaplan-Meier curves and test the separation.

    Samples are restricted to the requested cohorts, to rows with both the
    event and the time column present, and to rows with ``blacklisted == 0``.
    A per-sample score is split at its median into a "High Hazard" and a
    "Low Hazard" group.

    Parameters
    ----------
    file : str
        Path ending in ".tsv": a tab-separated score file indexed by an "ID"
        column with a "score" column (samples without a score are dropped).
        Any other path is handed to ``get_params``; its ``means`` must
        contain "logHR", "x_t" and "x_f".
    model : object, optional
        Object exposing a ``pheno`` DataFrame. ``mc_model()`` is called when
        omitted.
    cohorts : str or list of str
        Cohort abbreviation(s), matched case-insensitively against
        ``pheno["CohortAbb"]``.
    event_type : str
        Name of the event indicator column in ``pheno``.
    event_time : str
        Name of the time column in ``pheno``; divided by 365.25 for the plot.
    figsize : tuple
        Figure size forwarded to the first ``plot`` call.

    Returns
    -------
    tuple
        ``(figure, cph, logr)``: the object returned by the last
        Kaplan-Meier ``plot`` call, the fitted ``CoxPHFitter`` and the
        logrank test result.

    Raises
    ------
    TypeError
        If a parameter file has no "logHR" entry.
    """
    if isinstance(cohorts, str):
        cohorts = [cohorts]
    md = mc_model() if model is None else model
    pheno = md.pheno

    # Usable samples: outcome fully observed, not blacklisted, in a requested cohort.
    outcome_present = ~pheno.loc[:, [event_type, event_time]].isna().any(axis=1)
    valid = np.logical_and(outcome_present, pheno["blacklisted"] == 0)
    chs = pd.Series(cohorts).str.upper()
    in_cohort = pheno["CohortAbb"].str.upper().isin(chs)
    ind = np.logical_and(in_cohort, valid)
    mpheno = pheno.loc[ind, :].copy()

    if not file.endswith(".tsv"):
        pars = get_params(file)
        means = pars["means"]
        if "logHR" not in means:
            raise TypeError(
                "The parameter in the file do not seem to contain hazard "
                "prediction.")
        expressions = np.concatenate(
            [means["x_t"][ind, :], means["x_f"][ind, :]],
            axis=1,
        )
        score = np.dot(expressions, means["logHR"])[:, 0]
    else:
        # Score file: keep only the selected samples that actually have a score.
        score_df = pd.read_csv(file, delimiter="\t", index_col="ID")
        score = score_df.loc[mpheno.index, "score"]
        has_score = ~score.isna()
        ind[ind] = has_score
        score = score[has_score].values
        mpheno = pheno.loc[ind, :].copy()

    event = pheno.loc[ind, event_type].values
    time = pheno.loc[ind, event_time].values / 365.25

    # Median split: strictly above the median is the high group.
    g1 = score > np.median(score)
    g2 = ~g1

    # Kaplan-Meier plot
    kmfh = KaplanMeierFitter()
    kmfh.fit(time[g1], event[g1], label="High Hazard")
    figure = kmfh.plot(figsize=figsize)
    kmfl = KaplanMeierFitter()
    kmfl.fit(time[g2], event[g2], label="Low Hazard")
    figure = kmfl.plot(ax=figure)
    plt.xlabel("years")
    add_at_risk_counts(kmfh, kmfl, ax=figure)

    # Cox regression on the continuous score
    mpheno["score"] = score
    cph = CoxPHFitter()
    cph.fit(mpheno,
            duration_col=event_time,
            event_col=event_type,
            formula="score")

    # Logrank test between the two groups
    high_rows = mpheno.loc[g1]
    low_rows = mpheno.loc[g2]
    logr = statistics.logrank_test(
        high_rows[event_time],
        low_rows[event_time],
        high_rows[event_type],
        low_rows[event_type],
    )

    print("Cohorts: {}, event: {}, time: {}".format(cohorts, event_type,
                                                    event_time))
    print("Concordance: {:.2%}".format(cph.concordance_index_))
    print("Cox p-value: {}".format(cph.summary.loc["score", "p"]))
    print("Logrank p-value: {}".format(logr.p_value))

    return figure, cph, logr
Esempio n. 15
0
"""
Created on Mon May 18 20:32:10 2020
@author: DESHMUKH
SURVIVAL ANALYSIS
"""
# pip install lifelines
import pandas as pd
from lifelines import KaplanMeierFitter

# ==========================================================================================
# Business Problem - Perform Kaplan-Meier analysis for the given data and get the life table.
# ==========================================================================================

# Load the patient records and take a first look at them.
patient = pd.read_csv('Patient.csv')
patient.head()
patient.info()

# Descriptive statistics of the columns.
patient.describe()

# Kaplan-Meier estimator, labelled for the plot legend.
kmf = KaplanMeierFitter(label='FollowUps vs Event')

# Durations come from the Followup column, event indicators from Eventtype.
kmf.fit(durations=patient['Followup'], event_observed=patient['Eventtype'])

# Estimated survival curve over the timeline, drawn in green.
kmf.plot(color='g')

# ---------------------------------------------------- #
# Compare career-length survival curves of high- and low-scoring players.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
data = pd.DataFrame(data)
duration = data['span']
observed = data.loc[:, 'censor']

#kmf = KaplanMeierFitter()
#kmf.fit(duration,observed,label='kmf_mean')
#kmf.plot()
#plt.show()

## at least 50 innings played

# 'runs' may arrive as text; convert it before comparing against thresholds.
data['runs'] = pd.to_numeric(data['runs'])

# NOTE(review): the filters are inclusive (>= 8000, <= 3000) while the legend
# labels below read "> 8000" and "< 3000" - confirm which is intended.
runs8000 = data.loc[data['runs'] >= 8000]
runs3000 = data.loc[data['runs'] <= 3000]

kmfruns8000 = KaplanMeierFitter()
kmfruns8000.fit(runs8000['span'], runs8000['censor'], label=' runs > 8000')

kmfruns3000 = KaplanMeierFitter()
kmfruns3000.fit(runs3000['span'], runs3000['censor'], label=' runs < 3000')

# Both survival functions share one set of axes.
bx = plt.subplot(111)
kmfruns8000.survival_function_.plot(ax=bx)
kmfruns3000.survival_function_.plot(ax=bx)

plt.xlabel(" career length ( in years )")
plt.ylabel(" probability of players  ")
plt.title("probability of players with specific runs vs their career length")
plt.show()
Esempio n. 17
0
def test_qth_survival_time_accepts_a_model():
    """``utils.qth_survival_time`` accepts a fitted model as its second argument."""
    fitted_model = KaplanMeierFitter().fit([1.0, 0.7, 0.6])
    quantile_time = utils.qth_survival_time(0.8, fitted_model)
    assert quantile_time > 0
    def estimate_kaplan_meier(self):
        """Fit and plot one Kaplan-Meier curve per label and annotate the figure.

        For every distinct value of ``self.survival_label['label']`` a
        ``KaplanMeierFitter`` is fitted on that group's duration and observed
        columns and drawn on shared axes. At-risk counts are added, the
        result of ``multivariate_logrank_test`` is stored in
        ``self.test_statistic`` / ``self.p_value`` (the p-value is converted
        to a string), the combined survival curves are stored in
        ``self.survival_rate_result``, and per-label medians go into
        ``self.median_survival_time``. When ``self.CI`` is non-empty the
        hazard ratio and its interval are written on the plot as well.
        """
        # Take the label column of the survival_label DataFrame as a Series.
        labels = self.survival_label['label']
        ordered_labels = sorted(labels.unique())
        sfs = {}
        # One fitted estimator per label, kept for the at-risk table.
        fitter = []
        # Axes that receives every survival curve.
        ax = plt.subplot()

        for label in ordered_labels:
            data_label_index = list(
                set(labels[labels == label].index)
                & set(self.survival_label.index))
            group_rows = self.survival_label.loc[data_label_index]

            kmf = KaplanMeierFitter()
            kmf.fit(group_rows[self.duration_column],
                    group_rows[self.observed_column],
                    label=label)
            fitter.append(kmf)

            # Survival curve and median survival time for this label.
            sfs[label] = kmf.survival_function_
            self.median_survival_time[label] = kmf.median_

            ax = kmf.plot(ax=ax)

        # Number of subjects at risk over time.
        add_at_risk_counts(*fitter)

        # Log-rank test: is the survival difference between groups significant?
        self.test_statistic, self.p_value = multivariate_logrank_test(
            self.survival_label, labels)
        p_transform = bool(self.p_value == 0)
        if p_transform:
            self.p_value = '< 0.0001'
        else:
            self.p_value = str(self.p_value)

        # Survival rates of all groups side by side.
        curves = [sfs[key] for key in ordered_labels]
        self.survival_rate_result = pd.concat(curves, axis=1).interpolate()

        # Show the log-rank p-value on the plot. The "p <value>" wording is
        # only used when a CI is available and the p-value was replaced.
        has_ci = len(self.CI) > 0
        if has_ci and p_transform:
            p_text = 'log_rank p %s' % self.p_value
        else:
            p_text = 'log_rank p=%s' % self.p_value
        ax.text(0.35,
                0.8,
                p_text,
                transform=ax.transAxes,
                va='top',
                fontsize=12)
        if has_ci:
            hr_text = "HR=%.3f(95%% CI:%.3f-%.3f)" % (self.HR, self.CI[0],
                                                      self.CI[1])
            ax.text(0.35,
                    0.9,
                    hr_text,
                    transform=ax.transAxes,
                    va='top',
                    fontsize=12)

        plt.title('Full Data')
        print("Median survival time of data: %s" % self.median_survival_time)
        plt.show()
        dict(selector="td", props=[('padding', "0em 0em")]),
        dict(selector="th:hover", props=[("font-size", "12pt")]),
        dict(selector="tr:hover td:hover",
             props=[('max-width', '200px'), ('font-size', '12pt')])
    ]

# Styled correlation table: gradient background, compact cells, hover styles.
cell_properties = {'max-width': '80px', 'font-size': '10pt'}
(
    corr.style.background_gradient(cmap, axis=1)
    .set_properties(**cell_properties)
    .set_caption("Hover to magify")
    .set_precision(2)
    .set_table_styles(magnify())
)
plt.show()

# Fit the Kaplan-Meier model on marriage duration and divorce events.
kmf = KaplanMeierFitter()
kmf.fit(durations=data['Duration'], event_observed=data['Divorce'])

# Plain survival function, saved before it is shown.
survival_curve = kmf.survival_function_
survival_curve.plot(title='Marriage Survival Time in the U.S',
                    legend=False,
                    linewidth=3.0)
plt.savefig('/home/raed/Dropbox/INSE - 6320/Final Project/Survival.pdf')
plt.show()

# Same estimate drawn through the fitter's own plot, with censor marks.
kmf.plot(
    title='Survival Time Estimates of Mariages and its Confidence Intervals',
    legend=False,
    linewidth=3.0,
    show_censors=True)
#Export the figure
plt.savefig(
    '/home/raed/Dropbox/INSE - 6320/Final Project/Survival_ConfidenceInterval.pdf'
Esempio n. 20
0
def km_curve(labels_ids,
             survival_dataset,
             tested_gene_expression_headers_columns,
             gene_group,
             k=None,
             label_index=None):
    """Run logrank tests of every cluster against the rest and against earlier clusters.

    Parameters
    ----------
    labels_ids : list of list
        Sample ids per cluster. NOTE: this list is modified in place - tested
        ids that belong to no cluster are appended as one extra cluster.
    survival_dataset : numpy.ndarray
        2-D array indexed as column 0 = sample id, column 3 = duration,
        column 4 = event indicator (the latter two are cast to int32).
    tested_gene_expression_headers_columns : sequence
        Sample ids that take part in the analysis.
    gene_group, k, label_index
        Unused by the live code; they only appear in the commented-out
        figure-saving call.

    Returns
    -------
    float or None
        Logrank p-value of the LAST cluster versus all other tested samples,
        or ``None`` when ``labels_ids`` ends up empty.
    """
    # ax = plt.subplot(111)
    flatten_set = set(y for x in labels_ids for y in x)
    dif = set(tested_gene_expression_headers_columns).difference(flatten_set)
    if len(dif) > 0:
        labels_ids.append(list(dif))
    kmf = KaplanMeierFitter()
    all_labels = np.array([y for x in labels_ids for y in x])

    # Loop-invariant pieces of the selection masks.
    sample_ids = survival_dataset[:, 0]
    row_is_tested = np.in1d(sample_ids,
                            tested_gene_expression_headers_columns)
    label_is_tested = np.in1d(all_labels,
                              tested_gene_expression_headers_columns)

    label_event_list = []
    label_duration_list = []
    lr_results_global = None
    for i, cur_labels in enumerate(labels_ids):
        # Rows of the current cluster, restricted to tested samples.
        in_cluster = np.in1d(sample_ids, cur_labels) & row_is_tested
        label_event = survival_dataset[in_cluster, 4].astype(np.int32)
        label_duration = survival_dataset[in_cluster, 3].astype(np.int32)
        label_event_list.append(label_event)
        label_duration_list.append(label_duration)

        # Complement: tested samples assigned to any other cluster.
        labels_c = all_labels[~np.in1d(all_labels, cur_labels)
                              & label_is_tested]
        in_complement = np.in1d(sample_ids, labels_c)
        label_event_c = survival_dataset[in_complement, 4].astype(np.int32)
        label_duration_c = survival_dataset[in_complement,
                                            3].astype(np.int32)

        lr_results_global = logrank_test(label_duration,
                                         label_duration_c,
                                         label_event,
                                         label_event_c,
                                         alpha=.95).p_value
        if len(label_duration) != 0:
            kmf.fit(list(label_duration),
                    event_observed=list(label_event),
                    label="cluster {} n={}, logrank pval = {}".format(
                        i, len(label_duration),
                        '{0:1.3e}'.format(lr_results_global)))  # '%.7f' %
            # kmf.plot(ax=ax, show_censors=True)
            # Single-argument print(...) prints the same text on Python 2 and 3.
            print("lrank cluster {} vs all: {}".format(i, lr_results_global))
            # Pairwise tests against every cluster processed before this one
            # (the current cluster is the last entry of both lists).
            earlier = zip(label_duration_list[:-1], label_event_list[:-1])
            for j, (prev_duration, prev_event) in enumerate(earlier):
                lr_results = logrank_test(label_duration,
                                          prev_duration,
                                          label_event,
                                          prev_event,
                                          alpha=.95).p_value
                print("lrank cluster {} vs cluster {}: {}".format(
                    i, j, lr_results))
    # plt.ylim(0, 1);

    # plt.title("clustering survival analysis");
    # plt.savefig(os.path.join(constants.BASE_PROFILE,"output" ,"cluster_by_p_{}_{}_k={}_label_i={}_{}.png".format(constants.CANCER_TYPE, gene_group,k,label_index , time.time())))
    # plt.cla()
    return lr_results_global
Esempio n. 21
0
    med_50k = (dataset["Median household income inflation adj to 2018"] ==
               "$50,000 - $54,999")
    med_45k = (dataset["Median household income inflation adj to 2018"] ==
               "$45,000 - $49,999")
    med_40k = (dataset["Median household income inflation adj to 2018"] ==
               "$40,000 - $44,999")
    med_35k = (dataset["Median household income inflation adj to 2018"] ==
               "$35,000 - $39,999")
    med_35k_minus = (dataset["Median household income inflation adj to 2018"]
                     == "< $35,000")
    obs = dataset["SEER cause-specific death classification"]
    lb = LabelBinarizer()
    obs = lb.fit_transform(obs)
    durations = dataset['Survival months']

    kmf1 = KaplanMeierFitter()
    kmf1.fit(durations[med_75k_plus],
             event_observed=obs[med_75k_plus],
             label="75,000+")
    kmf1.plot(ax=ax)

    kmf2 = KaplanMeierFitter()
    kmf2.fit(durations[med_50k],
             event_observed=obs[med_50k],
             label="50,000-55,000")
    kmf2.plot(ax=ax)

    kmf3 = KaplanMeierFitter()
    kmf3.fit(durations[med_35k_minus],
             event_observed=obs[med_35k_minus],
             label="<35,000")
Esempio n. 22
0
# Attach company attributes to the records, matching on the company name.
companies_by_name = companies.set_index('company')
merged = data.set_index('company').join(companies_by_name)

# Only complete rows take part in the per-continent comparison.
data_not_empty = merged.dropna()

countries = ['NA', 'SA', 'AF', 'OC', 'EU', 'ME']

fig = plt.figure(figsize=(10, 12))
plt.subplots_adjust(hspace=0.4)

# One panel per continent on a 3x2 grid: that continent vs. everything else.
for panel, continent in enumerate(countries, start=1):
    on_continent = data_not_empty['continent'] == continent
    continent_data = data_not_empty[on_continent]
    other_data = data_not_empty[~on_continent]

    kmf_continent = KaplanMeierFitter()
    kmf_continent.fit(continent_data['duration'], continent_data['observed'])

    kmf_other = KaplanMeierFitter()
    kmf_other.fit(other_data['duration'], other_data['observed'])

    ax = fig.add_subplot(3, 2, panel)
    ax.set_title('{} vs other'.format(continent))
    kmf_continent.plot_loglogs(ax=ax, label=continent)
    kmf_other.plot_loglogs(ax=ax, label='other')

fig.show()

# Split by branch code; these subsets come from `merged`, not the NaN-free frame.
branch = merged['branch']
data_financial = merged[branch == 'F']
data_other = merged[branch == 'O']
Esempio n. 23
0
def plot2(df):
    """Plot Kaplan-Meier curves for three treatment-vs-water comparisons.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'group', 'time' and 'event'.

    Returns
    -------
    Whatever ``fig_to_uri(plt)`` returns for the resulting pyplot state.
    """
    survival = df

    # Each comparison pairs the 'water' group with one other group.
    comparisons = (
        ['water', 'Val1000'],
        ['water', 'Ryan'],
        ['water', 'BBG'],
    )

    for wanted_groups in comparisons:
        kmf = KaplanMeierFitter()
        # Every pass asks pyplot for subplot 111 again rather than opening a
        # new figure.
        ax = plt.subplot(111)

        subset = survival.loc[survival['group'].isin(wanted_groups)]
        for name, grouped_survival in subset.groupby('group'):
            kmf.fit(grouped_survival['time'],
                    grouped_survival['event'],
                    label=name)
            kmf.plot(ax=ax, ci_show=False, marker='o')

        plt.xlabel("days")
        plt.ylabel("survival proportion")
        plt.ylim(-0.05, 1.05)

    return fig_to_uri(plt)
Esempio n. 24
0
def plot(out,
         fontsize=12,
         savepath='',
         width=10,
         height=6,
         cmap='Set1',
         cii_alpha=0.05,
         cii_lines='dense',
         methodtype='lifeline',
         title='Survival function',
         full_ylim=False,
         y_percentage=False):
    """Plot Kaplan-Meier survival curves per class label.

    Parameters
    ----------
    out : dict
        Results from the fit function. The keys read here are 'labx',
        'uilabx', 'time_event', 'censoring', 'logrank' and 'logrank_P'.
    fontsize : int, optional
        Font size for the graph. The default is 12.
    savepath : String, optional
        Path to store the figure (only used when methodtype='lifeline').
        The default is '' (do not save).
    width : int, optional
        Width of the figure. The default is 10.
    height : int, optional
        Height of the figure. The default is 6.
    cmap : String, optional
        Specify your own colors for each class-label or use a colormap:
        https://matplotlib.org/examples/color/colormaps_reference.html.
        The default is 'Set1'.
        [(1, 0, 0),(0, 0, 1),(..)]
        'Set1'       (default)
        'Set2'       Discrete colors
        'Pastel1'    Discrete colors
        'Paired'     Discrete colors
        'rainbow'
        'bwr'        Blue-white-red
        'binary' or 'binary_r'
        'seismic'    Blue-white-red
        'Blues'      white-to-blue
        'Reds'       white-to-red
    cii_alpha : float, optional
        Confidence interval level (only used when methodtype='lifeline').
        The fitter receives ``alpha = 1 - cii_alpha``. The default is 0.05.
        None disables the confidence lines.
    cii_lines : String, optional
        Confidence interval style (only used when methodtype='lifeline').
        The default is 'dense'.
        'dense'   (dense/filled lines)
        'line'
         None or '' (no lines; cii_alpha is reset to 0)
    methodtype : String, optional
        Implementation type. The default is 'lifeline'.
        'lifeline' (default)
        'custom'
    title : String, optional
        Title prefix; it is only drawn, together with the logrank p-value,
        when ``out['logrank']`` is not an empty list. The default is
        'Survival function'.
    full_ylim : bool, optional
        Fix the y-axis to [0.0, 1.05] (methodtype='lifeline' only).
        The default is False.
    y_percentage : bool, optional
        Format the y-axis ticks as percentages (methodtype='lifeline' only).
        The default is False.

    Returns
    -------
    None.

    """
    KMcoord = {}
    Param = {}
    Param['width'] = width
    Param['height'] = height
    Param['fontsize'] = fontsize
    Param['savepath'] = savepath
    labx = out['labx']

    # Combine data and gather class labels
    data = np.vstack((out['time_event'], out['censoring'])).T

    # Make colors and legend-names for class-labels
    [class_colors, classlabel] = make_class_color_names(data,
                                                        out['labx'],
                                                        out['uilabx'],
                                                        cmap=cmap)

    if methodtype == 'lifeline':
        # Fitted estimators, one per class, for the at-risk table.
        kmf_all = []

        # Startup figure
        fig = plt.figure(figsize=(Param['width'], Param['height']))
        ax = fig.add_subplot(111)
        if full_ylim:
            ax.set_ylim([0.0, 1.05])
        if y_percentage:
            ax.yaxis.set_major_formatter(PercentFormatter(1.0))
        if out['logrank'] != []:
            plt.title('%s, Logrank Test P-Value = %.5f' %
                      (title, out['logrank_P']))

        # Translate the cii_lines option into the boolean that the fitter's
        # plot call expects for ci_force_lines.
        if cii_lines == 'dense':
            cii_lines = False
        if cii_lines == 'line':
            cii_lines = True
        if cii_lines == '' or cii_lines is None or cii_alpha is None:
            cii_lines = False
            cii_alpha = 0

        # NOTE(review): whether `alpha` means the confidence level or the
        # significance level depends on the installed lifelines version -
        # confirm that 1 - cii_alpha is what the target version expects.
        for i in range(0, len(out['uilabx'])):
            kmf = KaplanMeierFitter()
            idx = np.where(labx == out['uilabx'][i])[0]
            # Fit
            kmf.fit(out['time_event'][idx],
                    event_observed=out['censoring'][idx],
                    label=classlabel[i],
                    ci_labels=None,
                    alpha=(1 - cii_alpha))
            # Plot
            kmf.plot(ax=ax,
                     ci_force_lines=cii_lines,
                     color=class_colors[i],
                     show_censors=True)
            # Store the already-fitted estimator; fit() returns the fitter
            # itself, so fitting the same data a second time adds nothing.
            kmf_all.append(kmf)

        add_at_risk_counts(*kmf_all, ax=ax)

        ax.tick_params(axis='x',
                       length=15,
                       width=1,
                       direction='out',
                       labelsize=Param['fontsize'])
        ax.tick_params(axis='y',
                       length=15,
                       width=1,
                       direction='out',
                       labelsize=Param['fontsize'])
        ax.spines['bottom'].set_position(['outward', Param['fontsize']])
        ax.spines['left'].set_position(['outward', Param['fontsize']])
        #    ax.rc('font', size= Param['fontsize'])   # controls default text sizes
        #    ax.rc('axes',  labelsize = Param['fontsize'])  # fontsize of the x and y labels

        if Param['savepath'] != '':
            savefig(fig, Param['savepath'])

    if methodtype == 'custom':
        # Compute KM survival coordinates per class
        for i in range(0, len(out['uilabx'])):
            idx = np.where(labx == out['uilabx'][i])[0]
            tmpdata = data[idx, :].tolist()
            KMcoord[i] = compute_coord(tmpdata)

        # Plot KM survival lines
        plotkm(KMcoord,
               classlabel,
               cmap=class_colors,
               width=Param['width'],
               height=Param['height'],
               fontsize=Param['fontsize'])
Esempio n. 25
0
# Grouped curves first: `km.fit` takes the durations, the censoring vector and
# the group labels, and its result is handed straight to `km.plot`.
out = km.fit(time_event, censoring, labx) # Direct grouped lines
km.plot(out)

# %% [markdown]
# # Kaplan-Meier curve using _lifelines_
#
# We can have greater control over the KM curve using the _lifelines_ package. This package follows the
# coding style of packages like _scikit-learn_ in that we first start a fitting object, then we
# fit the model to the data, and then plot it.
#
# This plotting uses a matplotlib backend.
# %% lifelines

from lifelines import KaplanMeierFitter 

kmf = KaplanMeierFitter() # Kaplan Meier fitting object
# NOTE(review): `censoring` is passed as `event_observed`, so it is presumably
# coded 1 = event observed - confirm against where it is built.
kmf.fit(time_event, event_observed=censoring) # Fit model to data

kmf.plot(at_risk_counts=False) # No table at first
plt.title('Kaplan-Meier Curve')
plt.show();

# %% [markdown]
# # Kaplan-Meier curve using _lifelines_
#
# We can clean this curve up a bit.

# %% Kaplan-Meier via lifelines
# The plot call returns the axes object, which is used to set the labels.
ax = kmf.plot()
ax.set_xlabel('days')
ax.set_ylabel('Probability of survival')
import os
import sys
import numpy as np
from select_common import select_segments_broad, read_annotations
from lifelines import KaplanMeierFitter
import pickle

# Data lives in ../data-new relative to the first entry of sys.path.
data_root = os.path.join(sys.path[0], '../data-new/')
data_dir = os.path.normpath(data_root)
out_file = os.path.join(data_dir, 'interim', 'survival-model.pickle')

annotations = read_annotations(data_dir)
durations = []
observeds = []

for annotation in annotations.values():
    _, segments = select_segments_broad(annotation)
    # Every section except the last is recorded as an observed event ...
    for section in segments[:-1]:
        durations.extend(section)
        observeds.extend([True] * len(section))
    # ... while the final section is recorded as not observed.
    final_section = segments[-1]
    durations.extend(final_section)
    observeds.extend([False] * len(final_section))

# Fit on the pooled durations and store the fitted model as a pickle.
km = KaplanMeierFitter()
km = km.fit(durations, event_observed=observeds)
with open(out_file, 'wb') as out_handle:
    pickle.dump(km, out_handle)
Esempio n. 27
0
#### Edit this list to change which seasons the model is trained on ####
trainingSeasons = [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]

in_training = allSeasonsByEpisode["Season"].isin(trainingSeasons)
trainData = allSeasonsByEpisode.loc[in_training]

# One row per ID: first recorded age, latest End value, and whether the Out
# column sums to exactly 1. Built from all seasons, not from trainData.
by_id = allSeasonsByEpisode.groupby(['ID'])
kmfdata = pd.DataFrame({
    'Age': by_id.Age.first(),
    'Duration': by_id.End.max(),
    'Observed': by_id.Out.sum() == 1,
})

all_durations = kmfdata["Duration"]
all_observed = kmfdata["Observed"]

kmf = KaplanMeierFitter()
kmf.fit(all_durations, event_observed=all_observed)

# Segment the survival curve based on age group
ax = plt.subplot(111)
age0 = kmfdata["Age"] < 30
age1 = (kmfdata["Age"] < 40) & (kmfdata["Age"] >= 30)
age2 = kmfdata["Age"] >= 40

# The same fitter is refitted per age group and drawn on the shared axes.
for age_mask, age_label in ((age0, "Age < 30"), (age1, " 30 <= Age < 40")):
    kmf.fit(all_durations[age_mask],
            event_observed=all_observed[age_mask],
            label=age_label)
    kmf.plot(ax=ax)
Esempio n. 28
0
def run_kaplan_meier(run_parameters):
    """ save the lifelines kaplan-meier graphical analysis and p-value to two files

    Args:
        run_parameters: with keys:
                        results_directory
                        phenotype_file_name (containing the following column names)
                        cluster_id
                        event_id
                        time_id

    Returns:
        None.
        Writes:         two files named after the phenotype file and "kaplan_meier"
                        "png" image (saved at dpi=100) of the lifelines
                        kaplan-meier curves, one per cluster
                        one cell dataframe with the p-value of the multivariate logrank test
    """
    results_directory = run_parameters['results_directory']
    phenotype_file_name = run_parameters['phenotype_file_name']
    cluster_id = run_parameters['cluster_id']
    event_id = run_parameters['event_id']
    time_id = run_parameters['time_id']

    phenotype_df = kn.get_spreadsheet_df(phenotype_file_name)

    T = phenotype_df[time_id]
    C = phenotype_df[event_id]

    results = multivariate_logrank_test(T,
                                        phenotype_df[cluster_id],
                                        C,
                                        alpha=0.99)
    # The p-value is stored as text in '%g' format.
    p_value = str('%g' % (results.p_value))
    test_name = 'multivariate_logrank_test'

    Clusters = sorted(phenotype_df[cluster_id].unique())
    num_clusters = len(Clusters)

    plt.clf()
    ax = plt.subplot(111)

    kmf = KaplanMeierFitter()
    for cluster in Clusters:
        ixc = phenotype_df[cluster_id] == cluster
        # Boolean-mask selection through .loc; the .ix indexer used here
        # before was removed in pandas 1.0.
        # `cluster + 1` requires numeric cluster ids (legend is 1-based).
        kmf.fit(T.loc[ixc], C.loc[ixc], label=cluster + 1)
        kmf.plot(ax=ax, show_censors=True, ci_show=False)

    plt.title('number of clusters = %s' % (num_clusters))
    plt.xlabel('Time (days)')
    plt.ylabel('OS')

    transform_name = "kaplan_meier"
    kaplan_meier_spreadsheet_df = pd.DataFrame(data=p_value,
                                               index=[test_name],
                                               columns=['p_value'])

    write_transform_df(kaplan_meier_spreadsheet_df, phenotype_file_name,
                       transform_name + '_p_value', results_directory)
    result_name = get_outfile_name(results_directory,
                                   phenotype_file_name,
                                   transform_name + '_graphic',
                                   file_ext='png')
    plt.savefig(result_name, dpi=100)
Esempio n. 29
0
def multivariate_logrank_test(
        event_durations,
        groups,
        event_observed=None,
        t_0=-1,
        weightings=None,
        **kwargs) -> StatisticalResult:  # pylint: disable=too-many-locals
    r"""
    Logrank test for any number of groups. It generalizes ``logrank_test`` to n>2
    populations (and should agree with it when n=2):

    .. math::
        \begin{align}
         & H_0: h_1(t) = h_2(t) = h_3(t) = ... = h_n(t) \\
         & H_A: \text{at least one group differs from the others.}
        \end{align}


    Parameters
    ----------

    event_durations: iterable
        a (n,) list-like holding the (possibly partial) duration of every individual

    groups: iterable
        a (n,) list-like holding the group label of every individual.

    event_observed: iterable, optional
        a (n,) list-like of event flags: 1 if the death was observed, 0 if censored.
        When omitted, every event is treated as observed.

    t_0: float, optional (default=-1)
        the period under observation, -1 for all time.

    weightings: str, optional
        selects a weighted variant of the test: "wilcoxon" (Wilcoxon, also known as Breslow),
        "tarone-ware" (Tarone-Ware), "peto" (Peto) or "fleming-harrington" (Fleming-Harrington).
        Weighted variants are useful for detecting early or late differences between survival
        curves. The Fleming-Harrington variant additionally requires the keyword arguments
        p and q, both non-negative. Any other value raises ``ValueError``.

        The weight applied at the ith ordered failure time, :math:`t_{i}`, is:
            Wilcoxon: :math:`n_i`
            Tarone-Ware: :math:`\sqrt{n_i}`
            Peto: :math:`\bar{S}(t_i)`
            Fleming-Harrington: :math:`\hat{S}(t_i)^p \times (1 - \hat{S}(t_i))^q`

            where :math:`n_i` is the number at risk just prior to time :math:`t_{i}`, :math:`\bar{S}(t_i)` is
            Peto-Peto's modified survival estimate and :math:`\hat{S}(t_i)` is the left-continuous
            Kaplan-Meier survival estimate at time :math:`t_{i}`.

    kwargs:
        extra keywords and meta-data forwarded to the experiment summary.


    Returns
    -------

    StatisticalResult
       a StatisticalResult object with properties ``p_value``, ``summary``, ``test_statistic``, ``print_summary``

    Examples
    --------

    .. code:: python

        df = pd.DataFrame({
           'durations': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
           'events': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
           'groups': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        })
        result = multivariate_logrank_test(df['durations'], df['groups'], df['events'])
        result.test_statistic
        result.p_value
        result.print_summary()


        # numpy example
        G = [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        T = [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7]
        E = [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]
        result = multivariate_logrank_test(T, G, E)
        result.test_statistic


    See Also
    --------
    pairwise_logrank_test
    logrank_test
    """
    kwargs.setdefault("test_name", "multivariate_logrank_test")

    event_durations = np.asarray(event_durations)
    groups = np.asarray(groups)
    event_observed = (np.ones((event_durations.shape[0], 1))
                      if event_observed is None
                      else np.asarray(event_observed))

    n = np.max(event_durations.shape)
    assert np.max(event_observed.shape) == n, \
        "inputs must be of the same length."

    # Flatten every input to a length-n Series.
    groups, event_durations, event_observed = (
        pd.Series(np.asarray(values).reshape(n))
        for values in (groups, event_durations, event_observed))

    unique_groups, removed, observed, _ = group_survival_table_from_events(
        groups, event_durations, event_observed, limit=t_0)
    n_groups = unique_groups.shape[0]

    # Per-time building blocks: at-risk counts (per group and pooled),
    # observed deaths, and expected deaths per group.
    at_risk_by_group = (removed.sum(0).values -
                        removed.cumsum(0).shift(1).fillna(0))
    deaths = observed.sum(1)
    at_risk = removed.values.sum() - removed.sum(1).cumsum().shift(1).fillna(0)
    expected_by_time = at_risk_by_group.mul(deaths / at_risk, axis="index")

    # Choose the per-time weights for the requested logrank variant.
    if weightings is None:
        weights = np.ones(deaths.shape[0])
    elif weightings == "wilcoxon":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Wilcoxon")
        weights = at_risk
    elif weightings == "tarone-ware":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Tarone-Ware")
        weights = np.sqrt(at_risk)
    elif weightings == "peto":
        kwargs["test_name"] = kwargs["test_name"].replace("logrank", "Peto")
        # Peto-Peto's modified survival estimates.
        step = expected_by_time.sum(1) / (at_risk + 1)
        weights = np.cumprod(1.0 - step)
    elif weightings == "fleming-harrington":
        if "p" not in kwargs:
            raise ValueError(
                "Must provide keyword argument p for Flemington-Harrington test statistic"
            )
        p = kwargs["p"]
        if p < 0:
            raise ValueError("p must be non-negative.")
        if "q" not in kwargs:
            raise ValueError(
                "Must provide keyword argument q for Flemington-Harrington test statistic"
            )
        q = kwargs["q"]
        if q < 0:
            raise ValueError("q must be non-negative.")
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Flemington-Harrington")
        pooled_fit = KaplanMeierFitter().fit(event_durations,
                                             event_observed=event_observed)
        # Dropping the last value gives the left-continuous Kaplan-Meier
        # survival estimate.
        survival = pooled_fit.survival_function_.to_numpy().flatten()[:-1]
        weights = np.power(survival, p) * np.power(1.0 - survival, q)
    else:
        raise ValueError("Invalid value for weightings.")

    # Weighted observed and expected totals per group, and their difference.
    observed_by_group = observed.mul(weights, axis=0).sum(0).values
    expected_by_group = expected_by_time.mul(weights, axis=0).sum(0)
    Z_j = observed_by_group - expected_by_group

    # this should move to a test eventually.
    assert abs(Z_j.sum()) < 10e-8, "Sum is not zero."

    # Covariance matrix of the weighted statistic.
    correction = ((at_risk - deaths) /
                  (at_risk - 1)).replace([np.inf, np.nan], 1)
    factor = correction * deaths / at_risk**2
    # Append the pooled at-risk counts as an extra column; its row of the
    # product below is what gets subtracted from the diagonal.
    at_risk_by_group["_"] = at_risk.values
    weighted = at_risk_by_group.mul(weights, axis=0)
    V_ = weighted.mul(np.sqrt(factor), axis="index").fillna(0)
    V = -np.dot(V_.T, V_)
    diag = np.arange(n_groups)
    V[diag, diag] = V[diag, diag] - V[-1, diag]
    V = V[:-1, :-1]

    # Quadratic form Z.T*inv(V)*Z over the first n-1 groups.
    z_head = Z_j.iloc[:-1]
    U = z_head @ np.linalg.pinv(V[:-1, :-1]) @ z_head

    # compute the p-values and tests
    p_value = _chisq_test_p_value(U, n_groups - 1)
    return StatisticalResult(p_value,
                             U,
                             t_0=t_0,
                             null_distribution="chi squared",
                             degrees_of_freedom=n_groups - 1,
                             **kwargs)
Esempio n. 30
0
    def plot_survival(self, show_timeline=False):
        """Plot Kaplan-Meier curves and survival-time histograms.

        Requests a sample (``sample_size=100000``) from the parent class's
        ``load_data``, recodes the RADIATN, ERSTATUS and PRSTATUS columns,
        and then shows, in order:

        1. A five-panel figure of Kaplan-Meier curves split by radiation,
           ER status, PR status, stage (HST_STGA 0, 1, 2 and 4) and age at
           diagnosis (< 50 vs >= 50). Survival time is SRV_TIME_MON
           converted to years; a row counts as an observed event when
           ``STAT_REC == 4`` and as censored otherwise.
        2. A two-panel histogram of survival time in years, one panel for
           the non-censored rows and one for the censored rows.
        3. Only when ``show_timeline`` is true: one horizontal line per row,
           with length equal to its survival time in months, uncensored
           rows in blue and censored rows in red.

        Parameters
        ----------
        show_timeline: bool, optional (default=False)
            Also draw the per-row timeline figure described in item 3. It
            draws one line per sampled row, so it is opt-in.

        Returns
        -------
        None
        """
        import warnings

        df = super().load_data(
            col=[
                'YR_BRTH', 'AGE_DX', 'LATERAL', 'RADIATN', 'HISTREC',
                'ERSTATUS', 'PRSTATUS', 'BEHANAL', 'HST_STGA', 'NUMPRIMS',
                'SRV_TIME_MON', 'SRV_TIME_MON_PA', 'DTH_CLASS', 'O_DTH_CLASS',
                'STAT_REC'
            ],
            cond=
            'SRV_TIME_MON < 1000 AND HST_STGA < 8 AND DTH_CLASS < 9 AND ERSTATUS < 4 AND PRSTATUS < 4',
            sample_size=100000)

        kmf = KaplanMeierFitter()

        # Best-effort clean-up: fold RADIATN code 7 into 0, then drop codes
        # above 7. A failure is reported but does not abort the plot.
        try:
            df.RADIATN = df.RADIATN.replace(7, 0)
            df = df[df.RADIATN < 7]
        except Exception as err:
            warnings.warn('RADIATN clean-up skipped: %s' % (err,),
                          stacklevel=2)

        # Drop codes 4 and 9, then recode the rest so that
        # 0-negative, 1-borderline, 2-positive.
        # A dict replace is applied simultaneously, so 1 -> 2 is not then
        # turned into 0 by the 2 -> 0 rule.
        status_recode = {2: 0, 1: 2, 3: 1}
        for status_col in ('ERSTATUS', 'PRSTATUS'):
            # .copy() so the assignment below writes to an independent frame
            # rather than to a slice of the previous one.
            df = df[~df[status_col].isin([4, 9])].copy()
            df[status_col] = df[status_col].replace(status_recode)

        rad = df.RADIATN > 0
        er = df.ERSTATUS > 0
        pr = df.PRSTATUS > 0
        age = df.AGE_DX < 50

        df['SRV_TIME_YR'] = df['SRV_TIME_MON'] / 12
        T = df['SRV_TIME_YR']
        # Event indicator: True where the event was observed.
        C = df.STAT_REC == 4

        # One entry per subplot; each entry lists the (row mask, legend label)
        # pairs whose curves share that subplot.
        panels = [
            [(rad, "Radiation"), (~rad, "No Radiation")],
            [(er, "ER Positive"), (~er, "ER Negative")],
            [(pr, "PR Positive"), (~pr, "PR Negative")],
            [(df.HST_STGA == stage, "Stage %d" % stage)
             for stage in (0, 1, 2, 4)],
            [(age, "Age < 50"), (~age, "Age >= 50")],
        ]

        f, ax = plt.subplots(len(panels), sharex=True, sharey=True)
        ax[0].set_title("Lifespans of cancer patients")

        for panel_ax, curves in zip(ax, panels):
            for mask, label in curves:
                kmf.fit(T[mask], event_observed=C[mask], label=label)
                kmf.plot(ax=panel_ax)
            panel_ax.legend(loc=3, prop={'size': 10})

        ax[-1].set_xlabel('Survival in years')

        f.text(0.04, 0.5, 'Survival %', va='center', rotation='vertical')
        plt.tight_layout()

        plt.ylim(0, 1)
        plt.show()

        # Histograms of survival time. Grouping by ``STAT_REC != 4`` puts the
        # False group (observed events) on the first axis and the True group
        # (censored) on the second.
        hist_fig, hist_ax = plt.subplots(2, sharex=True, sharey=True)

        df.hist('SRV_TIME_YR',
                by=df.STAT_REC != 4,
                ax=(hist_ax[0], hist_ax[1]))
        hist_ax[0].set_title('Histogram of Non Censored Patients')
        hist_ax[0].set_ylabel('Number of Patients')

        hist_ax[1].set_ylabel('Number of Patients')
        hist_ax[1].set_title('Histogram of Censored Patients')
        hist_ax[1].set_xlabel('Survival in Years')
        plt.show()

        if not show_timeline:
            return

        # Optional per-row timeline: uncensored rows first, then censored,
        # each sorted by survival time in months.
        timeline_fig, timeline_ax = plt.subplots(figsize=(8, 6))

        censored = df[df.STAT_REC != 4].SRV_TIME_MON.sort_values()
        uncensored = df[df.STAT_REC == 4].SRV_TIME_MON.sort_values()
        n_uncensored = len(uncensored)

        timeline_ax.hlines(list(range(n_uncensored)),
                           0,
                           uncensored,
                           color='b',
                           label='Uncensored')
        timeline_ax.hlines(list(range(n_uncensored,
                                      n_uncensored + len(censored))),
                           0,
                           censored,
                           color='r',
                           label='Censored')

        timeline_ax.set_xlim(left=0)
        timeline_ax.set_xlabel('Months')
        timeline_ax.set_ylim(-0.25, len(df) + 0.25)
        timeline_ax.legend(loc='best')
        plt.show()