# Partial hazards for the held-out patients (the result is not stored).
cph.predict_partial_hazard(test_X)

# One predicted survival curve per test row: columns are subjects, the index
# is time.
survival_result = cph.predict_survival_function(test_X)
# Mask everything above 0.5 (becomes NaN) so only the tail of each curve at or
# below 50% survival remains.
survival_result = survival_result[survival_result <= 0.5]

# Predicted length of stay per subject: the time at which the masked curve is
# largest.  For a non-increasing survival curve that is the first time the
# curve reaches 0.5 or less.
# The table is sized from the number of predicted curves instead of a
# hard-coded 677 rows, so any test-set size works.
predicted_los = [survival_result[c].idxmax() for c in survival_result.columns]
LOSResult = pd.DataFrame({
    'Id': np.arange(len(predicted_los)),
    'LOS': predicted_los,
})

# Join predictions back onto the test frame by row position.
# NOTE(review): this assumes `test` has a 0..n-1 integer index - confirm.
test['Id'] = test.index
test1 = pd.merge(test, LOSResult, on='Id')

fig, ax = plt.subplots(figsize=(12, 12))
from lifelines import KaplanMeierFitter

# Kaplan-Meier curve of the observed stay ('术后住院时间' column) ...
kmf_control = KaplanMeierFitter()
ax = kmf_control.fit(test1['术后住院时间'], label='Real').plot(ax=ax, color='#C32B4A')

# ... against the Kaplan-Meier curve of the predicted stay, on the same axes.
kmf_exp = KaplanMeierFitter()
ax = kmf_exp.fit(test1['LOS'], label='Predicted').plot(ax=ax, color='#3F76B4')

font2 = {
    'family': 'Times New Roman',
    'weight': 'normal',
    'size': 28,
}
plt.xlabel('Postoperative hospital stay (days)', font2)
plt.ylabel('Percent hospitalized', font2)

# Detach the left/bottom spines slightly and hide the top/right ones.
ax.spines['left'].set_position(('outward', 0.2))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_position(('outward', 0.2))
def qq_plot(model, ax=None, **plot_kwargs):
    """
    Draw a quantile-quantile plot of empirical (Kaplan-Meier) quantiles against
    the quantiles of the fitted parametric distribution.

    Points far from the dotted line y=x suggest a poor fit (some deviation in
    the tails is expected).

    Parameters
    -----------
    model: obj
        A fitted lifelines univariate parametric model, like ``WeibullFitter``
    ax:
        The axes to draw on. Defaults to ``plt.gca()``.
    plot_kwargs:
        Accepted for interface compatibility; not used by this implementation.

    Returns
    --------
    ax:
        The axes which was used.

    Raises
    ------
    NotImplementedError
        If the model was fitted on interval-censored data.

    Examples
    ---------
    >>> from lifelines import *
    >>> from lifelines.plotting import qq_plot
    >>> from lifelines.datasets import load_rossi
    >>> df = load_rossi()
    >>> wf = WeibullFitter().fit(df['week'], df['arrest'])
    >>> qq_plot(wf)
    """
    from lifelines.utils import qth_survival_times
    from lifelines import KaplanMeierFitter

    if ax is None:
        ax = plt.gca()

    dist = get_distribution_name_of_lifelines_model(model)
    dist_object = create_scipy_stats_model_from_lifelines_model(model)

    empirical_col = "empirical quantiles"
    fitted_col = "fitted %s quantiles" % dist

    # Non-parametric reference curve, fitted with the same censoring scheme
    # as the parametric model.
    if CensoringType.is_left_censoring(model):
        km = KaplanMeierFitter().fit_left_censoring(
            model.durations, model.event_observed, label=empirical_col)
    elif CensoringType.is_right_censoring(model):
        km = KaplanMeierFitter().fit_right_censoring(
            model.durations, model.event_observed, label=empirical_col)
    elif CensoringType.is_interval_censoring(model):
        raise NotImplementedError("lifelines does not have a non-parametric interval model yet.")

    # Probabilities at which both sets of quantiles are evaluated.
    probs = np.unique(km.cumulative_density_.values[:, 0])

    # this is equivalent to the old code `qth_survival_times(q, kmf.cumulative_density, cdf=True)`
    table = qth_survival_times(1 - probs, km.survival_function_)
    table[fitted_col] = dist_object.ppf(probs)
    # Drop rows where either quantile is infinite or zero.
    table = table.replace([-np.inf, 0, np.inf], np.nan).dropna()

    empirical = table[empirical_col]
    upper, lower = empirical.max(), empirical.min()

    table.plot.scatter(fitted_col, empirical_col, c="none", edgecolor="k", lw=0.5, ax=ax)
    # Reference diagonal y = x over the empirical range.
    ax.plot([lower, upper], [lower, upper], c="k", ls=":", lw=1.0)
    ax.set_ylim(lower, upper)
    ax.set_xlim(lower, upper)
    return ax
def survival_difference_at_fixed_point_in_time_test(
        point_in_time, durations_A, durations_B, event_observed_A=None, event_observed_B=None,
        **kwargs) -> StatisticalResult:
    """
    Compare the survival of two groups at one specific point in time.

    Analysts are often interested in survival at a given time (for example,
    5-year survival) rather than in the whole survival curve. Comparing the
    naive Kaplan-Meier estimates at that time has reduced power (see [1]);
    transforming the Kaplan-Meier curve recovers some of it. This function uses
    the log(-log) transformation.

    Parameters
    ----------
    point_in_time: float,
        the point in time to analyze the survival curves at.
    durations_A: iterable
        a (n,) list-like of event durations (birth to death,...) for the first population.
    durations_B: iterable
        a (n,) list-like of event durations (birth to death,...) for the second population.
    event_observed_A: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the first population.
        Default assumes all observed.
    event_observed_B: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the second population.
        Default assumes all observed.
    kwargs:
        add keywords and meta-data to the experiment summary

    Returns
    -------
    StatisticalResult
        a StatisticalResult object with properties ``p_value``, ``summary``,
        ``test_statistic``, ``print_summary``

    Examples
    --------
    .. code:: python

        T1 = [1, 4, 10, 12, 12, 3, 5.4]
        E1 = [1, 0, 1, 0, 1, 1, 1]

        T2 = [4, 5, 7, 11, 14, 20, 8, 8]
        E2 = [1, 1, 1, 1, 1, 1, 1, 1]

        from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
        results = survival_difference_at_fixed_point_in_time_test(12, T1, T2, event_observed_A=E1, event_observed_B=E2)

        results.print_summary()
        print(results.p_value)        # 0.893
        print(results.test_statistic) # 0.017

    Notes
    -----
    Other transformations are possible, but Klein et al. [1] showed that the
    log(-log(c)) transform has the most desirable statistical properties.

    References
    -----------
    [1] Klein, J. P., Logan, B. , Harhoff, M. and Andersen, P. K. (2007),
    Analyzing survival curves at a fixed point in time. Statist. Med.,
    26: 4505-4519. doi:10.1002/sim.2864
    """
    fitter_a = KaplanMeierFitter().fit(durations_A, event_observed=event_observed_A)
    fitter_b = KaplanMeierFitter().fit(durations_B, event_observed=event_observed_B)

    surv_a = fitter_a.predict(point_in_time)
    surv_b = fitter_b.predict(point_in_time)

    # this is doing a prediction/interpolation between the kmf's index.
    var_a = interpolate_at_times_and_return_pandas(fitter_a._cumulative_sq_, point_in_time)
    var_b = interpolate_at_times_and_return_pandas(fitter_b._cumulative_sq_, point_in_time)

    log_surv_a = np.log(surv_a)
    log_surv_b = np.log(surv_b)

    # Squared difference of the log(-log) transformed survival estimates,
    # scaled by the two variance terms.
    numerator = (np.log(-log_surv_a) - np.log(-log_surv_b))**2
    denominator = var_a / log_surv_a**2 + var_b / log_surv_b**2
    statistic = numerator / denominator

    p_value = _chisq_test_p_value(statistic, 1)

    return StatisticalResult(
        p_value,
        statistic,
        null_distribution="chi squared",
        degrees_of_freedom=1,
        point_in_time=point_in_time,
        test_name="survival_difference_at_fixed_point_in_time_test",
        **kwargs)
def plot3(df):
    """Print pairwise log-rank tests and plot Kaplan-Meier curves from a count table.

    The previous body could not run to completion: it called
    ``df.set_index('time')`` twice (the second call raises ``KeyError``) and its
    grouping loop unpacked ``T, E, W.groupby('group')`` and referenced an
    undefined ``grouped_survival``. This version indexes the table once and
    expands the death/censored counts per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read below: ``time``, ``group`` and ``event``
        (used for the log-rank tests) and ``death`` and ``censored`` (count
        columns expanded with ``survival_events_from_table``).

    Returns
    -------
    Whatever ``fig_to_uri(plt)`` returns for the finished figure.
    """
    import matplotlib.pyplot as plt
    from lifelines import KaplanMeierFitter
    from lifelines.statistics import pairwise_logrank_test
    from lifelines.utils import survival_events_from_table

    survival = df
    results = pairwise_logrank_test(survival['time'], survival['group'], survival['event'])
    results.print_summary()

    # Index by time exactly once; both the pooled and the per-group curves are
    # derived from this table.
    table = df.set_index('time')
    ax = plt.subplot(111)

    # Pooled curve over all rows.
    # This util converts a table with "death" and "censored" (alive) counts
    # into the durations / events / weights format lifelines expects.
    T, E, W = survival_events_from_table(table, observed_deaths_col='death', censored_col='censored')
    kmf = KaplanMeierFitter()
    kmf.fit(T, E, weights=W)
    kmf.plot(ax=ax, ci_show=True, marker='o')

    # One curve per group (treatment), drawn on the same axes.
    # NOTE(review): assumes the intent was "group, then expand counts" -
    # confirm against the expected figure.
    for name, group_table in table.groupby('group'):
        T_group, E_group, W_group = survival_events_from_table(
            group_table, observed_deaths_col='death', censored_col='censored')
        if len(T_group) == 0:
            # Nothing to fit for a group with neither deaths nor censoring.
            continue
        kmf = KaplanMeierFitter()
        kmf.fit(T_group, E_group, weights=W_group, label=name)
        kmf.plot(ax=ax, ci_show=False, marker='o')

    plt.xlabel("days")
    plt.ylabel("survival %")
    plt.ylim(0.4, 1.05)
    return fig_to_uri(plt)
def show_survival_curve(df, t_col, y_col, max_time=None, weight=None, save_file=None):
    """Plot one Kaplan-Meier curve per treatment group and return a curve difference.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the treatment column and the duration column.
    t_col : str
        Treatment column; its distinct values (cast to int, sorted) define the
        groups.
    y_col : str
        Duration column. Rows with ``df[y_col] < max_time`` are treated as
        events, all other rows as censored.
    max_time : number, optional
        End of follow-up. Defaults to ``df[y_col].max()``.
    weight : array-like, optional
        Per-row weights passed to ``KaplanMeierFitter.fit``; indexed with the
        group's boolean mask.
    save_file : str, optional
        If given, the figure is saved to this path before being shown.

    Returns
    -------
    numpy.ndarray
        Survival of the second group minus survival of the first group,
        evaluated at days ``0 .. int(max_time) - 1``.

    Raises
    ------
    IndexError
        If fewer than two treatment groups are present (no difference can be
        formed).

    Notes
    -----
    Sets ``plt.rcParams["font.size"]`` globally. Colours are reused
    cyclically, so more than three groups no longer raise ``IndexError``.
    """
    plt.figure(figsize=(8, 6))
    plt.rcParams["font.size"] = 14
    colors = ['blue', 'red', 'magenta']

    tr_uniq = np.sort(df[t_col].astype(int).unique())
    max_time = df[y_col].max() if max_time is None else max_time
    time = df[y_col].values
    event = np.where(df[y_col] < max_time, 1, 0)

    # Days at which the retention rate is annotated on the plot.
    verbose_days = [
        0,
        int((max_time - 1) / 3),
        int((max_time - 1) * 2 / 3),
        int(max_time) - 1,
    ]
    for d in verbose_days:
        plt.text(d, 0.6, f'RR({d}day)',
                 horizontalalignment='center', verticalalignment='center')

    curve_list = []
    elapsed_days = np.arange(int(max_time))
    kmf = KaplanMeierFitter()
    for i, tr in enumerate(tr_uniq):
        # Cycle through the palette so any number of groups can be drawn.
        color = colors[i % len(colors)]
        t_idx = (df[t_col] == tr)
        if weight is None:
            kmf.fit(time[t_idx], event[t_idx], label=f'tr={tr}')
        else:
            kmf.fit(time[t_idx], event[t_idx], label=f'tr={tr}', weights=weight[t_idx])
        curve_list.append(kmf.survival_function_at_times(elapsed_days))
        kmf.plot(c=color)

        # Mark and print the survival probability at each annotated day,
        # stacking the numbers of successive groups below the RR labels.
        for d in verbose_days:
            surv_prob = kmf.survival_function_at_times(d).values[0]
            plt.scatter(d, surv_prob, marker='o', c=color)
            plt.text(d, 0.6 - 0.02 * (i + 1), f'{surv_prob :.3f}', c=color,
                     horizontalalignment='center', verticalalignment='center')

    plt.xlim(-3, int(max_time) + 3)
    plt.ylim(0.5, 1.05)
    plt.xlabel('Followed days (elapsed days)')
    plt.ylabel('Survival probability (retention rate)')
    plt.legend(loc='best')
    plt.grid()
    plt.tight_layout()
    if save_file is not None:
        plt.savefig(save_file)
    plt.show()

    return (np.array(curve_list[1]) - np.array(curve_list[0])).reshape(-1)
def categorical_km_curves(feature, t='Tenure', event='Churn', df=df, ax=None):
    """Plot one Kaplan-Meier curve (with confidence band) per category of ``feature``.

    Categories are drawn in reverse sorted order. Each curve is coloured from
    the module-level ``colours`` mapping, keyed by the category value.

    Parameters
    ----------
    feature : str
        Column whose distinct values define the groups.
    t : str
        Duration column.
    event : str
        Event-observed column.
    df : pandas.DataFrame
        Data to plot; defaults to the module-level ``df`` bound when this
        function is defined.
    ax :
        Axes passed through to ``KaplanMeierFitter.plot``.
    """
    categories = sorted(df[feature].unique(), reverse=True)
    for cat in categories:
        subset = df[df[feature] == cat]
        fitter = KaplanMeierFitter()
        fitter.fit(subset[t], event_observed=subset[event], label=cat)
        fitter.plot(ax=ax, label=cat, ci_show=True, c=colours[cat])
def fit(
    self,
    durations,
    event_observed,
    event_of_interest,
    timeline=None,
    entry=None,
    label="AJ_estimate",
    alpha=None,
    ci_labels=None,
    weights=None,
):  # pylint: disable=too-many-arguments,too-many-locals
    """
    Fit the Aalen-Johansen cumulative incidence estimate for one event type.

    Parameters
    ----------
    durations: an array or pd.Series of length n -- duration of subject was observed for
    event_observed: an array, or pd.Series, of length n. Integer indicator of distinct events. Must be
        only positive integers, where 0 indicates censoring.
    event_of_interest: integer -- indicator for event of interest. All other integers are considered competing events
        Ex) event_observed contains 0, 1, 2 where 0:censored, 1:lung cancer, and 2:death. If event_of_interest=1, then death (2)
        is considered a competing event. The returned cumulative incidence function corresponds to risk of lung cancer
    timeline: return the best estimate at the values in timelines (positively increasing)
    entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
        useful for left-truncated (not left-censored) observations. If None, all members of the population
        were born at time 0.
    label: a string to name the column of the estimate.
    alpha: the alpha value in the confidence intervals. Overrides the initializing
        alpha for this call to fit only. A falsy value (``None`` or 0) falls back to ``self.alpha``.
    ci_labels: add custom column names to the generated confidence intervals
        as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<1-alpha/2>
    weights: n array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
        of providing every subject as a single element of `durations` and `event_observed`, one could
        weigh subject differently.

    Returns
    -------
    self : AalenJohansenFitter
        self, with new properties like ``cumulative_incidence_``.

    Notes
    -----
    If tied event times are detected, a ``Warning`` is emitted and the
    durations are replaced by jittered values (``self._jitter``) before fitting.
    """
    # Checking for tied event times
    ties = self._check_for_duplicates(durations=durations, events=event_observed)
    if ties:
        warnings.warn(
            dedent(
                """Tied event times were detected. The Aalen-Johansen estimator cannot handle tied event times. To resolve ties, data is randomly jittered."""),
            Warning,
        )
        # From here on the jittered durations are used everywhere below.
        durations = self._jitter(
            durations=pd.Series(durations),
            event=pd.Series(event_observed),
            jitter_level=self._jitter_level,
            seed=self._seed,
        )

    # Per-call alpha; a falsy value falls back to the instance default.
    alpha = alpha if alpha else self.alpha

    # Creating label for event of interest & indicator for that event
    event_of_interest = int(event_of_interest)
    cmprisk_label = "CIF_" + str(event_of_interest)
    self.label_cmprisk = "observed_" + str(event_of_interest)

    # Fitting Kaplan-Meier for either event of interest OR competing risk
    km = KaplanMeierFitter().fit(durations, event_observed=event_observed, timeline=timeline, entry=entry, weights=weights)
    aj = km.event_table
    aj["overall_survival"] = km.survival_function_
    # Overall survival shifted down one row, i.e. the value from the previous row.
    aj["lagged_overall_survival"] = aj["overall_survival"].shift()

    # Setting up table for calculations and to return to user
    # Boolean indicator: True only where the event of interest occurred.
    event_spec = pd.Series(event_observed) == event_of_interest
    self.durations, self.event_observed, *_, event_table, weights = _preprocess_inputs(
        durations=durations, event_observed=event_spec, timeline=timeline, entry=entry, weights=weights)
    event_spec_times = event_table["observed"]
    event_spec_times = event_spec_times.rename(self.label_cmprisk)
    # Append the event-of-interest counts as an extra column of the overall table.
    aj = pd.concat([aj, event_spec_times], axis=1).reset_index()

    # Estimator of Cumulative Incidence (Density) Function
    aj[cmprisk_label] = (aj[self.label_cmprisk] / aj["at_risk"] * aj["lagged_overall_survival"]).cumsum()
    aj.loc[0, cmprisk_label] = 0  # Setting initial CIF to be zero
    aj = aj.set_index("event_at")

    # Setting attributes
    self._estimation_method = "cumulative_density_"
    self._estimate_name = "cumulative_density_"
    self.timeline = km.timeline
    self._update_docstrings()

    self._label = label
    self.cumulative_density_ = pd.DataFrame(aj[cmprisk_label])

    # Technically, cumulative incidence, but consistent with KaplanMeierFitter
    self.event_table = aj[[
        "removed", "observed", self.label_cmprisk, "censored", "entrance", "at_risk"
    ]]  # Event table

    # Variance and confidence bounds are only computed when enabled on the instance.
    if self._calc_var:
        self.variance_, self.confidence_interval_ = self._bounds(
            aj["lagged_overall_survival"], alpha=alpha, ci_labels=ci_labels)
    else:
        self.variance_, self.confidence_interval_ = None, None

    self.confidence_interval_cumulative_density_ = self.confidence_interval_
    return self
MIN_2 = np.percentile(T_actual, 30)
MIN_3 = np.percentile(T_actual, 50)

# Each subject is randomly assigned one of four lower detection limits;
# durations below the assigned limit are raised to it.
T = T_actual.copy()
ix = np.random.randint(4, size=N)
for limit_id, limit in enumerate((MIN_0, MIN_1, MIN_2, MIN_3)):
    T = np.where(ix == limit_id, np.maximum(T, limit), T)
# A value counts as observed only if it was not raised to a limit.
E = T_actual == T

# Cumulative density of four estimators on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(9, 5))
axes = axes.reshape(4)

fitters = [WeibullFitter(), KaplanMeierFitter(), LogNormalFitter(), LogLogisticFitter()]
for panel, model in zip(axes, fitters):
    model.fit(T, E, left_censorship=True, label=model.__class__.__name__)
    model.plot_cumulative_density(ax=panel)
plt.tight_layout()

# One diagnostic figure per parametric model: CDF plot on top, QQ plot below.
for i, model in enumerate([WeibullFitter(), LogNormalFitter(), LogLogisticFitter()]):
    model.fit(T, E, left_censorship=True)
    fig, axes = plt.subplots(2, 1, figsize=(8, 6))
    cdf_panel, qq_panel = axes[0], axes[1]
    left_censorship_cdf_plot(model, ax=cdf_panel)
    qq_plot(model, ax=qq_panel)
def _split_values(fragment):
    """Return the comma-separated values of one ``key=value`` query fragment."""
    return fragment.split("=")[1].split(",")


def _round3(values):
    """Round every number in ``values`` to three decimals."""
    return [round(value, 3) for value in values]


def _km_group_curves(survival_api, samples, st, matches):
    """Fit a Kaplan-Meier curve for ``samples`` and return its plot-ready pieces."""
    obs = pd.DataFrame(survival_api.query_collection_obs())
    obs = obs.loc[obs["sample"].isin(samples), :]

    events = [matches[status] for status in obs["OSEVENT"].tolist()]
    days = obs["OSDAY"].tolist() if st == "OS" else obs["RFSDAY"].tolist()

    # Drop records whose duration is the "-" placeholder; convert days to
    # 30-day months rounded to two decimals.
    kept = [(day, event) for day, event in zip(days, events) if day != "-"]
    E_end = [event for _, event in kept]
    T_end = [round(float(day) / 30, 2) for day, _ in kept]

    kmf = KaplanMeierFitter()
    kmf.fit(T_end, E_end)
    sf = kmf.survival_function_.T
    ci = kmf.confidence_interval_survival_function_.T.values

    # Censored observations are drawn as markers on the curve.
    censored_times = [float(t) for t, e in zip(T_end, E_end) if e == 0]
    return {
        "T": T_end,
        "E": E_end,
        "x": sf.columns.tolist(),
        "y1": _round3(sf.values[0].tolist()),
        "y2": _round3(ci[1].tolist()),
        "y3": _round3(ci[0].tolist()),
        "xc": censored_times,
        "yc": _round3(kmf.survival_function_at_times(censored_times).tolist()),
    }


def _survival_line_trace(x, y, color, dash):
    """Build one step-line trace dict for the survival plot."""
    return {
        "line": {
            "dash": dash,
            "color": color,
            "shape": "hv",
            "width": 2
        },
        "mode": "lines",
        "name": "",
        "type": "scatter",
        "x": x,
        "y": y,
        "xaxis": "x1",
        "yaxis": "y1",
        "showlegend": False
    }


def _censor_marker_trace(x, y):
    """Build the marker trace dict that flags censored observations."""
    return {
        "mode": "markers",
        "name": "",
        "text": "",
        "type": "scatter",
        "x": x,
        "y": y,
        "xaxis": "x1",
        "yaxis": "y1",
        "marker": {
            "size": 10,
            "color": "black",
            "symbol": "cross-thin-open",
            "opacity": 1,
            "sizeref": 1,
            "sizemode": "area"
        },
        "showlegend": False
    }


def _group_traces(curves, color):
    """Return the four traces (estimate, two CI bounds, censor marks) of one group."""
    return [
        _survival_line_trace(curves["x"], curves["y1"], color, "solid"),
        _survival_line_trace(curves["x"], curves["y2"], color, "dash"),
        _survival_line_trace(curves["x"], curves["y3"], color, "dash"),
        _censor_marker_trace(curves["xc"], curves["yc"]),
    ]


def index_of_survival(request: HttpRequest, all_parameter: str):
    """Compare survival of samples with a high vs. low cell-type expression score.

    ``all_parameter`` is an ``&``-separated list of ``key=value`` fragments that
    are read by position (the keys themselves are ignored):

    0. survival type: ``OS`` selects ``OSDAY``, anything else ``RFSDAY``
    1. primary diseases, comma separated (at most eight)
    2. cell types, comma separated
    3. ``up`` percentile threshold
    4. ``dn`` percentile threshold
    5. reference set: ``EPIC``, ``LM`` or ``QS``

    Samples are ranked by the summed, reference-weighted cell fractions. The
    top and bottom slices are each fitted with a Kaplan-Meier estimator and
    compared with a log-rank test and a Cox model.

    Returns
    -------
    JsonResponse
        ``{"data": [p-values, 4 traces for the top group (red), 4 traces for
        the bottom group (blue)]}``, or ``{"error": ...}`` when more than eight
        diseases are requested or the reference name is unknown.
    """
    mm = all_parameter.split("&")
    st = mm[0].split("=")[1]
    ct = _split_values(mm[1])

    tcga = API.DatabaseAPI("tcga")
    tcga_obs = pd.DataFrame(tcga.query_collection_obs())
    select_part = tcga_obs.loc[tcga_obs["primary_disease"].isin(ct), :]

    if len(ct) > 8:
        response = {
            "error":
            "Too many datasets. You can select no more than eight datasets."
        }
        return JsonResponse(response)

    ref_name = mm[5].split("=")[1]
    cell = _split_values(mm[2])
    up = mm[3].split("=")[1]
    dn = mm[4].split("=")[1]

    # Column prefix and reference database for the chosen deconvolution method.
    if ref_name == "EPIC":
        columns_list = ["EPIC_cellFractions." + i for i in cell]
        ref = API.DatabaseAPI("ref")
    elif ref_name == "LM":
        columns_list = ["LM_" + i for i in cell]
        ref = API.DatabaseAPI("LM_ref")
    elif ref_name == "QS":
        columns_list = ["QS_" + i for i in cell]
        ref = API.DatabaseAPI("QS_ref")
    else:
        response = {"error": "reference error"}
        return JsonResponse(response)

    # Cell fractions of the selected samples: rows = cell types, columns = samples.
    cellID = select_part["cellID"].tolist()
    my_df_d = select_part.loc[:, columns_list]
    my_df_d.index = cellID
    my_df_t = my_df_d.T

    # Mean reference expression per cell type, restricted to the requested cells.
    genes = ref.query_collection_var()["geneSymbol"]
    gg = pd.DataFrame(ref.query_collection_gene_X_var_by_obs(genes))
    gg.columns = ref.query_collection_obs()["celltype"]
    gg_mean = pd.DataFrame(gg.T.mean(axis=1))
    gg_mean = gg_mean.loc[cell, :]

    # Per-sample score = sum over cell types of fraction * mean expression,
    # ranked from highest to lowest.
    expression = my_df_t.multiply(gg_mean.values)
    expression_t = expression.T
    expression_t = pd.DataFrame(expression_t.sum(axis=1), columns=["sum"])
    expression_t = expression_t.sort_values(by=["sum"], ascending=False)

    number = expression_t.shape[0]
    number1 = int(number / 100 * (100 - int(up)))
    number2 = int(number / 100 * (100 - int(dn)))

    # Sample identifiers are the first three dot-separated fields of the cell ID.
    sample = []
    for each in expression_t.index.tolist():
        names = each.split(".")
        sample.append(names[0] + "." + names[1] + "." + names[2])
    up_sample = sample[:number1]
    dn_sample = sample[number2 + 1:]

    matches = {"Dead": 1, "Alive": 0, "-": 0}
    survival_api = API.DatabaseAPI("survival")
    up_curves = _km_group_curves(survival_api, up_sample, st, matches)
    dn_curves = _km_group_curves(survival_api, dn_sample, st, matches)

    results = logrank_test(up_curves["T"],
                           dn_curves["T"],
                           event_observed_A=up_curves["E"],
                           event_observed_B=dn_curves["E"])
    pValues1 = float(results.summary["p"].values)

    # Cox model with a single group indicator as covariate.
    dfA = pd.DataFrame({'E': up_curves["E"], 'T': up_curves["T"], 'groupA': 1})
    dfB = pd.DataFrame({'E': dn_curves["E"], 'T': dn_curves["T"], 'groupA': 0})
    df = pd.concat([dfA, dfB])
    cph = CoxPHFitter().fit(df, 'T', 'E')
    pValues2 = float(cph.summary["p"].values)

    data = [{"pValues1": pValues1, "pValues2": pValues2}]
    data.extend(_group_traces(up_curves, "red"))
    data.extend(_group_traces(dn_curves, "blue"))
    response = {"data": data}
    return JsonResponse(response)
def main(data_df):
    """Plot Kaplan-Meier curves for three V-HU risk levels and save them as a PDF.

    Every column named in the module-level ``th_dict`` is binarised in place
    (1 if the value is at or above its threshold, else 0, with missing values
    treated as 0), except columns whose name contains ``"hu"`` after the first
    character. The risk level is the sum of the binarised
    ``HU_of_consolidation`` and ``Volume_of_total_pneumonia_infection``
    columns (0, 1 or 2).

    The hazard ratios, p-value and numbers-at-risk printed on the figure are
    hard-coded text. The figure is written to ``km_alldata_V-HU.pdf``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain ``Duration``, ``Death`` and the columns named above;
        modified in place.
    """
    for key, threshold in th_dict.items():
        if key.find("hu") <= 0:
            data_df[key] = data_df[key].fillna(0).map(
                lambda value: 1 if value >= threshold else 0)

    key_word = "V-HU"
    # 0, 1 or 2 depending on how many of the two indicators are set.
    v_hu = (data_df['HU_of_consolidation']
            + data_df['Volume_of_total_pneumonia_infection']).rename(key_word)
    all_data = pd.concat([data_df["Duration"], data_df["Death"], v_hu], axis=1)

    kmf = KaplanMeierFitter()
    T = all_data["Duration"]
    death = all_data['Death']

    # The first two curves are drawn on the current axes; the last one reuses
    # the axes returned by the previous plot call.
    for level, label in ((0, 'low risk'), (1, 'intermediate risk')):
        members = all_data[key_word] == level
        kmf.fit(T[members], event_observed=death[members], label=label)
        ax = kmf.plot()
    members = all_data[key_word] == 2
    kmf.fit(T[members], event_observed=death[members], label='high risk')
    kmf.plot(ax=ax)

    plt.legend(fontsize=7, loc='lower left')
    plt.ylabel('Survival Probability')
    plt.xlabel('Time since admission to death(days)')

    # Hard-coded summary statistics shown in the plot area.
    plt.text(37, 1, "Hazard ratio:", fontsize=8, style='italic')
    plt.text(37, 0.96, "low risk: reference", fontsize=8)
    plt.text(37, 0.92, "intermediate risk: 2,54; 95%CI, 1,44-4,49", fontsize=8)
    plt.text(37, 0.88, "high risk: 4,90; 95%CI, 2,78-8,64", fontsize=8)
    plt.text(12, 1, "p-value < 0,0001", fontsize=8, fontstyle='italic')

    # Hard-coded numbers-at-risk table (all data), one entry per 10 days.
    low_list = ['69', '61', '56', '53', '53', '53', '53', '53']
    medium_list = ['100', '69', '58', '55', '53', '53', '53', '53']
    high_list = ['69', '37', '27', '26', '21', '21', '20', '20']

    plt.text(-30, 0.005, "Numbers at low risk", fontsize=8)
    for i, count in enumerate(low_list):
        plt.text((i * 10) - 1, 0, count, fontsize=8)

    plt.text(-30, -0.035, "Numbers at intermediate risk", fontsize=8)
    for i, count in enumerate(medium_list):
        plt.text((i * 10) - 1, -0.04, count, fontsize=8)

    plt.text(-30, -0.075, "Numbers at high risk", fontsize=8)
    for i, count in enumerate(high_list):
        # Single-digit counts are not shifted left.
        x_pos = (i * 10) if len(count) == 1 else (i * 10) - 1
        plt.text(x_pos, -0.08, count, fontsize=8)

    plt.savefig("km_alldata_V-HU.pdf", bbox_inches='tight')
def mc_gformula_check():
    """Fit the Monte Carlo g-formula on the time-varying sample data and plot risk curves.

    Three cumulative-incidence curves (1 - Kaplan-Meier survival) are drawn:

    * ``Natural``: g-formula predictions without a censoring model,
    * ``Censor``: g-formula predictions after adding a censoring model,
    * ``True``: the observed data, with delayed entry.

    Each curve is plotted against the event-table index of its own fitter. The
    ``Censor`` curve used to be drawn against the ``Natural`` fitter's index,
    which pairs x and y values from different fits.
    """
    df = load_sample_data(timevary=True)

    # Lagged covariates; the first record of each id takes the baseline value
    # (0 for treatment).
    df['lag_art'] = df['art'].shift(1)
    df['lag_art'] = np.where(df.groupby('id').cumcount() == 0, 0, df['lag_art'])
    df['lag_cd4'] = df['cd4'].shift(1)
    df['lag_cd4'] = np.where(df.groupby('id').cumcount() == 0, df['cd40'], df['lag_cd4'])
    df['lag_dvl'] = df['dvl'].shift(1)
    df['lag_dvl'] = np.where(df.groupby('id').cumcount() == 0, df['dvl0'], df['lag_dvl'])

    df[['age_rs0', 'age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=4, term=2, restricted=True)  # age spline
    df['cd40_sq'] = df['cd40'] ** 2  # cd4 baseline cubic
    df['cd40_cu'] = df['cd40'] ** 3
    df['cd4_sq'] = df['cd4'] ** 2  # cd4 current cubic
    df['cd4_cu'] = df['cd4'] ** 3
    df['enter_sq'] = df['enter'] ** 2  # entry time cubic
    df['enter_cu'] = df['enter'] ** 3

    g = MonteCarloGFormula(df, idvar='id', exposure='art', outcome='dead', time_in='enter', time_out='out')

    exp_m = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'''
    g.exposure_model(exp_m, restriction="g['lag_art']==0")

    out_m = '''art + male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'''
    g.outcome_model(out_m, restriction="g['drop']==0")

    dvl_m = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    g.add_covariate_model(label=1, covariate='dvl', model=dvl_m, var_type='binary')

    cd4_m = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    cd4_recode_scheme = ("g['cd4'] = np.maximum(g['cd4'],1);"
                         "g['cd4_sq'] = g['cd4']**2;"
                         "g['cd4_cu'] = g['cd4']**3")
    g.add_covariate_model(label=2, covariate='cd4', model=cd4_m, recode=cd4_recode_scheme, var_type='continuous')

    # First fit: no censoring model.
    g.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
          lags={'art': 'lag_art',
                'cd4': 'lag_cd4',
                'dvl': 'lag_dvl'},
          sample=10000, t_max=None,
          in_recode=("g['enter_sq'] = g['enter']**2;"
                     "g['enter_cu'] = g['enter']**3"))
    gf = g.predicted_outcomes
    kmn = KaplanMeierFitter()
    kmn.fit(durations=gf['out'], event_observed=gf['dead'])

    # Observed data for comparison.
    kmo = KaplanMeierFitter()
    kmo.fit(durations=df['out'], event_observed=df['dead'], entry=df['enter'])

    # Second fit: same specification with a censoring model added.
    cens_m = """male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu"""
    g.censoring_model(cens_m)
    g.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
          lags={'art': 'lag_art',
                'cd4': 'lag_cd4',
                'dvl': 'lag_dvl'},
          sample=10000, t_max=None,
          in_recode=("g['enter_sq'] = g['enter']**2;"
                     "g['enter_cu'] = g['enter']**3"))
    gf = g.predicted_outcomes
    kmc = KaplanMeierFitter()
    kmc.fit(durations=gf['out'], event_observed=gf['dead'])

    plt.step(kmn.event_table.index, 1 - kmn.survival_function_, c='g', where='post', label='Natural')
    # Use kmc's own index so x and y come from the same fit.
    plt.step(kmc.event_table.index, 1 - kmc.survival_function_, c='orange', where='post', label='Censor')
    plt.step(kmo.event_table.index, 1 - kmo.survival_function_, c='k', where='post', label='True')
    plt.legend()
    plt.show()
def kmf(self):
    """Return a new, unfitted ``KaplanMeierFitter``."""
    fitter = KaplanMeierFitter()
    return fitter
def compute_cens_surv(self):
    """Estimate the censoring survival curve and attach it to ``self.df``.

    A Kaplan-Meier estimator is fitted with the event indicator inverted
    (``1 - EVENT``). The resulting curve is stored in ``self.cens_surv``
    (index ``TIME``, column ``CENS_SURV``) and left-merged onto ``self.df``
    by ``TIME``.
    """
    # inverse KMF
    censoring_indicator = 1 - self.df['EVENT']
    fitter = KaplanMeierFitter()
    fitter.fit(self.df['TIME'], event_observed=censoring_indicator)

    curve = fitter.survival_function_.rename_axis('TIME')
    self.cens_surv = curve.rename(columns={'KM_estimate': 'CENS_SURV'})

    lookup = self.cens_surv.reset_index()
    self.df = self.df.merge(lookup, how='left', on='TIME')
def kaplan_meier(
    file,
    model=None,
    cohorts=["UKDP"],
    event_type="biochemicalRecurrence",
    event_time="bcrTime",
    figsize=(9, 6),
):
    """Plot Kaplan-Meier curves for high vs. low hazard score and test the split.

    Samples from the requested cohorts that are not blacklisted and have both
    the event and the event time recorded are scored, split at the median
    score, and compared with a Kaplan-Meier plot, a Cox regression on the score
    and a log-rank test. A short summary is printed.

    Parameters
    ----------
    file : str
        A ``.tsv`` score file (``ID`` index column, ``score`` column), or any
        other file from which ``get_params`` reads parameters containing
        ``logHR``.
    model : optional
        Object exposing ``pheno``; ``mc_model()`` is used when ``None``.
    cohorts : str or list of str
        Cohort abbreviations, matched case-insensitively.
    event_type, event_time : str
        Names of the event indicator and event time (in days) columns of
        ``pheno``.
    figsize : tuple
        Figure size passed to the first plot call.

    Returns
    -------
    tuple
        ``(figure, cph, logr)``: the axes holding the curves, the fitted
        ``CoxPHFitter`` and the log-rank test result.

    Raises
    ------
    TypeError
        If a non-tsv parameter file contains no ``logHR`` means.
    """
    if isinstance(cohorts, str):
        cohorts = [cohorts]
    md = mc_model() if model is None else model

    # Keep samples with a complete outcome that are not blacklisted ...
    has_outcome = ~md.pheno.loc[:, [event_type, event_time]].isna().any(axis=1)
    not_blacklisted = md.pheno["blacklisted"] == 0
    valid = np.logical_and(has_outcome, not_blacklisted)
    # ... and that belong to one of the requested cohorts.
    chs = pd.Series(cohorts).str.upper()
    in_cohort = md.pheno["CohortAbb"].str.upper().isin(chs)
    ind = np.logical_and(in_cohort, valid)
    mpheno = md.pheno.loc[ind, :].copy()

    if not file.endswith(".tsv"):
        pars = get_params(file)
        if "logHR" not in pars["means"]:
            raise TypeError(
                "The parameter in the file do not seem to contain hazard "
                "prediction.")
        means = pars["means"]
        expressions = np.concatenate(
            [means["x_t"][ind, :], means["x_f"][ind, :]],
            axis=1,
        )
        score = np.dot(expressions, means["logHR"])[:, 0]
    else:
        # this is a score file
        score_df = pd.read_csv(file, delimiter="\t", index_col="ID")
        score = score_df.loc[mpheno.index, "score"]
        # Samples without a score are removed from the selection.
        ind[ind] = ~score.isna()
        score = score[~score.isna()].values
        mpheno = md.pheno.loc[ind, :].copy()

    event = md.pheno.loc[ind, event_type].values
    time = md.pheno.loc[ind, event_time].values / 365.25

    # Grouping: split at the median score.
    threshold = np.median(score)
    high = score > threshold
    low = ~high

    # Kaplan-Meier plot
    kmfh = KaplanMeierFitter()
    kmfh.fit(time[high], event[high], label="High Hazard")
    figure = kmfh.plot(figsize=figsize)

    kmfl = KaplanMeierFitter()
    kmfl.fit(time[low], event[low], label="Low Hazard")
    figure = kmfl.plot(ax=figure)
    plt.xlabel("years")
    add_at_risk_counts(kmfh, kmfl, ax=figure)

    # Cox regression
    mpheno["score"] = score
    cph = CoxPHFitter()
    cph.fit(mpheno,
            duration_col=event_time,
            event_col=event_type,
            formula="score")

    # logrank test
    logr = statistics.logrank_test(
        mpheno.loc[high, event_time],
        mpheno.loc[low, event_time],
        mpheno.loc[high, event_type],
        mpheno.loc[low, event_type],
    )

    print("Cohorts: {}, event: {}, time: {}".format(cohorts, event_type,
                                                    event_time))
    print("Concordance: {:.2%}".format(cph.concordance_index_))
    print("Cox p-value: {}".format(cph.summary.loc["score", "p"]))
    print("Logrank p-value: {}".format(logr.p_value))
    return figure, cph, logr
"""
Created on Mon May 18 20:32:10 2020

@author: DESHMUKH
SURVIVAL ANALYSIS
"""
# pip install lifelines
import pandas as pd
from lifelines import KaplanMeierFitter

# =============================================================================
# Business problem - perform a Kaplan-Meier analysis on the given data and
# obtain the life table.
# =============================================================================

patient = pd.read_csv('Patient.csv')
patient.head()
patient.info()

# Summary statistics
patient.describe()

# Create the Kaplan-Meier estimator
kmf = KaplanMeierFitter(label='FollowUps vs Event')

# Fit on follow-up time and event type: fit(time, events)
followup = patient['Followup']
event_type = patient['Eventtype']
kmf.fit(followup, event_type)

# Plot the estimate over the timeline
kmf.plot(color='g')

# ---------------------------------------------------- #
data = pd.DataFrame(data)
duration = data['span']
# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based and
# boolean-mask selection used here.
observed = data.loc[:, 'censor']

# Compare career length for players grouped by total runs.
data['runs'] = pd.to_numeric(data['runs'])
runs8000 = data.loc[data['runs'] >= 8000]
runs3000 = data.loc[data['runs'] <= 3000]

kmfruns8000 = KaplanMeierFitter()
kmfruns8000.fit(runs8000['span'], runs8000['censor'], label=' runs > 8000')

kmfruns3000 = KaplanMeierFitter()
kmfruns3000.fit(runs3000['span'], runs3000['censor'], label=' runs < 3000')

# Both survival functions on the same axes.
bx = plt.subplot(111)
kmfruns8000.survival_function_.plot(ax=bx)
kmfruns3000.survival_function_.plot(ax=bx)

plt.xlabel(" career length ( in years )")
plt.ylabel(" probability of players ")
plt.title("probability of players with specific runs vs their career length")
plt.show()
def test_qth_survival_time_accepts_a_model():
    """``utils.qth_survival_time`` accepts a fitted ``KaplanMeierFitter`` as its second argument."""
    durations = [1.0, 0.7, 0.6]
    fitted = KaplanMeierFitter().fit(durations)
    quantile_time = utils.qth_survival_time(0.8, fitted)
    assert quantile_time > 0
def estimate_kaplan_meier(self):
    """Fit and plot one Kaplan-Meier curve per label and annotate the log-rank p-value.

    Reads ``self.survival_label`` (its ``'label'`` column defines the groups),
    ``self.duration_column``, ``self.observed_column``, ``self.CI`` and, when
    ``self.CI`` is non-empty, ``self.HR``.

    Writes ``self.median_survival_time[label]`` for every label,
    ``self.test_statistic``, ``self.p_value`` (stored as a string; an exact 0
    becomes ``'< 0.0001'``) and ``self.survival_rate_result`` (the per-label
    survival functions concatenated column-wise and interpolated).

    Shows the figure with ``plt.show()`` and returns None.
    """
    # Group labels as a Series.
    labels = self.survival_label['label']
    group_names = sorted(labels.unique())

    sfs = {}
    # Fitted estimators are kept so the at-risk table can be drawn below.
    fitter = []

    # Draw the survival curves, one per label, on a shared axes.
    ax = plt.subplot()
    for label in group_names:
        data_label_index = list(
            set(labels[labels == label].index) & set(self.survival_label.index))
        group = self.survival_label.loc[data_label_index]

        kmf = KaplanMeierFitter()
        kmf.fit(group[self.duration_column],
                group[self.observed_column],
                label=label)
        fitter.append(kmf)

        # Survival function and median survival time of this label.
        sfs[label] = kmf.survival_function_
        self.median_survival_time[label] = kmf.median_
        ax = kmf.plot(ax=ax)

    # Number of subjects at risk over time for every label.
    add_at_risk_counts(*fitter)

    # Log-rank test: are the survival differences between the groups significant?
    # NOTE(review): unpacking the result as a 2-tuple presumably matches the
    # lifelines version this project pins - confirm before upgrading.
    self.test_statistic, self.p_value = multivariate_logrank_test(
        self.survival_label, labels)
    p_is_zero = self.p_value == 0
    self.p_value = '< 0.0001' if p_is_zero else str(self.p_value)

    # Survival rates of all groups, side by side.
    self.survival_rate_result = pd.concat(
        [sfs[name] for name in group_names], axis=1).interpolate()

    # A truncated p-value already carries its own '<', so drop the '=' there.
    # Previously this was only done when a confidence interval was available,
    # which produced "log_rank p=< 0.0001" otherwise.
    p_template = 'log_rank p %s' if p_is_zero else 'log_rank p=%s'
    ax.text(0.35, 0.8, p_template % self.p_value,
            transform=ax.transAxes, va='top', fontsize=12)
    if len(self.CI) > 0:
        ax.text(0.35, 0.9,
                "HR=%.3f(95%% CI:%.3f-%.3f)" % (self.HR, self.CI[0], self.CI[1]),
                transform=ax.transAxes, va='top', fontsize=12)

    plt.title('Full Data')
    print("Median survival time of data: %s" % self.median_survival_time)
    plt.show()
dict(selector="td", props=[('padding', "0em 0em")]), dict(selector="th:hover", props=[("font-size", "12pt")]), dict(selector="tr:hover td:hover", props=[('max-width', '200px'), ('font-size', '12pt')]) ] corr.style.background_gradient(cmap, axis=1)\ .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\ .set_caption("Hover to magify")\ .set_precision(2)\ .set_table_styles(magnify()) plt.show() """ #Fitting the model using KaplanMeiler """ kmf = KaplanMeierFitter() kmf.fit(durations=data['Duration'], event_observed=data['Divorce']) #Plotting survival function kmf.survival_function_.plot(title='Marriage Survival Time in the U.S', legend=False, linewidth=3.0) plt.savefig('/home/raed/Dropbox/INSE - 6320/Final Project/Survival.pdf') plt.show() kmf.plot( title='Survival Time Estimates of Mariages and its Confidence Intervals', legend=False, linewidth=3.0, show_censors=True) #Export the figure plt.savefig( '/home/raed/Dropbox/INSE - 6320/Final Project/Survival_ConfidenceInterval.pdf'
def km_curve(labels_ids,
             survival_dataset,
             tested_gene_expression_headers_columns,
             gene_group,
             k=None,
             label_index=None):
    """Run log-rank tests of every cluster against the rest and against earlier clusters.

    Parameters
    ----------
    labels_ids : list of list
        Sample ids per cluster. NOTE: when some tested samples belong to no
        cluster, one extra cluster holding them is appended to this list
        in place.
    survival_dataset : 2-D array
        Column 0 is matched against the sample ids, column 3 is used as the
        duration and column 4 as the event indicator (both cast to int32).
    tested_gene_expression_headers_columns : sequence
        Sample ids to restrict the analysis to.
    gene_group, k, label_index
        Not used by the active code (only by the commented-out figure export).

    Returns
    -------
    The log-rank p-value of the last cluster versus all other clusters, or
    None when ``labels_ids`` ends up empty.
    """
    # ax = plt.subplot(111)
    flatten_set = {y for x in labels_ids for y in x}
    dif = set(tested_gene_expression_headers_columns).difference(flatten_set)
    if len(dif) > 0:
        labels_ids.append(list(dif))

    kmf = KaplanMeierFitter()
    all_labels = np.array([y for x in labels_ids for y in x])

    # Loop-invariant pieces of the row masks, computed once.
    sample_ids = survival_dataset[:, 0]
    is_tested = np.in1d(sample_ids, tested_gene_expression_headers_columns)

    label_event_list = []
    label_duration_list = []
    lr_results_global = None
    for i, cur_labels in enumerate(labels_ids):
        # Rows of the current cluster (restricted to the tested samples).
        in_cluster = np.in1d(sample_ids, cur_labels) & is_tested
        label_event = survival_dataset[in_cluster, 4].astype(np.int32)
        label_duration = survival_dataset[in_cluster, 3].astype(np.int32)
        label_event_list.append(label_event)
        label_duration_list.append(label_duration)

        # Rows of every other cluster: the complement used for "vs all".
        labels_c = all_labels[
            ~np.in1d(all_labels, cur_labels)
            & np.in1d(all_labels, tested_gene_expression_headers_columns)]
        in_complement = np.in1d(sample_ids, labels_c)
        label_event_c = survival_dataset[in_complement, 4].astype(np.int32)
        label_duration_c = survival_dataset[in_complement, 3].astype(np.int32)

        lr_results_global = logrank_test(label_duration,
                                         label_duration_c,
                                         label_event,
                                         label_event_c,
                                         alpha=.95).p_value
        if len(label_duration) != 0:
            kmf.fit(list(label_duration),
                    event_observed=list(label_event),
                    label="cluster {} n={}, logrank pval = {}".format(
                        i, len(label_duration),
                        '{0:1.3e}'.format(lr_results_global)))
            # kmf.plot(ax=ax, show_censors=True)
            # print(...) with a single argument behaves the same on Python 2 and 3.
            print("lrank cluster {} vs all: {}".format(i, lr_results_global))
            # Pairwise tests against every previously processed cluster.
            previous_clusters = zip(label_duration_list[:-1], label_event_list[:-1])
            for j, (other_duration, other_event) in enumerate(previous_clusters):
                lr_results = logrank_test(label_duration,
                                          other_duration,
                                          label_event,
                                          other_event,
                                          alpha=.95).p_value
                print("lrank cluster {} vs cluster {}: {}".format(
                    i, j, lr_results))

    # plt.ylim(0, 1);
    # plt.title("clustering survival analysis");
    # plt.savefig(os.path.join(constants.BASE_PROFILE,"output" ,"cluster_by_p_{}_{}_k={}_label_i={}_{}.png".format(constants.CANCER_TYPE, gene_group,k,label_index , time.time())))
    # plt.cla()
    return lr_results_global
# Boolean masks selecting the rows of each median-household-income bracket.
income_bracket = dataset["Median household income inflation adj to 2018"]
med_50k = income_bracket == "$50,000 - $54,999"
med_45k = income_bracket == "$45,000 - $49,999"
med_40k = income_bracket == "$40,000 - $44,999"
med_35k = income_bracket == "$35,000 - $39,999"
med_35k_minus = income_bracket == "< $35,000"

# Binarize the death classification so it can be used as the event indicator.
obs = dataset["SEER cause-specific death classification"]
lb = LabelBinarizer()
obs = lb.fit_transform(obs)

durations = dataset['Survival months']

# Highest bracket.
kmf1 = KaplanMeierFitter()
kmf1.fit(durations[med_75k_plus],
         event_observed=obs[med_75k_plus],
         label="75,000+")
kmf1.plot(ax=ax)

# Middle bracket.
kmf2 = KaplanMeierFitter()
kmf2.fit(durations[med_50k],
         event_observed=obs[med_50k],
         label="50,000-55,000")
kmf2.plot(ax=ax)

# Lowest bracket.
kmf3 = KaplanMeierFitter()
kmf3.fit(durations[med_35k_minus],
         event_observed=obs[med_35k_minus],
         label="<35,000")
# Attach company attributes to the survival records and drop incomplete rows.
merged = data.set_index('company').join(companies.set_index('company'))
data_not_empty = merged.dropna()

countries = ['NA', 'SA', 'AF', 'OC', 'EU', 'ME']

# One log-log panel per continent, laid out on a 3 x 2 grid.
fig = plt.figure(figsize=(10, 12))
plt.subplots_adjust(hspace=0.4)
for panel, continent in enumerate(countries, start=1):
    in_continent = data_not_empty['continent'] == continent
    continent_data = data_not_empty[in_continent]
    other_data = data_not_empty[~in_continent]

    # Fit the continent and everything else separately.
    kmf_continent = KaplanMeierFitter()
    kmf_continent.fit(continent_data['duration'], continent_data['observed'])
    kmf_other = KaplanMeierFitter()
    kmf_other.fit(other_data['duration'], other_data['observed'])

    ax = fig.add_subplot(3, 2, panel)
    ax.set_title('{} vs other'.format(continent))
    kmf_continent.plot_loglogs(ax=ax, label=continent)
    kmf_other.plot_loglogs(ax=ax, label='other')
fig.show()

# Split by branch for the analysis that follows.
data_financial = merged[merged['branch'] == 'F']
data_other = merged[merged['branch'] == 'O']
def plot2(df):
    """Plot Kaplan-Meier curves comparing the 'water' group with three other groups.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'group', 'time' and 'event'.

    Returns
    -------
    Whatever ``fig_to_uri(plt)`` returns for the current pyplot state.
    """
    survival = df

    # Each comparison pairs 'water' with one other group.
    # NOTE(review): every comparison asks for plt.subplot(111), so the curves
    # presumably all land on the same axes - confirm this overlay is intended.
    comparisons = (
        ['water', 'Val1000'],
        ['water', 'Ryan'],
        ['water', 'BBG'],
    )
    for groups in comparisons:
        kmf = KaplanMeierFitter()
        ax = plt.subplot(111)
        subset = survival.loc[survival['group'].isin(groups)]
        for name, grouped_survival in subset.groupby('group'):
            kmf.fit(grouped_survival['time'],
                    grouped_survival['event'],
                    label=name)
            kmf.plot(ax=ax, ci_show=False, marker='o')
        plt.xlabel("days")
        plt.ylabel("survival proportion")
        # A small margin so curves at 0 and 1 stay visible.
        plt.ylim(-0.05, 1.05)

    return fig_to_uri(plt)
def plot(out,
         fontsize=12,
         savepath='',
         width=10,
         height=6,
         cmap='Set1',
         cii_alpha=0.05,
         cii_lines='dense',
         methodtype='lifeline',
         title='Survival function',
         full_ylim=False,
         y_percentage=False):
    """Plot Kaplan-Meier survival curves, one per class label.

    Parameters
    ----------
    out : dict
        Results from the fit function. The keys read here are 'labx',
        'uilabx', 'time_event', 'censoring', 'logrank' and, when 'logrank'
        is not empty, 'logrank_P'.
    fontsize : int, optional
        Font size for the graph. The default is 12.
    savepath : String, optional
        Path to store the figure; nothing is saved when empty. Only used when
        methodtype='lifeline'. The default is ''.
    width : int, optional
        Width of the figure. The default is 10.
    height : int, optional
        Height of the figure. The default is 6.
    cmap : String, optional
        Specify your own colors for each class-label or use a colormap:
        https://matplotlib.org/examples/color/colormaps_reference.html.
        The default is 'Set1'.
        [(1, 0, 0),(0, 0, 1),(..)]
        'Set1'       (default)
        'Set2'       Discrete colors
        'Pastel1'    Discrete colors
        'Paired'     Discrete colors
        'rainbow'
        'bwr'        Blue-white-red
        'binary' or 'binary_r'
        'seismic'    Blue-white-red
        'Blues'      white-to-blue
        'Reds'       white-to-red
    cii_alpha : float, optional
        Confidence interval setting, forwarded to the fitter as
        ``alpha=1 - cii_alpha`` (works only when methodtype='lifeline').
        None is treated as 0. The default is 0.05.
    cii_lines : String, optional
        Confidence lines (works only when methodtype='lifeline').
        The default is 'dense'.
        'dense' (default) dense/filled lines
        'line'
        None or '' (no lines; cii_alpha is reset to 0)
    methodtype : String, optional
        Implementation type. The default is 'lifeline'.
        'lifeline' (default) fit and draw with KaplanMeierFitter
        'custom'   compute the coordinates with compute_coord and draw with plotkm
        Any other value draws nothing.
    title : String, optional
        Leading part of the plot title; the log-rank p-value is appended to it.
        The title is only set when out['logrank'] is not empty.
        The default is 'Survival function'.
    full_ylim : bool, optional
        When True the y-axis is fixed to [0.0, 1.05]. The default is False.
    y_percentage : bool, optional
        When True the y-axis ticks are formatted as percentages.
        The default is False.

    Returns
    -------
    None.

    """
    labx = out['labx']
    # Combine data and gather class labels
    data = np.vstack((out['time_event'], out['censoring'])).T
    # Make colors and legend-names for class-labels
    [class_colors, classlabel] = make_class_color_names(data, out['labx'], out['uilabx'], cmap=cmap)

    if methodtype == 'lifeline':
        kmf_all = []

        # Startup figure
        fig = plt.figure(figsize=(width, height))
        ax = fig.add_subplot(111)
        if full_ylim:
            ax.set_ylim([0.0, 1.05])
        if y_percentage:
            ax.yaxis.set_major_formatter(PercentFormatter(1.0))
        if out['logrank'] != []:
            plt.title('%s, Logrank Test P-Value = %.5f' % (title, out['logrank_P']))

        # Translate the public option names into the boolean the plot call expects.
        if cii_lines == 'dense':
            cii_lines = False
        if cii_lines == 'line':
            cii_lines = True
        if cii_lines == '' or cii_lines is None or cii_alpha is None:
            cii_lines = False
            cii_alpha = 0

        # Fit and draw one curve per class.
        for i, uilabel in enumerate(out['uilabx']):
            kmf = KaplanMeierFitter()
            idx = np.where(labx == uilabel)[0]
            kmf.fit(out['time_event'][idx],
                    event_observed=out['censoring'][idx],
                    label=classlabel[i],
                    ci_labels=None,
                    alpha=(1 - cii_alpha))
            kmf.plot(ax=ax, ci_force_lines=cii_lines, color=class_colors[i], show_censors=True)
            # fit() returns the fitter itself, so keeping the already-fitted
            # object avoids fitting the same data a second time.
            kmf_all.append(kmf)

        add_at_risk_counts(*kmf_all, ax=ax)

        ax.tick_params(axis='x', length=15, width=1, direction='out', labelsize=fontsize)
        ax.tick_params(axis='y', length=15, width=1, direction='out', labelsize=fontsize)
        ax.spines['bottom'].set_position(['outward', fontsize])
        ax.spines['left'].set_position(['outward', fontsize])

        if savepath != '':
            savefig(fig, savepath)

    if methodtype == 'custom':
        # Compute KM survival coordinates per class
        KMcoord = {}
        for i, uilabel in enumerate(out['uilabx']):
            idx = np.where(labx == uilabel)[0]
            tmpdata = data[idx, :].tolist()
            KMcoord[i] = compute_coord(tmpdata)

        # Plot KM survival lines
        plotkm(KMcoord, classlabel, cmap=class_colors, width=width, height=height, fontsize=fontsize)
out = km.fit(time_event, censoring, labx)
# Plot the grouped curves directly from the fit results.
km.plot(out)

# %% [markdown]
# # Kaplan-Meier curve using _lifelines_
#
# We can have greater control over the KM curve using the _lifelines_ package. This package
# follows the coding style of packages like _scikit-learn_: we first create a fitting object,
# then fit the model to the data, and then plot it.
#
# This plotting uses a matplotlib backend.

# %% lifelines
from lifelines import KaplanMeierFitter

# Create the fitting object and fit it to the data in one chained call.
kmf = KaplanMeierFitter().fit(time_event, event_observed=censoring)
# No at-risk table at first.
kmf.plot(at_risk_counts=False)
plt.title('Kaplan-Meier Curve')
plt.show()

# %% [markdown]
# # Kaplan-Meier curve using _lifelines_
#
# We can clean this curve up a bit.

# %% Kaplan-Meier via lifelines
ax = kmf.plot()
ax.set_xlabel('days')
ax.set_ylabel('Probability of survival')
import os
import pickle
import sys

import numpy as np
from lifelines import KaplanMeierFitter
from select_common import select_segments_broad, read_annotations

# Input and output locations, resolved relative to the script's first sys.path entry.
data_dir = os.path.normpath(os.path.join(sys.path[0], '../data-new/'))
out_file = os.path.join(data_dir, 'interim', 'survival-model.pickle')

annotations = read_annotations(data_dir)

durations = []
observeds = []
for annotation in annotations.values():
    (indices, Y) = select_segments_broad(annotation)
    # Every section except the last one counts as an observed event ...
    for section in Y[:-1]:
        durations.extend(section)
        observeds.extend([True] * len(section))
    # ... while the last section is recorded as censored (not observed).
    durations.extend(Y[-1])
    observeds.extend([False] * len(Y[-1]))

# Fit the Kaplan-Meier estimator and persist it for later use.
km = KaplanMeierFitter().fit(durations, event_observed=observeds)
with open(out_file, 'wb') as out_handle:
    pickle.dump(km, out_handle)
#### EDIT THIS TO CHANGE WHICH SEASON YOU TRAIN THE MODEL ON ####
trainingSeasons = [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]
trainData = allSeasonsByEpisode.loc[
    allSeasonsByEpisode["Season"].isin(trainingSeasons)]

# One row per ID: first recorded age, last 'End' value, and whether
# exactly one 'Out' was recorded.
kmfdata = pd.DataFrame({
    'Age': allSeasonsByEpisode.groupby(['ID'])['Age'].first(),
    'Duration': allSeasonsByEpisode.groupby(['ID'])['End'].max(),
    'Observed': allSeasonsByEpisode.groupby(['ID'])['Out'].sum() == 1,
})

# Overall fit on everyone.
kmf = KaplanMeierFitter()
kmf.fit(kmfdata["Duration"], event_observed=kmfdata["Observed"])

# Segment the survival curve by age group.
ax = plt.subplot(111)
age0 = kmfdata["Age"] < 30
age1 = (kmfdata["Age"] >= 30) & (kmfdata["Age"] < 40)
age2 = kmfdata["Age"] >= 40

# The same fitter is refitted for each segment and drawn on the shared axes.
for age_mask, age_label in ((age0, "Age < 30"), (age1, " 30 <= Age < 40")):
    kmf.fit(kmfdata["Duration"][age_mask],
            event_observed=kmfdata["Observed"][age_mask],
            label=age_label)
    kmf.plot(ax=ax)
def run_kaplan_meier(run_parameters):
    """ save the lifelines kaplan-meier graphical analysis and p-value to two files

    Args:
        run_parameters: dict with keys:
            results_directory
            phenotype_file_name (containing the following column names)
            cluster_id
            event_id
            time_id

    Returns:
        None

    Writes:
        two time-stamped files named after the phenotype file and "kaplan-meier":
        "png" image (saved with dpi=100) of the lifelines kaplan-meier graphical analysis
        one cell dataframe with the p-value of the multivariate logrank test
    """
    results_directory = run_parameters['results_directory']
    phenotype_file_name = run_parameters['phenotype_file_name']
    cluster_id = run_parameters['cluster_id']
    event_id = run_parameters['event_id']
    time_id = run_parameters['time_id']

    phenotype_df = kn.get_spreadsheet_df(phenotype_file_name)
    T = phenotype_df[time_id]
    C = phenotype_df[event_id]

    # Multivariate log-rank test across all clusters.
    results = multivariate_logrank_test(T, phenotype_df[cluster_id], C, alpha=0.99)
    p_value = '%g' % results.p_value
    test_name = 'multivariate_logrank_test'

    Clusters = sorted(phenotype_df[cluster_id].unique())
    num_clusters = len(Clusters)

    # One survival curve per cluster on a fresh figure.
    plt.clf()
    ax = plt.subplot(111)
    kmf = KaplanMeierFitter()
    for cluster in Clusters:
        ixc = phenotype_df[cluster_id] == cluster
        # `.ix` was removed in pandas 1.0; `.loc` accepts the same boolean mask.
        # The legend label is the cluster id shifted by one.
        kmf.fit(T.loc[ixc], C.loc[ixc], label=cluster + 1)
        kmf.plot(ax=ax, show_censors=True, ci_show=False)

    plt.title('number of clusters = %s' % (num_clusters))
    plt.xlabel('Time (days)')
    plt.ylabel('OS')

    # Write the p-value as a one-cell spreadsheet ...
    transform_name = "kaplan_meier"
    kaplan_meier_spreadsheet_df = pd.DataFrame(data=p_value,
                                               index=[test_name],
                                               columns=['p_value'])
    write_transform_df(kaplan_meier_spreadsheet_df, phenotype_file_name,
                       transform_name + '_p_value', results_directory)

    # ... and the figure as a png next to it.
    result_name = get_outfile_name(results_directory,
                                   phenotype_file_name,
                                   transform_name + '_graphic',
                                   file_ext='png')
    plt.savefig(result_name, dpi=100)
def multivariate_logrank_test(
        event_durations, groups, event_observed=None, t_0=-1, weightings=None, **kwargs) -> StatisticalResult:  # pylint: disable=too-many-locals
    r"""
    This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
    be equal when n=2):

    .. math::
        \begin{align}
        & H_0: h_1(t) = h_2(t) = h_3(t) = ... = h_n(t) \\
        & H_A: \text{there exists at least one group that differs from the others.}
        \end{align}

    Parameters
    ----------
    event_durations: iterable
        a (n,) list-like representing the (possibly partial) durations of all individuals
    groups: iterable
        a (n,) list-like of unique group labels for each individual.
    event_observed: iterable, optional
        a (n,) list-like of event_observed events: 1 if observed death, 0 if censored. Defaults to all observed.
    t_0: float, optional (default=-1)
        the period under observation, -1 for all time.
    weightings: str, optional
        apply a weighted logrank test: options are "wilcoxon" for Wilcoxon (also known as Breslow), "tarone-ware"
        for Tarone-Ware, "peto" for Peto test and "fleming-harrington" for Fleming-Harrington test.
        These are useful for testing for early or late differences in the survival curve. For the Fleming-Harrington
        test, keyword arguments p and q must also be provided with non-negative values.

        Weightings are applied at the ith ordered failure time, :math:`t_{i}`, according to:
            Wilcoxon: :math:`n_i`
            Tarone-Ware: :math:`\sqrt{n_i}`
            Peto: :math:`\bar{S}(t_i)`
            Fleming-Harrington: :math:`\hat{S}(t_i)^p \times (1 - \hat{S}(t_i))^q`

            where :math:`n_i` is the number at risk just prior to time :math:`t_{i}`, :math:`\bar{S}(t_i)` is
            Peto-Peto's modified survival estimate and :math:`\hat{S}(t_i)` is the left-continuous
            Kaplan-Meier survival estimate at time :math:`t_{i}`.
    kwargs:
        add keywords and meta-data to the experiment summary. ``test_name`` defaults to
        "multivariate_logrank_test"; ``p`` and ``q`` are read for the Fleming-Harrington weighting.

    Returns
    -------
    StatisticalResult
       a StatisticalResult object with properties ``p_value``, ``summary``, ``test_statistic``, ``print_summary``

    Raises
    ------
    ValueError
        if ``weightings`` is not one of the supported values, or if "fleming-harrington" is requested
        without ``p``/``q`` or with a negative ``p``/``q``.
    AssertionError
        if ``event_durations`` and ``event_observed`` differ in length, or if the observed-minus-expected
        vector does not sum to (numerically) zero.

    Examples
    --------

    .. code:: python

        df = pd.DataFrame({
           'durations': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
           'events': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
           'groups': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        })
        result = multivariate_logrank_test(df['durations'], df['groups'], df['events'])
        result.test_statistic
        result.p_value
        result.print_summary()

        # numpy example
        G = [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
        T = [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7]
        E = [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0]
        result = multivariate_logrank_test(T, G, E)
        result.test_statistic


    See Also
    --------
    pairwise_logrank_test
    logrank_test
    """
    kwargs.setdefault("test_name", "multivariate_logrank_test")

    event_durations, groups = np.asarray(event_durations), np.asarray(groups)
    if event_observed is None:
        # no censoring information given: treat every event as observed
        event_observed = np.ones((event_durations.shape[0], 1))
    else:
        event_observed = np.asarray(event_observed)

    n = np.max(event_durations.shape)
    # NOTE(review): n is derived from event_durations, so the first comparison is always true;
    # the length of `groups` is not compared here (the reshape below fails if it differs).
    assert n == np.max(event_durations.shape) == np.max(
        event_observed.shape), "inputs must be of the same length."
    # flatten every input to a 1-d Series of length n
    groups, event_durations, event_observed = map(
        lambda x: pd.Series(np.asarray(x).reshape(n)),
        [groups, event_durations, event_observed])

    # presumably `rm` holds removals and `obs` observed events per time (rows) and group (columns)
    # -- verify against group_survival_table_from_events
    unique_groups, rm, obs, _ = group_survival_table_from_events(
        groups, event_durations, event_observed, limit=t_0)
    n_groups = unique_groups.shape[0]

    # compute the factors needed
    n_ij = rm.sum(0).values - rm.cumsum(0).shift(1).fillna(0)
    d_i = obs.sum(1)
    n_i = rm.values.sum() - rm.sum(1).cumsum().shift(1).fillna(0)
    ev_i = n_ij.mul(d_i / n_i, axis="index")

    # compute weightings for log-rank alternatives
    # (each weighted variant also renames the test reported in the result)
    if weightings is None:
        w_i = np.ones(d_i.shape[0])
    elif weightings == "wilcoxon":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Wilcoxon")
        w_i = n_i
    elif weightings == "tarone-ware":
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Tarone-Ware")
        w_i = np.sqrt(n_i)
    elif weightings == "peto":
        kwargs["test_name"] = kwargs["test_name"].replace("logrank", "Peto")
        w_i = np.cumprod(1.0 - (ev_i.sum(1)) / (n_i + 1))  # Peto-Peto's modified survival estimates.
    elif weightings == "fleming-harrington":
        # p and q are mandatory for this weighting and must be non-negative
        if "p" in kwargs:
            p = kwargs["p"]
            if p < 0:
                raise ValueError("p must be non-negative.")
        else:
            raise ValueError(
                "Must provide keyword argument p for Flemington-Harrington test statistic"
            )
        if "q" in kwargs:
            q = kwargs["q"]
            if q < 0:
                raise ValueError("q must be non-negative.")
        else:
            raise ValueError(
                "Must provide keyword argument q for Flemington-Harrington test statistic"
            )
        kwargs["test_name"] = kwargs["test_name"].replace(
            "logrank", "Flemington-Harrington")
        # pooled Kaplan-Meier fit over all groups; the last entry is dropped
        kmf = KaplanMeierFitter().fit(event_durations,
                                      event_observed=event_observed)
        s = kmf.survival_function_.to_numpy().flatten(
        )[:-1]  # Left-continuous Kaplan-Meier survival estimate.
        w_i = np.power(s, p) * np.power(1.0 - s, q)
    else:
        raise ValueError("Invalid value for weightings.")

    # apply weights to observed and expected
    N_j = obs.mul(w_i, axis=0).sum(0).values
    ev = ev_i.mul(w_i, axis=0).sum(0)

    # vector of observed minus expected
    Z_j = N_j - ev

    assert abs(Z_j.sum(
    )) < 10e-8, "Sum is not zero."  # this should move to a test eventually.

    # compute covariance matrix
    factor = (((n_i - d_i) / (n_i - 1)).replace([np.inf, np.nan], 1)) * d_i / n_i**2
    # the extra "_" column carries the pooled at-risk counts n_i alongside the per-group ones,
    # so V below temporarily has one more row and column than there are groups
    n_ij["_"] = n_i.values
    V_ = (n_ij.mul(w_i, axis=0)).mul(np.sqrt(factor),
                                     axis="index").fillna(0)  # weighted V_
    V = -np.dot(V_.T, V_)
    ix = np.arange(n_groups)
    V[ix, ix] = V[ix, ix] - V[-1, ix]
    # drop the helper "_" row and column again
    V = V[:-1, :-1]

    # take the first n-1 groups
    U = Z_j.iloc[:-1] @ np.linalg.pinv(
        V[:-1, :-1]) @ Z_j.iloc[:-1]  # Z.T*inv(V)*Z

    # compute the p-values and tests
    p_value = _chisq_test_p_value(U, n_groups - 1)
    return StatisticalResult(p_value,
                             U,
                             t_0=t_0,
                             null_distribution="chi squared",
                             degrees_of_freedom=n_groups - 1,
                             **kwargs)
def plot_survival(self):
    """Plot Kaplan-Meier curves for several patient subgroups, then survival-time histograms.

    Loads up to 100000 rows through the parent class' ``load_data``, recodes
    the ER/PR status columns, and shows two figures:

    1. five stacked panels of Kaplan-Meier curves (radiation, ER status,
       PR status, stage, age), with survival time in years and
       ``STAT_REC == 4`` as the event indicator;
    2. histograms of the survival time in years, split on ``STAT_REC != 4``.

    Returns None.
    """
    df = super().load_data(
        col=[
            'YR_BRTH', 'AGE_DX', 'LATERAL', 'RADIATN', 'HISTREC', 'ERSTATUS',
            'PRSTATUS', 'BEHANAL', 'HST_STGA', 'NUMPRIMS', 'SRV_TIME_MON',
            'SRV_TIME_MON_PA', 'DTH_CLASS', 'O_DTH_CLASS', 'STAT_REC'
        ],
        cond='SRV_TIME_MON < 1000 AND HST_STGA < 8 AND DTH_CLASS < 9 AND ERSTATUS < 4 AND PRSTATUS < 4',
        sample_size=100000)
    kmf = KaplanMeierFitter()

    # Best-effort clean-up of the radiation code: fold 7 into 0, then keep
    # only codes below 7. Only the errors a missing or badly typed column can
    # raise are tolerated; anything else surfaces.
    try:
        df.RADIATN = df.RADIATN.replace(7, 0)
        df = df[df.RADIATN < 7]
    except (AttributeError, KeyError, TypeError):
        pass

    # Recode both receptor-status columns the same way:
    # 0-negative, 1-borderline, 2-positive (per the original author's note).
    for column in ('ERSTATUS', 'PRSTATUS'):
        df = df[df[column] != 4]
        df = df[df[column] != 9]
        df[column] = df[column].replace(2, 0).replace(1, 2).replace(3, 1)

    # Boolean masks for the subgroups to compare.
    rad = df.RADIATN > 0
    er = df.ERSTATUS > 0
    pr = df.PRSTATUS > 0
    st0 = df.HST_STGA == 0
    st1 = df.HST_STGA == 1
    st2 = df.HST_STGA == 2
    st4 = df.HST_STGA == 4
    age = df.AGE_DX < 50

    df['SRV_TIME_YR'] = df['SRV_TIME_MON'] / 12
    T = df['SRV_TIME_YR']
    C = df.STAT_REC == 4

    f, ax = plt.subplots(5, sharex=True, sharey=True)
    ax[0].set_title("Lifespans of cancer patients")

    # (panel index, subgroup mask, legend label), in drawing order.
    curves = [
        (0, rad, "Radiation"),
        (0, ~rad, "No Radiation"),
        (1, er, "ER Positive"),
        (1, ~er, "ER Negative"),
        (2, pr, "PR Positive"),
        (2, ~pr, "PR Negative"),
        (3, st0, "Stage 0"),
        (3, st1, "Stage 1"),
        (3, st2, "Stage 2"),
        (3, st4, "Stage 4"),
        (4, age, "Age < 50"),
        (4, ~age, "Age >= 50"),
    ]
    # The single fitter is refitted for every subgroup and drawn on its panel.
    for panel, mask, label in curves:
        kmf.fit(T[mask], event_observed=C[mask], label=label)
        kmf.plot(ax=ax[panel])

    for axis in ax:
        axis.legend(loc=3, prop={'size': 10})

    ax[-1].set_xlabel('Survival in years')
    f.text(0.04, 0.5, 'Survival %', va='center', rotation='vertical')
    plt.tight_layout()
    plt.ylim(0, 1)
    plt.show()

    # Histograms of survival time, split on STAT_REC != 4.
    f, ax = plt.subplots(2, sharex=True, sharey=True)
    df.hist('SRV_TIME_YR', by=df.STAT_REC != 4, ax=(ax[0], ax[1]))
    ax[0].set_title('Histogram of Non Censored Patients')
    ax[0].set_ylabel('Number of Patients')
    ax[1].set_ylabel('Number of Patients')
    ax[1].set_title('Histogram of Censored Patients')
    ax[1].set_xlabel('Survival in Years')
    plt.show()