def score(T_actual, labels, E_actual):
    """Summarise Kaplan-Meier results for the 'high', 'mid' and 'low' groups.

    ``labels`` is flattened with ``ravel()`` and compared against each group
    name to select members. One tuple per group is returned, in that order:
    (final survival value, median survival time, member count, latest time
    with ``E_actual == 1``). Groups without members, and the last-event slot
    of groups without events, are filled with NaN.
    """
    flat_labels = labels.ravel()
    results = []
    for group in ('high', 'mid', 'low'):
        in_group = flat_labels == group
        group_size = np.sum(in_group)
        if not group_size > 0:
            # Rpart might fail to produce this group at all.
            results.append((np.nan, np.nan, group_size, np.nan))
            continue

        times = T_actual[in_group]
        events = E_actual[in_group]
        fitter = KaplanMeierFitter()
        fitter.fit(times, events, label='{}'.format(group))

        # Latest observed event time, if the group has any event.
        last_event = np.max(times[events == 1]) if np.sum(events) > 0 else np.nan

        curve = fitter.survival_function_
        results.append((curve.iloc[-1, 0],
                        median_survival_times(curve),
                        group_size,
                        last_event))
    return results
def test_passing_in_left_censorship_creates_a_cumulative_density(self, sample_lifetimes):
    """A left-censored fit exposes a cumulative density and no survival function."""
    durations, observed = sample_lifetimes
    fitter = KaplanMeierFitter()
    fitter.fit(durations, observed, left_censorship=True)
    for attribute in ('cumulative_density_', 'plot_cumulative_density_'):
        assert hasattr(fitter, attribute)
    assert not hasattr(fitter, 'survival_function_')
def test_stat_error_is_raised_if_too_few_early_deaths(self):
    """Fitting with entry times one unit before each observation raises StatError."""
    death_times = np.array([1, 1, 1, 22, 30, 28, 32, 11, 14, 36, 31, 33, 33, 37,
                            35, 25, 31, 22, 26, 24, 35, 34, 30, 35, 40, 39, 2])
    entry_times = death_times - 1
    fitter = KaplanMeierFitter()
    with pytest.raises(StatError):
        fitter.fit(death_times, entry=entry_times)
def plot_KM(stime, censor, g1, pval, figname):
    """Draw Kaplan-Meier curves for two groups and save the figure as EPS.

    Parameters
    ----------
    stime, censor :
        Indexed with ``g1`` and ``~g1``; ``stime`` supplies the durations and
        ``censor`` is passed to the fitter as ``event_observed``.
    g1 :
        Mask selecting the "high-risk group"; its complement ``~g1`` is the
        "low-risk group".
    pval :
        Value printed on the axes in ``%.2e`` format.
    figname :
        Destination handed to ``plt.savefig`` (written with ``format='eps'``).

    Side effects: sets the seaborn style to 'white' and changes NumPy print
    options globally.
    """
    sns.set_style('white')
    kmf = KaplanMeierFitter()
    f, ax = plt.subplots(figsize=(3, 3))
    np.set_printoptions(precision=2, suppress=False)

    # The same fitter is refit for each group; each curve is drawn on `ax`
    # before the next fit replaces the estimate.
    for mask, label in ((g1, ["high-risk group"]), (~g1, ["low-risk group"])):
        kmf.fit(stime[mask], event_observed=censor[mask], label=label)
        kmf.plot(ax=ax, ci_show=False, show_censors=True)

    # Positional form on purpose: Matplotlib renamed the ``b=`` keyword to
    # ``visible=``, so ``grid(b=False)`` fails on recent releases while
    # ``grid(False)`` works on old and new ones alike.
    ax.grid(False)
    sns.despine()
    plt.ylim(0, 1)
    plt.xlabel("time", fontsize=14)
    plt.ylabel("survival", fontsize=14)
    plt.text(0.7, 0.85, 'pval = %.2e' % (pval), fontdict={'size': 12},
             horizontalalignment='center', verticalalignment='center',
             transform=ax.transAxes)
    plt.xticks(rotation=45)
    for item in (ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(10)
    plt.tight_layout()
    plt.savefig(figname, format='eps')
    plt.close()
def test_kmf_with_risk_counts(self, block):
    """Plot a fit of 100 exponential draws with the at-risk counts enabled."""
    samples = np.random.exponential(10, size=(100))
    fitter = KaplanMeierFitter()
    fitter.fit(samples)
    fitter.plot(at_risk_counts=True)

    pyplot = self.plt
    pyplot.title("test_kmf_with_risk_counts")
    pyplot.show(block=block)
def test_predict_method_returns_exact_value_if_given_an_observed_time(self):
    """``predict`` at an observed time equals the survival function at that time."""
    T = [1, 2, 3]
    kmf = KaplanMeierFitter()
    kmf.fit(T)
    time = 1
    # Label-based lookup on the timeline index; ``.loc`` replaces the ``.ix``
    # indexer, which has been removed from pandas.
    assert abs(kmf.predict(time) - kmf.survival_function_.loc[time].values) < 10e-8
def test_show_censor_with_discrete_date(self, block):
    """Plot censor markers for integer-valued (binomial) durations."""
    durations = np.random.binomial(20, 0.1, size=100)
    observed = np.random.binomial(1, 0.8, size=100)

    fitter = KaplanMeierFitter()
    fitter.fit(durations, observed)
    fitter.plot(show_censors=True)

    self.plt.title('test_show_censor_with_discrete_date')
    self.plt.show(block=block)
def test_show_censor_with_index_0(self, block):
    """Plot censor markers for binomial(20, 0.9) durations."""
    # lifelines should insert a 0 into the timeline automatically.
    durations = np.random.binomial(20, 0.9, size=100)
    observed = np.random.binomial(1, 0.8, size=100)

    fitter = KaplanMeierFitter()
    fitter.fit(durations, observed)
    fitter.plot(show_censors=True)

    self.plt.title('test_show_censor_with_index_0')
    self.plt.show(block=block)
def kaplanMeier(self):
    """Fit a Kaplan-Meier estimator and store it on ``self.kmf``.

    Durations are the ``.dt.days`` of column ``self.eventTime`` of
    ``self.inputDf``; column ``self.censorVar`` is passed as
    ``event_observed`` and ``self.label`` as the curve label.
    """
    from lifelines.estimation import KaplanMeierFitter

    frame = self.inputDf
    self.kmf = KaplanMeierFitter()
    durations_in_days = frame[self.eventTime].dt.days
    observed = frame[self.censorVar]
    self.kmf.fit(durations_in_days, event_observed=observed, label=self.label)
def test_kmf_minimum_observation_bias():
    """Plot a fit whose entry times are 1% of each duration."""
    sample_size = 250
    fitter = KaplanMeierFitter()
    durations, observed = exponential_survival_data(sample_size, 0.1, scale=10)
    entries = 0.01 * durations
    fitter.fit(durations, observed, entry=entries)
    fitter.plot()
    plt.title("Should have larger variances in the tails")
def test_flat_style_no_censor(self, block):
    """Plot a fit without censoring using the flat style and custom censor markers."""
    samples = np.random.exponential(10, size=200)
    marker_style = {'marker': '+', 'mew': 2, 'ms': 7}

    fitter = KaplanMeierFitter()
    fitter.fit(samples, label='test label 1')
    fitter.plot(flat=True, censor_styles=marker_style)

    self.plt.title('test_flat_style_no_censor')
    self.plt.show(block=block)
def test_kmf_left_censorship_stats(self):
    """Check the left-censored cumulative density against published values."""
    # from http://www.public.iastate.edu/~pdixon/stat505/Chapter%2011.pdf
    durations = [3, 5, 5, 5, 6, 6, 10, 12]
    observed = [1, 0, 0, 1, 1, 1, 0, 1]
    expected = np.array([0, 0.437500, 0.5833333, 0.875, 0.875, 1])

    fitter = KaplanMeierFitter()
    fitter.fit(durations, observed, left_censorship=True)

    density = fitter.cumulative_density_[fitter._label].values
    npt.assert_almost_equal(density, expected)
def test_negative_times_still_plots(self, block):
    """Plot a fit whose durations run from -2 to 3."""
    count = 40
    durations = np.linspace(-2, 3, count)
    observed = np.random.randint(2, size=count)

    fitter = KaplanMeierFitter()
    fitter.fit(durations, observed)
    fitter.plot()

    self.plt.title('test_negative_times_still_plots')
    self.plt.show(block=block)
def kmf_calculation(df, bucket):
    """Fit a Kaplan-Meier estimator on the rows of ``df`` in one bucket.

    Rows are selected where ``df.use_buckets == bucket``. The ``duration``
    column supplies the durations, ``churn`` is passed as ``event_observed``
    and ``bucket`` is used as the curve label.

    Returns the fitted ``KaplanMeierFitter``.
    """
    # Boolean-mask selection instead of feeding the 1-tuple returned by
    # ``np.where`` into ``.iloc``; selects the same rows.
    in_bucket = df.use_buckets == bucket
    T = df.loc[in_bucket, 'duration']
    C = df.loc[in_bucket, 'churn']
    kmf = KaplanMeierFitter()
    kmf.fit(T, event_observed=C, label=bucket)
    return kmf
def test_kmf_survival_curve_output_against_R(self):
    """Compare survival estimates on the g3 data with reference values from R.

    The 'RIT' group is checked at times 25 and 53, the remaining rows at
    times 9, 19, 32 and 34, each to three decimals.
    """
    df = load_g3()
    ix = df['group'] == 'RIT'
    kmf = KaplanMeierFitter()

    # ``.loc`` with a boolean mask replaces the ``.ix`` indexer, which has
    # been removed from pandas.
    expected = np.array([[0.909, 0.779]]).T
    kmf.fit(df.loc[ix, 'time'], df.loc[ix, 'event'], timeline=[25, 53])
    npt.assert_array_almost_equal(kmf.survival_function_.values, expected, decimal=3)

    expected = np.array([[0.833, 0.667, 0.5, 0.333]]).T
    kmf.fit(df.loc[~ix, 'time'], df.loc[~ix, 'event'], timeline=[9, 19, 32, 34])
    npt.assert_array_almost_equal(kmf.survival_function_.values, expected, decimal=3)
def test_shifting_durations_doesnt_affect_survival_function_values(self):
    """Shifting all durations by a constant leaves the survival values unchanged.

    For the negative shifts the comparison skips the first row of the
    unshifted result.
    """
    durations = np.random.exponential(10, size=100)
    fitter = KaplanMeierFitter()
    baseline = fitter.fit(durations).survival_function_.values

    shifted_up = fitter.fit(durations + 100).survival_function_.values
    npt.assert_almost_equal(baseline, shifted_up)

    for offset in (50, 200):
        shifted_down = fitter.fit(durations - offset).survival_function_.values
        npt.assert_almost_equal(baseline[1:], shifted_down)
def test_kmf_confidence_intervals_output_against_R(self):
    """Compare the 95% confidence bounds for the non-'RIT' rows with R output."""
    # this uses conf.type = 'log-log'
    df = load_g3()
    ix = df['group'] != 'RIT'
    kmf = KaplanMeierFitter()
    # ``.loc`` with a boolean mask replaces the ``.ix`` indexer, which has
    # been removed from pandas.
    kmf.fit(df.loc[ix, 'time'], df.loc[ix, 'event'], timeline=[9, 19, 32, 34])

    expected_lower_bound = np.array([0.2731, 0.1946, 0.1109, 0.0461])
    npt.assert_array_almost_equal(kmf.confidence_interval_['KM_estimate_lower_0.95'].values,
                                  expected_lower_bound, decimal=3)

    expected_upper_bound = np.array([0.975, 0.904, 0.804, 0.676])
    npt.assert_array_almost_equal(kmf.confidence_interval_['KM_estimate_upper_0.95'].values,
                                  expected_upper_bound, decimal=3)
class survModel:
    """Holds a data frame plus the column names needed for a Kaplan-Meier fit."""

    def __init__(self, inputDf, eventTime, censorVar, label):
        """Store the frame, the two column names and the curve label unchanged."""
        self.inputDf, self.eventTime = inputDf, eventTime
        self.censorVar, self.label = censorVar, label

    def kaplanMeier(self):
        """Fit a Kaplan-Meier estimator and keep it on ``self.kmf``.

        Durations are the ``.dt.days`` of column ``self.eventTime``; column
        ``self.censorVar`` is passed as ``event_observed``.
        """
        from lifelines.estimation import KaplanMeierFitter

        frame = self.inputDf
        self.kmf = KaplanMeierFitter()
        durations_in_days = frame[self.eventTime].dt.days
        observed = frame[self.censorVar]
        self.kmf.fit(durations_in_days, event_observed=observed, label=self.label)
def test_seaborn_doesnt_cause_kmf_plot_error(self, block, kmf, capsys):
    """Plotting after importing seaborn must leave stderr empty."""
    import seaborn as sns  # imported only for its side effects on plotting

    waltons = load_waltons()
    fitter = KaplanMeierFitter()
    fitter.fit(waltons['T'], event_observed=waltons['E'])
    fitter.plot()

    self.plt.title('test_seaborn_doesnt_cause_kmf_plot_error')
    self.plt.show(block=block)

    captured_err = capsys.readouterr()[1]
    assert captured_err == ""
def test_kmf_left_censorship_plots(self, block):
    """Plot left-censored fits for two groups of the lcd data on one axes."""
    fitter = KaplanMeierFitter()
    lcd = load_lcd()
    by_group = {name: lcd.loc[lcd['group'] == name]
                for name in ('alluvial_fan', 'basin_trough')}

    fan = by_group['alluvial_fan']
    fitter.fit(fan['T'], fan['C'], left_censorship=True, label='alluvial_fan')
    axes = fitter.plot()

    trough = by_group['basin_trough']
    fitter.fit(trough['T'], trough['C'], left_censorship=True, label='basin_trough')
    fitter.plot(ax=axes)

    self.plt.title("test_kmf_left_censorship_plots")
    self.plt.show(block=block)
def test_flat_style_and_marker(self, block):
    """Plot two censored fits in flat style, each with its own censor marker."""
    # Draw order is kept (both durations, then both event vectors) so seeded
    # runs produce the same samples.
    durations_a = np.random.exponential(10, size=200)
    durations_b = np.random.exponential(2, size=200)
    observed_a = np.random.binomial(1, 0.9, size=200)
    observed_b = np.random.binomial(1, 0.95, size=200)

    fitter = KaplanMeierFitter()
    fitter.fit(durations_a, observed_a, label='test label 1')
    axes = fitter.plot(flat=True, censor_styles={'marker': '+', 'mew': 2, 'ms': 7})

    fitter.fit(durations_b, observed_b, label='test label 2')
    fitter.plot(ax=axes, censor_styles={'marker': 'o', 'ms': 7}, flat=True)

    self.plt.title("testing kmf flat styling + marker")
    self.plt.show(block=block)
def test_kmf_left_censorship_plots(self, block):
    """Plot left-censored fits for two groups of the lcd data on one axes.

    Skipped when matplotlib cannot be imported.
    """
    matplotlib = pytest.importorskip("matplotlib")
    from matplotlib import pyplot as plt

    kmf = KaplanMeierFitter()
    lcd_dataset = load_lcd()
    # ``.loc`` with a boolean mask replaces the ``.ix`` indexer, which has
    # been removed from pandas.
    alluvial_fan = lcd_dataset.loc[lcd_dataset['group'] == 'alluvial_fan']
    basin_trough = lcd_dataset.loc[lcd_dataset['group'] == 'basin_trough']

    kmf.fit(alluvial_fan['T'], alluvial_fan['C'], left_censorship=True, label='alluvial_fan')
    ax = kmf.plot()
    kmf.fit(basin_trough['T'], basin_trough['C'], left_censorship=True, label='basin_trough')
    ax = kmf.plot(ax=ax)
    plt.show(block=block)
    return
def test_predict_method_returns_gives_values_prior_to_the_value_in_the_survival_function(
        self):
    """``predict`` between observed times returns the value at the preceding time."""
    T = [1, 2, 3]
    kmf = KaplanMeierFitter()
    kmf.fit(T)
    # Label-based lookups on the timeline index; ``.loc`` replaces the ``.ix``
    # indexer, which has been removed from pandas.
    assert abs(kmf.predict(0.5) - kmf.survival_function_.loc[0].values) < 10e-8
    assert abs(kmf.predict(1.9999) - kmf.survival_function_.loc[1].values) < 10e-8
def test_kmf_left_censorship_plots(self):
    """Plot left-censored fits for two groups of the lcd data on one axes.

    Skipped when matplotlib cannot be imported.
    """
    matplotlib = pytest.importorskip("matplotlib")
    from matplotlib import pyplot as plt

    kmf = KaplanMeierFitter()
    lcd_dataset = load_lcd()
    # ``.loc`` with a boolean mask replaces the ``.ix`` indexer, which has
    # been removed from pandas.
    alluvial_fan = lcd_dataset.loc[lcd_dataset['group'] == 'alluvial_fan']
    basin_trough = lcd_dataset.loc[lcd_dataset['group'] == 'basin_trough']

    kmf.fit(alluvial_fan['T'], alluvial_fan['C'], left_censorship=True, label='alluvial_fan')
    ax = kmf.plot()
    kmf.fit(basin_trough['T'], basin_trough['C'], left_censorship=True, label='basin_trough')
    ax = kmf.plot(ax=ax)
    plt.show()
    return
def test_kmf_left_censorship_plots(self, block):
    """Plot left-censored fits for two groups of the lcd data on one axes."""
    fitter = KaplanMeierFitter()
    lcd = load_lcd()
    fan_rows = lcd.loc[lcd["group"] == "alluvial_fan"]
    trough_rows = lcd.loc[lcd["group"] == "basin_trough"]

    fitter.fit(fan_rows["T"], fan_rows["C"], left_censorship=True, label="alluvial_fan")
    shared_axes = fitter.plot()

    fitter.fit(trough_rows["T"], trough_rows["C"], left_censorship=True, label="basin_trough")
    fitter.plot(ax=shared_axes)

    pyplot = self.plt
    pyplot.title("test_kmf_left_censorship_plots")
    pyplot.show(block=block)
def plot_KM(stime, censor, g1, pval, figname):
    """Draw Kaplan-Meier curves for two groups and save the figure as EPS.

    Parameters
    ----------
    stime, censor :
        Indexed with ``g1`` and ``~g1``; ``stime`` supplies the durations and
        ``censor`` is passed to the fitter as ``event_observed``.
    g1 :
        Mask selecting the "high-risk group"; its complement ``~g1`` is the
        "low-risk group".
    pval :
        Value printed on the axes in ``%.2e`` format.
    figname :
        Destination handed to ``plt.savefig`` (written with ``format='eps'``).

    Side effects: sets the seaborn style to 'white' and changes NumPy print
    options globally.
    """
    sns.set_style('white')
    kmf = KaplanMeierFitter()
    f, ax = plt.subplots(figsize=(3, 3))
    np.set_printoptions(precision=2, suppress=False)

    # The same fitter is refit for each group; each curve is drawn on `ax`
    # before the next fit replaces the estimate.
    for mask, label in ((g1, ["high-risk group"]), (~g1, ["low-risk group"])):
        kmf.fit(stime[mask], event_observed=censor[mask], label=label)
        kmf.plot(ax=ax, ci_show=False, show_censors=True)

    # Positional form on purpose: Matplotlib renamed the ``b=`` keyword to
    # ``visible=``, so ``grid(b=False)`` fails on recent releases while
    # ``grid(False)`` works on old and new ones alike.
    ax.grid(False)
    sns.despine()
    plt.ylim(0, 1)
    plt.xlabel("time", fontsize=14)
    plt.ylabel("survival", fontsize=14)
    plt.text(0.7, 0.85, 'pval = %.2e' % (pval), fontdict={'size': 12},
             horizontalalignment='center', verticalalignment='center',
             transform=ax.transAxes)
    plt.xticks(rotation=45)
    for item in (ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(10)
    plt.tight_layout()
    plt.savefig(figname, format='eps')
    plt.close()
def test_seaborn_doesnt_cause_kmf_plot_error(self, block, kmf, capsys):
    """Plotting after importing seaborn must leave stderr empty."""
    import seaborn as sns  # imported only for its side effects on plotting

    waltons = load_waltons()
    durations, observed = waltons["T"], waltons["E"]

    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=observed)
    fitter.plot()

    self.plt.title("test_seaborn_doesnt_cause_kmf_plot_error")
    self.plt.show(block=block)

    stderr_text = capsys.readouterr()[1]
    assert stderr_text == ""
def survival(request, indel_id):
    """Return Kaplan-Meier curves for altered vs. unaltered samples as JSON.

    The gene is looked up from the ``Indel`` with primary key ``indel_id``
    (404 if missing). Query parameters:

    * ``cancer`` -- directory name under ``settings.SURVIVAL_ROOT``.
    * ``threshold`` -- float, default 2.0; rows with ``|expression|`` at or
      above it form the 'alterations' group, rows below it the
      'no_alterations' group.

    Responses: 404 when ``cancer`` is missing or invalid or the data file does
    not exist, 400 when ``threshold`` is not a number, otherwise a JSON object
    with time / upper / lower / survival lists per group and the log-rank
    ``p_value``.
    """
    import os

    gene = get_object_or_404(Indel, pk=indel_id).related_gene.id

    # `cancer` comes straight from the query string and becomes part of a
    # filesystem path, so only a single plain path component is accepted
    # (blocks "../" traversal and absolute paths).
    cancer = request.GET.get('cancer')
    if not cancer or cancer in ('.', '..') or os.path.basename(cancer) != cancer:
        return HttpResponseNotFound()

    try:
        threshold = float(request.GET.get('threshold', '2.0'))
    except ValueError:
        return JsonResponse({'error': 'threshold must be a number'}, status=400)

    try:
        clinical = pd.read_table('{}/{}/{}.txt'.format(settings.SURVIVAL_ROOT,
                                                       cancer, gene))
    except FileNotFoundError:
        return HttpResponseNotFound()

    # Two explicit comparisons (not a mask and its negation) so rows with a
    # missing expression value stay out of both groups.
    magnitude = clinical['expression'].abs()
    groups = (
        ('alterations', clinical.loc[magnitude >= threshold].copy()),
        ('no_alterations', clinical.loc[magnitude < threshold].copy()),
    )

    payload = {}
    for name, frame in groups:
        fitted = KaplanMeierFitter().fit(frame['OS_MONTHS'], frame['EVENT'], label=name)
        intervals = fitted.confidence_interval_
        payload[name + '_time'] = fitted.survival_function_.index.tolist()
        payload[name + '_upper'] = intervals[name + '_upper_0.95'].fillna(1).tolist()
        payload[name + '_lower'] = intervals[name + '_lower_0.95'].fillna(1).tolist()
        payload[name + '_survival'] = fitted.survival_function_[name].tolist()

    alterations, no_alterations = groups[0][1], groups[1][1]
    summary = logrank_test(alterations['OS_MONTHS'], no_alterations['OS_MONTHS'],
                           alterations['EVENT'], no_alterations['EVENT'],
                           alpha=0.99)
    payload['p_value'] = summary.p_value
    return JsonResponse(payload)
def test_kaplan_meier_with_censorship(self, sample_lifetimes):
    """The fitted survival values match ``self.kaplan_meier`` on censored data."""
    durations, observed = sample_lifetimes
    reference = self.kaplan_meier(durations, observed)
    fitter = KaplanMeierFitter()
    fitter.fit(durations, observed)
    npt.assert_almost_equal(fitter.survival_function_.values, reference)
def uw_tier_histplots():
    """Plot Kaplan-Meier curves of IPO duration per underwriter tier and save a PDF.

    Reads and mutates the module-level ``sample`` frame (adds the
    'Underwriter Tier' and 'IPO Duration' columns), annotates quartiles for
    the "-1" and "9" tiers, adds the Kruskal-Wallis statistic and writes
    "IPO_tiers_KP_survival.pdf".
    """
    sample['Underwriter Tier'] = sample['lead_underwriter_tier']
    sample['IPO Duration'] = sample['IPO_duration']

    ranks = ["-1", "0+", "7+", "9"]
    durations_by_tier = [
        sample[sample.lead_underwriter_tier == tier]['IPO_duration'] for tier in ranks
    ]
    kwstat = kruskalwallis(*durations_by_tier)

    from lifelines.estimation import KaplanMeierFitter
    from lifelines.statistics import logrank_test
    import matplotlib.pyplot as plt

    ranklabels = ['No Underwriter', 'Low Rank', 'Mid Rank', 'Rank 9 (elite)']
    line_width = 1  # annotation line thickness

    # (text template, curve height) for the three annotated quartiles, in the
    # same order as the reversed percentile list computed below.
    quartile_marks = (("75%: {} days", 0.25), ("50%: {} days", 0.50), ("25%: {} days", 0.75))
    # Per-tier (x, y) text offsets; tiers not listed get no annotation.
    text_offsets = {
        "-1": ((145, .04), (145, .04), (145, 0.04)),
        "9": ((415, .1), (290, .1), (165, 0.1)),
    }

    kmf = KaplanMeierFitter()
    f, ax = plt.subplots(1, 1, figsize=(12, 4), sharex=True)

    for rank, rlabel, color in zip(ranks, ranklabels, cp_four("cool_r")):
        uw = sample[sample.lead_underwriter_tier == rank]
        kmf.fit(uw['IPO_duration'],
                label='{} N={}'.format(rlabel, len(uw)),
                alpha=0.9)
        kmf.plot(ax=ax, c=color, alpha=0.7)

        quartiles = [int(np.percentile(kmf.durations, pct)) for pct in [25, 50, 75]][::-1]
        aprops = dict(facecolor=color, width=line_width, headwidth=line_width)

        for (template, height), days, (dx, dy) in zip(quartile_marks, quartiles,
                                                      text_offsets.get(rank, ())):
            plt.annotate(template.format(days),
                         (days, height),
                         xytext=(days + dx, height + dy),
                         arrowprops=aprops)

    plt.annotate("Kruskall Wallis\nH: {:.3f}\nprob: {:.3f}".format(*kwstat),
                 (960, 0.1))
    plt.ylim(0, 1)
    plt.xlim(0, 1095)
    plt.title("Kaplan-Meier survival times by bank tier")
    plt.xlabel("IPO Duration (days)")
    plt.ylabel(r"$S(t)=Pr(T>t)$")
    plt.savefig("IPO_tiers_KP_survival.pdf", format='pdf', dpi=200)
def test_predict_method_returns_gives_values_prior_to_the_value_in_the_survival_function(self):
    """``predict`` between observed times returns the value at the preceding time."""
    T = [1, 2, 3]
    kmf = KaplanMeierFitter()
    kmf.fit(T)
    # Label-based lookups on the timeline index; ``.loc`` replaces the ``.ix``
    # indexer, which has been removed from pandas.
    assert abs(kmf.predict(0.5) - kmf.survival_function_.loc[0].values) < 10e-8
    assert abs(kmf.predict(1.9999) - kmf.survival_function_.loc[1].values) < 10e-8
def plot_kaplan_function(duration_key):
    """Plot Kaplan-Meier curves of one duration column for ``above`` and ``under``.

    Parameters
    ----------
    duration_key :
        Column of the module-level ``above`` / ``under`` frames to analyse.
        The argument used to be overwritten unconditionally with
        "days_to_first_price_change"; it is now honoured, and that column is
        only the fallback when ``None`` is passed.

    Draws both curves on one axes, annotates the quartiles of each group's
    durations, and sets limits, title and axis labels. Nothing is returned.
    """
    from lifelines.estimation import KaplanMeierFitter
    import matplotlib.pyplot as plt

    duration_keys = ["days_from_priced_to_listing",
                     "days_to_final_price_revision",
                     # "days_to_first_price_update",
                     "days_from_s1_to_listing",
                     "days_to_first_price_change"]
    if duration_key is None:
        duration_key = duration_keys[-1]

    kmf = KaplanMeierFitter()
    f, ax = plt.subplots(1, 1, figsize=(12, 4), sharex=True)
    T = 1           # annotation line thickness
    xoffset = 0.4   # annotation offset (x-axis)
    yoffset = 0.04

    def _annotate_quartiles(color, extra_y):
        """Annotate the 75/50/25% marks of the currently fitted durations."""
        quartiles = [int(np.percentile(kmf.durations, x)) for x in [25, 50, 75]][::-1]
        aprops = dict(facecolor=color, width=T, headwidth=T)
        marks = (("75%: {} days", 0.25), ("50%: {} days", 0.50), ("25%: {} days", 0.75))
        for (template, height), days in zip(marks, quartiles):
            plt.annotate(template.format(days),
                         (days, height),
                         xytext=(days + xoffset, height + yoffset + extra_y),
                         arrowprops=aprops)

    # Above filing price range
    kmf.fit(above[duration_key],
            label='Upward Price Amendment: N={}'.format(len(above)),
            alpha=0.9)
    kmf.plot(ax=ax, c=colors[5], alpha=0.7)
    _annotate_quartiles(colors[5], 0)

    # Under filing price range; labels are raised a further 0.05 so they do
    # not sit on top of the first group's annotations.
    kmf.fit(under[duration_key],
            label='Downward Price Amendment: N={}'.format(len(under)))
    kmf.plot(ax=ax, c=colors[2], alpha=0.7)
    _annotate_quartiles(colors[2], 0.05)

    # General graph labels.
    # TODO: a log-rank test annotation between the groups was sketched here
    # but never enabled.
    plt.ylim(0, 1)
    plt.xlim(0, max(np.percentile(above[duration_key], 90),
                    np.percentile(under[duration_key], 90)))
    plt.title("Kaplan-Meier Survival Functions")
    plt.xlabel("Delay (days) in {}".format(duration_key))
    plt.ylabel(r"$S(t)=Pr(T>t)$")
# print(df.head())
print(df['lenfol'].describe())

# Look at the follow-up length of the rows with fstat > 0.
dead = df[df['fstat'] > 0]
dead.hist(bins=20, column='lenfol')
plt.show()

# Plot the normalised cumulative histogram (empirical CDF) of those follow-up
# lengths. ``density=True`` replaces ``normed=1``, which Matplotlib removed.
dead.hist(bins=100, column='lenfol', cumulative=True, density=True)
plt.show()

# Plot the survival curve for all rows, evaluated on a fixed 0..2500 timeline.
kaplen_meier = KaplanMeierFitter()
time_of_event = df['lenfol']
event = df['fstat']
time = np.linspace(0, 2500, 100)
kaplen_meier.fit(time_of_event, timeline=time, event_observed=event, label='All patients')
kaplen_meier.plot()
plt.show()

# Stratify: rows with chf == 1 (congestive heart complications).
history = df['chf'] == 1
kaplen_meier = KaplanMeierFitter()
kaplen_meier.fit(time_of_event[history], timeline=time, event_observed=event[history],
                 label='Congestive heart complications')
ax = kaplen_meier.plot()
print("White Medium Hazard: %.2f" % (math.exp(0.7736)))

# Cox model fitted on the duration/event columns only.
# (A dangling, unterminated `df5 = df3[df3['']` statement was removed here: it
# was a syntax error and its target was reassigned on the next statement.)
df5 = df3[['duration', 'event']]
cph.fit(df=df5, duration_col='duration', event_col='event')
cph.predict_survival_function(X=df5).plot()

# Kaplan-Meier plots
from lifelines.estimation import KaplanMeierFitter
kmf = KaplanMeierFitter()
df6 = df3[['duration', 'event']]
kmf.fit(df6['duration'], df6['event'])
kmf.plot()

# Survival curve restricted to rows with race_factor == 'African-American'
# and score_factor == 'Low'.
df6a = df3[df3['race_factor'] == 'African-American']
df6a = df6a[df6a['score_factor'] == 'Low']
df6b = df6a[['duration', 'event']]
kmf.fit(df6b['duration'], df6b['event'])
kmf.plot()
get_ipython().magic(u'R -o p') # Render HTML HTML(p[0]) # The `y axis` represents the probability a patient is still alive at time $t$ weeks. We see a steep drop off within the first 100 weeks, and then observe the curve flattening. The dotted lines represent the 95% confidence intervals. # ### Using Python # We will now replicate the above steps using python. Above, we have already specified a variable `tongues` that holds the data in a pandas dataframe. # In[15]: from lifelines.estimation import KaplanMeierFitter kmf = KaplanMeierFitter() # The method takes the same parameters as it's R counterpart, a time vector and a vector indicating which observations are observed or censored. The model fitting sequence is similar to the [scikit-learn](http://scikit-learn.org/stable/) api. # In[16]: f = tongue.type==1 T = tongue[f]['time'] C = tongue[f]['delta'] kmf.fit(T, event_observed=C) # To get a plot with the confidence intervals, we simply can call `plot()` on our `kmf` object.
def test_predict_methods_returns_a_scalar_or_a_array_depending_on_input(self, sample_lifetimes):
    """``predict`` gives a non-iterable for one time and an iterable for a list."""
    durations = sample_lifetimes[0]
    fitter = KaplanMeierFitter()
    fitter.fit(durations)

    single = fitter.predict(1)
    several = fitter.predict([1, 2])
    assert not isinstance(single, Iterable)
    assert isinstance(several, Iterable)
def overall_survival_analysis(data):
    """Compute overall-survival statistics with a Kaplan-Meier fit.

    Parameters
    ----------
    data : dict
        Maps a patient key to a record with the keys 'VitalStatus' and
        'SurvivalTime'. Only records whose vital status is 'Alive' (event 0)
        or 'Dead' (event 1) and whose survival time is not a string are used.

    Returns
    -------
    dict
        'Coordinates' -- ``[time, survival]`` pairs of the fitted curve;
        'Median' -- rounded median survival, or ``None`` when the fitted
        median is not finite; '1Year', '3Year', '5Year' -- rounded survival
        percentages predicted at t=12, 36 and 60.

    Raises
    ------
    ValueError
        If no record is usable.
    """
    import math

    rows = []
    # Iterate under a separate name so the `data` argument is not shadowed.
    for record in data.values():
        vital_status = record['VitalStatus']
        survival_time = record['SurvivalTime']
        # The old test `v_status == 'Alive' or 'Dead'` was always true, so any
        # other status was silently counted as a death.
        if vital_status not in ('Alive', 'Dead'):
            continue
        if isinstance(survival_time, str):
            continue
        rows.append([0 if vital_status == 'Alive' else 1, survival_time])

    if not rows:
        raise ValueError("no patients with a usable vital status and survival time")

    df = pd.DataFrame(rows, columns=['Event', 'Duration'])
    kmf = KaplanMeierFitter()
    kmf.fit(durations=df.Duration, event_observed=df.Event)

    survival_fx = kmf.survival_function_
    coordinates = [[x, y] for x, y in zip(survival_fx.index.tolist(),
                                          survival_fx.values.flatten())]

    # A non-finite median cannot be rounded to an int; report None instead of
    # crashing.
    median = kmf.median_
    surv_median = int(round(median)) if math.isfinite(median) else None

    return {
        'Coordinates': coordinates,
        'Median': surv_median,
        '1Year': int(round(kmf.predict(12) * 100)),
        '3Year': int(round(kmf.predict(36) * 100)),
        '5Year': int(round(kmf.predict(60) * 100)),
    }
def progression_free_analysis(data):
    """Compute progression-free survival statistics with a Kaplan-Meier fit.

    Parameters
    ----------
    data : dict
        Maps a patient key to a record with the keys 'VitalStatus',
        'DiseaseProgression' and 'SurvivalTime'. Records are used only when
        both status fields are not ``None`` and the survival time is not a
        string. The event is 0 when the vital status is 'Alive' or the
        progression field equals the string 'False', otherwise 1.

    Returns
    -------
    dict
        'Coordinates' -- ``[time, survival]`` pairs of the fitted curve;
        'Median' -- rounded median, or ``None`` when the fitted median is not
        finite; '1Year', '3Year', '5Year' -- rounded percentages predicted at
        t=12, 36 and 60.

    Raises
    ------
    ValueError
        If no record is usable.
    """
    import math

    rows = []
    # Iterate under a separate name so the `data` argument is not shadowed.
    for record in data.values():
        vital_status = record['VitalStatus']
        progression = record['DiseaseProgression']
        survival_time = record['SurvivalTime']
        if progression is None or vital_status is None:
            continue
        if isinstance(survival_time, str):
            continue
        # 0: disease did not progress, 1: disease did progress.
        progressed = 0 if (vital_status == 'Alive' or progression == 'False') else 1
        rows.append([progressed, survival_time])

    if not rows:
        raise ValueError("no patients with usable progression data and survival time")

    df = pd.DataFrame(rows, columns=['Event', 'Duration'])
    kmf = KaplanMeierFitter()
    kmf.fit(durations=df.Duration, event_observed=df.Event)

    survival_fx = kmf.survival_function_
    coordinates = [[x, y] for x, y in zip(survival_fx.index.tolist(),
                                          survival_fx.values.flatten())]

    # A non-finite median cannot be rounded to an int; report None instead of
    # crashing.
    median = kmf.median_
    surv_median = int(round(median)) if math.isfinite(median) else None

    return {
        'Coordinates': coordinates,
        'Median': surv_median,
        '1Year': int(round(kmf.predict(12) * 100)),
        '3Year': int(round(kmf.predict(36) * 100)),
        '5Year': int(round(kmf.predict(60) * 100)),
    }
# Strip trailing whitespace from every cluster label read earlier.
cluster_list = [s.rstrip() for s in cluster_l]
# NOTE(review): the array built here is discarded; only the list is used below.
np.array(cluster_list)
cluster_list


# In[25]:

# Load the tab-separated clinical table and attach one cluster label per row.
# NOTE(review): assumes cluster_list is in the same row order as the file — confirm.
df = pd.read_csv(clinical_filename, delimiter='\t')
df['group'] = cluster_list
df.head()


# In[26]:

# One Kaplan-Meier curve per cluster, all drawn on the same axes. The fitter
# is refit for each group; the label carries the group name and its row count.
kmf = KaplanMeierFitter()
ax = plt.subplot(111)
plt.rcParams['font.family'] = 'Arial'
for group in sorted(df['group'].unique()):
    g = df.group == group
    T = df[g]['days_to_last_followup']
    C = df[g]['event']
    kmf.fit(T, event_observed=C, label='Cluster - ' + group + ' (' + str(len(T)) + ')')
    kmf.survival_function_.plot(ax=ax, linewidth=4.0)
# Despite the name, kmf2 holds the current matplotlib figure, not a fitter.
kmf2 = plt.gcf()
plt.title(cancer_name,fontsize=30)
plt.xlabel('Time in Days',fontsize=30)
plt.ylabel('Survival Rate',fontsize=30)
# NOTE(review): these rc settings are applied after the axes already exist and
# may only affect figures created later — confirm the tick size is as intended.
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
def test_sort_doesnt_affect_kmf(self, sample_lifetimes):
    """Fitting sorted durations yields the same survival function as unsorted ones."""
    durations = sample_lifetimes[0]
    fitter = KaplanMeierFitter()
    unsorted_curve = fitter.fit(durations).survival_function_
    sorted_curve = fitter.fit(sorted(durations)).survival_function_
    assert_frame_equal(unsorted_curve, sorted_curve)
# Builds and returns a new, unfitted KaplanMeierFitter on every call.
def kmf(self): return KaplanMeierFitter()
def test_predict_method_returns_exact_value_if_given_an_observed_time(self):
    """``predict`` at an observed time equals the survival function at that time."""
    T = [1, 2, 3]
    kmf = KaplanMeierFitter()
    kmf.fit(T)
    time = 1
    # Label-based lookup on the timeline index; ``.loc`` replaces the ``.ix``
    # indexer, which has been removed from pandas.
    assert abs(kmf.predict(time) - kmf.survival_function_.loc[time].values) < 10e-8
else: v_status = 1 patient_info.append(v_status) if type(s_time) != str: patient_info.append(s_time) list_patients.append(patient_info) df = pd.DataFrame(list_patients) num_patients = len(df) print(str(num_patients) + " patients used in analysis") df.columns = ['Event', 'Duration'] kmf = KaplanMeierFitter() kmf.fit(durations=df.Duration, event_observed=df.Event) #median survival in months print("median survival: " + str(kmf.median_) + " months") #print(kmf.survival_function_) coordinates = [] survival_fx = (kmf.survival_function_) coordinates_y = list(survival_fx.values.flatten()) coordinates_x = [] for row in survival_fx.iterrows(): timeline, km_estimate = row coordinates_x.append(timeline.tolist())
# Kaplan-Meier comparison of rows with and without neoadjuvant treatment,
# read from clinical_data.tab, plus a log-rank test between the two groups.
df = pd.read_table('clinical_data.tab', sep='\t')
#df2= pd.read_table('genomicMatrix.tab',sep='\t')
#print list(df.columns.values)

survival_col = '_OS'
censor_col = '_OS_IND'
clinical_predictors = ['age_at_initial_pathologic_diagnosis']

# Keep only rows that have a survival value.
df = df[pd.notnull(df[survival_col])]
tx = df['history_of_neoadjuvant_treatment'] == 'Yes'
ax = plt.subplot(111)

# ``.loc[mask, column]`` replaces the ``.ix`` indexer, which has been removed
# from pandas; the selected rows are the same.
kmf1 = KaplanMeierFitter(alpha=0.95)
kmf1.fit(durations=df.loc[tx, survival_col],
         event_observed=df.loc[tx, censor_col],
         label=['Tx==Yes'])
kmf1.plot(ax=ax, show_censors=True, ci_show=False)

kmf2 = KaplanMeierFitter(alpha=0.95)
kmf2.fit(durations=df.loc[~tx, survival_col],
         event_observed=df.loc[~tx, censor_col],
         label=['Tx==No'])
kmf2.plot(ax=ax, show_censors=True, ci_show=False)

add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.title('Acute myeloid leukemia survival analysis with Tx and without Tx')
plt.xlabel(survival_col)
plt.savefig('km.png')

results = logrank_test(df.loc[tx, survival_col], df.loc[~tx, survival_col],
                       df.loc[tx, censor_col], df.loc[~tx, censor_col],
                       alpha=.99)
results.print_summary()
def test_kmf_with_inverted_axis(self, block, kmf):
    """Plot two fits with an inverted y axis on shared axes."""
    first_sample = np.random.exponential(size=100)
    first_fitter = KaplanMeierFitter()
    first_fitter.fit(first_sample, label="t2")
    shared_axes = first_fitter.plot(invert_y_axis=True, at_risk_counts=True)

    second_sample = np.random.exponential(3, size=100)
    second_fitter = KaplanMeierFitter()
    second_fitter.fit(second_sample, label="t1")
    second_fitter.plot(invert_y_axis=True, ax=shared_axes, ci_force_lines=False)

    self.plt.title("test_kmf_with_inverted_axis")
    self.plt.show(block=block)
def _finish_survival_plot(directory, stem):
    """Label the current axes, then show it (no directory) or save PNG and PDF."""
    plt.ylabel('Survival probability')
    plt.xlabel('Time in years')
    plt.ylim(0, 1)
    plt.grid()
    if directory is None:
        plt.ion()
        plt.show()
    else:
        plt.savefig('{0}/{1}.png'.format(directory, stem))
        plt.savefig('{0}/{1}.pdf'.format(directory, stem))
        plt.close()


def survival_estimation(directory=tmp_dir):
    """
    Use the Kaplan-Meier Estimate to estimate the survival function
    see: https://github.com/CamDavidsonPilon/lifelines

    Two figures are produced from ``get_lifetime_data_frame(recompute=False)``:
    one curve for all developers ('survival_all') and one comparing rows with
    ``top == 1`` against the rest ('survival_top'). The median survival time
    of every fit is printed.

    Parameters
    ----------
    directory :
        Folder that receives the PNG and PDF files. When ``None`` the figures
        are shown interactively instead of being saved.
    """
    from lifelines.estimation import KaplanMeierFitter

    df = get_lifetime_data_frame(recompute=False)
    T = df['duration']
    # NOTE(review): the 'censored' column is passed as `event_observed`;
    # confirm it is 1 when the event was observed, not when it was censored.
    C = df['censored']

    # Estimate the survival function for all developers
    kmf = KaplanMeierFitter()
    kmf.fit(T, event_observed=C, label='All developers')
    print("Median survival time for all developers: {} years".format(
        kmf.median_))
    plt.figure(figsize=(10, 8))
    ax = plt.subplot(111)
    kmf.plot(ax=ax, color=color_map(2))
    _finish_survival_plot(directory, 'survival_all')

    # Estimate the survival function by connectivity level
    mtop = df['top'] == 1
    kmf = KaplanMeierFitter()
    plt.figure(figsize=(10, 8))
    ax = plt.subplot(111)
    kmf.fit(T[mtop], event_observed=C[mtop], label="Top connectivity level")
    print("Median survival time for top developers: {} years".format(
        kmf.median_))
    kmf.plot(ax=ax, color=color_map(0))
    kmf.fit(T[~mtop], event_observed=C[~mtop], label="Not in the top")
    print("Median survival time for not top developers: {} years".format(
        kmf.median_))
    kmf.plot(ax=ax, color=color_map(1))
    _finish_survival_plot(directory, 'survival_top')
def test_kmf_plotting(self, block):
    """Plot three fits of exponential samples on one shared axes."""
    # All samples are drawn up front, in this order, so seeded runs match.
    labelled_samples = [
        (np.random.exponential(10, size=(100)), 'test label 1'),
        (np.random.exponential(2, size=(200, 1)), 'test label 2'),
        (np.random.exponential(4, size=(500, 1)), 'test label 3'),
    ]
    (first_sample, first_label), *remaining = labelled_samples

    fitter = KaplanMeierFitter()
    fitter.fit(first_sample, label=first_label)
    shared_axes = fitter.plot()
    for sample, label in remaining:
        fitter.fit(sample, label=label)
        fitter.plot(ax=shared_axes)

    self.plt.title("test_kmf_plotting")
    self.plt.show(block=block)