def test_datetimes_to_durations_with_different_frequencies():
    """Check the durations returned for the default, yearly and hourly frequencies."""
    all_observed = np.ones(3, dtype=bool)

    # No ``freq`` argument: the expected durations are whole days, whether or
    # not the date strings carry an explicit midnight time.
    begin = ["2013-10-10 0:00:00", "2013-10-09", "2012-10-10"]
    finish = ["2013-10-13", "2013-10-10 0:00:00", "2013-10-15"]
    durations, observed = utils.datetimes_to_durations(begin, finish)
    npt.assert_almost_equal(durations, np.array([3, 1, 365 + 5]))
    npt.assert_almost_equal(observed, all_observed)

    # Yearly frequency: only the pair that is more than a year apart yields 1.
    begin = ["2013-10-10", "2013-10-09", "2012-10-10"]
    finish = ["2013-10-13", "2013-10-10", "2013-10-15"]
    durations, observed = utils.datetimes_to_durations(begin, finish, freq="Y")
    npt.assert_almost_equal(durations, np.array([0, 0, 1]))
    npt.assert_almost_equal(observed, all_observed)

    # Hourly frequency, including one interval that crosses midnight.
    begin = ["2013-10-10 17:00:00", "2013-10-09 0:00:00", "2013-10-10 23:00:00"]
    finish = ["2013-10-10 18:00:00", "2013-10-10 0:00:00", "2013-10-11 2:00:00"]
    durations, observed = utils.datetimes_to_durations(begin, finish, freq="h")
    npt.assert_almost_equal(durations, np.array([1, 24, 3]))
    npt.assert_almost_equal(observed, all_observed)
def plot_lifetimes_for_diagnosis(df, diagnosis, current_time=50, subset_size=80):
    """
    Plot individual lifetimes for the rows of ``df`` with the given diagnosis.

    :param df: Pandas dataframe; the columns ``diagnosis``, ``start_date`` and
        ``end_date`` are read
    :param diagnosis: value matched against the ``diagnosis`` column and used as
        the key into ``diagnosis_list`` for the plot title
    :param current_time: not used by this function
    :param subset_size: when at least this many rows match, a random sample of
        exactly this size (fixed seed) is plotted instead of all of them
    :return: None; the figure is shown and saved as a PDF under ``paper/img``
    """
    # annotations are only there to help the IDE with completion
    fig: plt.Figure
    ax: plt.Axes
    fig, ax = plt.subplots()

    selected = df.loc[df['diagnosis'] == diagnosis]
    if len(selected) >= subset_size:
        # fixed random_state keeps the sampled subset reproducible
        selected = selected.sample(n=subset_size, random_state=1)

    # durations are requested with freq='M'
    actual_lifetimes, death_observed = datetimes_to_durations(
        selected['start_date'], selected['end_date'], freq='M'
    )
    plot_lifetimes(durations=actual_lifetimes, event_observed=death_observed, ax=ax)

    ax.set_title(diagnosis_list[diagnosis])
    ax.set_xlabel('Čas od začátku sledování po vznik události v měsících')
    ax.set_ylabel('Sledovaná osoba')

    # show the figure, then store it under the next image number from GEN
    fig.show()
    fig.savefig(f'paper/img/image_{next(GEN)}.pdf', format='pdf')
def test_datetimes_to_durations_hours():
    """With ``freq='h'`` the durations are the hours between each start/end pair."""
    begin = ['2013-10-10 17:00:00', '2013-10-09 0:00:00', '2013-10-10 23:00:00']
    finish = ['2013-10-10 18:00:00', '2013-10-10 0:00:00', '2013-10-11 2:00:00']

    durations, observed = utils.datetimes_to_durations(begin, finish, freq='h')

    npt.assert_almost_equal(durations, np.array([1, 24, 3]))
    # every end date is present, so every event counts as observed
    npt.assert_almost_equal(observed, np.ones(3, dtype=bool))
def test_datetimes_to_durations_years():
    """With ``freq='Y'`` only the pair spanning more than a year gives a duration of 1."""
    begin = ['2013-10-10', '2013-10-09', '2012-10-10']
    finish = ['2013-10-13', '2013-10-10', '2013-10-15']

    durations, observed = utils.datetimes_to_durations(begin, finish, freq='Y')

    npt.assert_almost_equal(durations, np.array([0, 0, 1]))
    # every end date is present, so every event counts as observed
    npt.assert_almost_equal(observed, np.ones(3, dtype=bool))
def test_datetimes_to_durations_days():
    """Without ``freq`` the expected durations are whole days."""
    # date-only strings are mixed with strings carrying an explicit midnight time
    begin = ['2013-10-10 0:00:00', '2013-10-09', '2012-10-10']
    finish = ['2013-10-13', '2013-10-10 0:00:00', '2013-10-15']

    durations, observed = utils.datetimes_to_durations(begin, finish)

    npt.assert_almost_equal(durations, np.array([3, 1, 365 + 5]))
    # every end date is present, so every event counts as observed
    npt.assert_almost_equal(observed, np.ones(3, dtype=bool))
def test_datetimes_to_durations_will_handle_dates_above_multi_fill_date():
    """A per-row ``fill_date`` censors missing end dates and end dates after it."""
    begin = ["2013-10-08", "2013-10-09", "2013-10-10"]
    finish = ["2013-10-10", None, "2013-10-20"]
    # one last-observation date per row
    last_seen = ["2013-10-10", "2013-10-12", "2013-10-14"]

    durations, observed = utils.datetimes_to_durations(
        begin, finish, freq="D", fill_date=last_seen
    )

    # row 1 ends on its fill date (observed); row 2 has no end date and row 3
    # ends after its fill date (both censored)
    npt.assert_almost_equal(observed, np.array([True, False, False]))
    # censored rows are measured up to their own fill date
    npt.assert_almost_equal(durations, np.array([2, 3, 4]))
def test_datetimes_to_durations_will_handle_dates_above_fill_date():
    """An end date later than the single ``fill_date`` is flagged as not observed."""
    begin = ['2013-10-08', '2013-10-09', '2013-10-10']
    finish = ['2013-10-10', '2013-10-12', '2013-10-15']

    # only the censoring flags are checked; the durations are ignored
    _, observed = utils.datetimes_to_durations(
        begin, finish, freq='Y', fill_date='2013-10-12'
    )

    # the last end date lies after the fill date
    npt.assert_almost_equal(observed, np.array([True, True, False]))
def test_datetimes_to_durations_will_handle_dates_above_fill_date():
    """An end date later than the single ``fill_date`` is flagged as not observed."""
    begin = ["2013-10-08", "2013-10-09", "2013-10-10"]
    finish = ["2013-10-10", "2013-10-12", "2013-10-15"]

    # only the censoring flags are checked; the durations are ignored
    _, observed = utils.datetimes_to_durations(
        begin, finish, freq="Y", fill_date="2013-10-12"
    )

    # the last end date lies after the fill date
    npt.assert_almost_equal(observed, np.array([True, True, False]))
def test_datetimes_to_durations_custom_censor():
    """End dates listed in ``na_values`` are flagged as not observed."""
    missing_markers = ["NaT", ""]
    begin = ["2013-10-10", "2013-10-09", "2012-10-10"]
    finish = ["2013-10-13", "NaT", ""]

    # only the censoring flags are checked; the durations are ignored
    _, observed = utils.datetimes_to_durations(
        begin, finish, freq="Y", na_values=missing_markers
    )

    npt.assert_almost_equal(observed, np.array([True, False, False]))
def test_datetimes_to_durations_custom_censor():
    """A scalar ``na_values`` marker and the empty string are both expected to be censored."""
    begin = ['2013-10-10', '2013-10-09', '2012-10-10']
    finish = ['2013-10-13', "NaT", '']

    # only the censoring flags are checked; the durations are ignored
    _, observed = utils.datetimes_to_durations(
        begin, finish, freq='Y', na_values="NaT"
    )

    npt.assert_almost_equal(observed, np.array([True, False, False]))
def survival_info_adding(df: pd.DataFrame):
    """
    Add survival columns to ``df`` in place and return the same frame.

    Columns read: ``Last FU``, ``Status``, ``Date of Death``, ``RT End``.
    Columns written, in this order:

    - ``Last Event``: ``Last FU``, replaced by ``Date of Death`` where the
      status is ``'Dead'``
    - ``Event``: True where ``Status`` equals ``'Dead'``
    - ``Survival Time``: duration from ``RT End`` to ``Last Event`` as returned
      by ``datetimes_to_durations`` with its default frequency, divided by 365
    - ``High_Risk``: True where ``Survival Time`` is at most 4
    """
    died = df['Status'] == 'Dead'

    df['Last Event'] = df['Last FU']
    df['Event'] = died
    # for deceased rows the date of death supersedes the last follow-up
    df.loc[died, 'Last Event'] = df.loc[died, 'Date of Death']

    # the observed flags returned alongside the durations are not needed here
    durations, _ = datetimes_to_durations(df['RT End'], df['Last Event'])

    df['Survival Time'] = durations / 365
    df['High_Risk'] = df['Survival Time'] <= 4
    return df
def test_datetimes_to_durations_hours():
    """With ``freq='h'`` the durations are the hours between each start/end pair."""
    begin = ['2013-10-10 17:00:00', '2013-10-09 0:00:00', '2013-10-10 23:00:00']
    finish = ['2013-10-10 18:00:00', '2013-10-10 0:00:00', '2013-10-11 2:00:00']
    expected_hours = np.array([1, 24, 3])

    durations, observed = utils.datetimes_to_durations(begin, finish, freq='h')

    npt.assert_almost_equal(durations, expected_hours)
    # every end date is present, so every event counts as observed
    npt.assert_almost_equal(observed, np.ones(3, dtype=bool))
def test_datetimes_to_durations_with_different_frequencies():
    """Check the durations returned for the default, yearly and hourly frequencies."""
    all_observed = np.ones(3, dtype=bool)

    # No ``freq`` argument: the expected durations are whole days, whether or
    # not the date strings carry an explicit midnight time.
    begin = ['2013-10-10 0:00:00', '2013-10-09', '2012-10-10']
    finish = ['2013-10-13', '2013-10-10 0:00:00', '2013-10-15']
    durations, observed = utils.datetimes_to_durations(begin, finish)
    npt.assert_almost_equal(durations, np.array([3, 1, 365 + 5]))
    npt.assert_almost_equal(observed, all_observed)

    # Yearly frequency: only the pair that is more than a year apart yields 1.
    begin = ['2013-10-10', '2013-10-09', '2012-10-10']
    finish = ['2013-10-13', '2013-10-10', '2013-10-15']
    durations, observed = utils.datetimes_to_durations(begin, finish, freq='Y')
    npt.assert_almost_equal(durations, np.array([0, 0, 1]))
    npt.assert_almost_equal(observed, all_observed)

    # Hourly frequency, including one interval that crosses midnight.
    begin = ['2013-10-10 17:00:00', '2013-10-09 0:00:00', '2013-10-10 23:00:00']
    finish = ['2013-10-10 18:00:00', '2013-10-10 0:00:00', '2013-10-11 2:00:00']
    durations, observed = utils.datetimes_to_durations(begin, finish, freq='h')
    npt.assert_almost_equal(durations, np.array([1, 24, 3]))
    npt.assert_almost_equal(observed, all_observed)
def kaplan_meier_analysis(df, rows=2, columns=3):
    """
    Kaplan-Meier estimates for individual diagnoses

    One subplot is drawn per entry of ``diagnosis_list``; subplots are filled
    row by row. If there are more diagnoses than grid cells, the position wraps
    around to the first cell again.

    :param df: Pandas dataframe with data
    :param rows: number of rows for subplots
    :param columns: number of columns for subplots
    :return: None; the figure is shown and saved as a PDF under ``paper/img``
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or column,
    # so the [row, column] indexing below always works
    fig, axes = plt.subplots(rows, columns, figsize=[10, 6], squeeze=False)
    cell_count = rows * columns
    # the same evaluation grid is used for every diagnosis
    timeline = np.linspace(0, 12)

    for pos, diagnosis in enumerate(diagnosis_list):
        # initialize start and end times
        start_times, end_times = start_end_for_diagnosis(df, diagnosis)

        # get data in the right format - t is time_span, e is event (1 is death)
        t, e = datetimes_to_durations(start_times, end_times, freq='M')  # M - months, D - days

        # fit a fresh Kaplan-Meier estimator on this diagnosis
        kmf = KaplanMeierFitter()
        kmf.fit(t, event_observed=e, timeline=timeline)

        # Row-major position. Taking `pos % rows` and `pos % columns`
        # independently only visits every cell when rows and columns are
        # coprime, and makes plots overwrite each other otherwise.
        row, column = divmod(pos % cell_count, columns)
        ax: plt.Axes = axes[row, column]

        # plot Kaplan-Meier
        kmf.plot(ax=ax)

        # the legend only reports the sample size
        legend_elements = [Line2D([0], [0], color='b', lw=1, label=f'n = {len(start_times)}')]

        # format plot
        ax.set_ylim(0, 1)
        ax.set_xlabel('Time in months')
        ax.set_title('\n'.join(wrap(diagnosis_list[diagnosis], 30)))
        ax.legend(handles=legend_elements, loc='lower right')
        ax.margins(y=50)

    # show the figure, then store it under the next image number from GEN
    fig.show()
    fig.savefig(f'paper/img/image_{next(GEN)}.pdf', format='pdf')
def plot_kp_time_to_next_treatment(lot, line, metric, cohort_enhanced, censoring_date=None,
                                   displayed_regimen=None, timeline='M'):
    """
    Plot one Kaplan-Meier curve per regimen for the rows of `lot` on a given line.

    Inputs:
    - lot: DataFrame; the columns 'line_number', 'regimen_name', 'start_date'
      and (for 'time_to_next_treatment') 'end_date' are read
    - line: value of 'line_number' selecting the rows to analyse
    - metric: 'time_to_next_treatment', 'time_to_last_activity'
    - cohort_enhanced: DataFrame left-merged onto `lot` by 'person_id' when
      metric is 'time_to_last_activity'; 'last_activity_date' is then read
      from the merged result
    - censoring_date: passed to datetimes_to_durations as `fill_date`; when
      None the argument is omitted so that lifelines applies its own default
    - displayed_regimen: regimen names to plot; when None every regimen present
      on the selected line is plotted
    - timeline: duration unit, one of 'M' (months), 'D' (days), 'Y' (years)

    Raises:
    - ValueError if `metric` or `timeline` is not one of the values above
    """
    x_labels = {
        'M': 'Duration in months',
        'D': 'Duration in days',
        'Y': 'Duration in years',
    }
    # validate up front instead of failing later on an unassigned local
    if timeline not in x_labels:
        raise ValueError(
            "timeline must be one of 'M', 'D' or 'Y', got {!r}".format(timeline))

    if metric == 'time_to_last_activity':
        lot = lot.merge(cohort_enhanced, how='left', on='person_id')
        title = "Time to last activity for different regimens"
        end_date = 'last_activity_date'
    elif metric == 'time_to_next_treatment':
        title = "Time to next treatment for different regimens"
        end_date = 'end_date'
    else:
        raise ValueError(
            "metric must be 'time_to_next_treatment' or 'time_to_last_activity', "
            "got {!r}".format(metric))

    kmf = KaplanMeierFitter()
    figsize(10, 8)
    ax = plt.subplot(111)

    df = lot[lot['line_number'] == line]
    groups = df['regimen_name']
    labels = groups.unique().tolist() if displayed_regimen is None else displayed_regimen

    # Forward fill_date only when a censoring date was supplied: an explicit
    # fill_date=None would replace lifelines' default and leave censored rows
    # without an end date.
    duration_kwargs = {'freq': timeline}
    if censoring_date is not None:
        duration_kwargs['fill_date'] = censoring_date

    for label in labels:
        in_group = (groups == label)
        start_dates = df.loc[in_group, 'start_date']
        end_dates = df.loc[in_group, end_date]
        T, E = datetimes_to_durations(start_dates, end_dates, **duration_kwargs)
        # the same fitter is refitted for each regimen and drawn on the shared axes
        kmf.fit(T, event_observed=E, label=label)
        kmf.plot(ax=ax)

    plt.ylim(0, 1)
    plt.xlim(0, 20)
    plt.xlabel(x_labels[timeline])
    plt.title(title)
def run_two_churn_defs(df):
    """
    Overlay the Kaplan-Meier curves obtained with both death-time definitions.

    For ``get_death_time_v1`` and then ``get_death_time_v2`` the function
    recomputes ``df['death_date']`` from the ``cols_payments`` columns, derives
    durations and event flags, and plots the fitted curve on shared axes.
    ``df`` is modified in place: ``death_date``, ``T`` and ``E`` end up holding
    the values of the second definition.
    """
    obs_time = datetime.datetime(2015, 1, 1)
    ax = None

    for death_time_fn in (get_death_time_v1, get_death_time_v2):
        # Calculate the death date over all rows
        df['death_date'] = df[cols_payments].apply(death_time_fn, axis=1)

        # Create duration and churn status; rows without a death date are
        # filled with obs_time
        T, E = datetimes_to_durations(
            df['incorporation_date'], df['death_date'], freq='M', fill_date=obs_time
        )
        df['T'] = T  # duration (in months)
        df['E'] = E  # churn status

        kmf = KaplanMeierFitter()
        kmf.fit(T, event_observed=E)  # or, more succinctly, kmf.fit(T, E)

        # the first curve creates the axes, the second one is drawn onto them
        if ax is None:
            ax = kmf.plot()
        else:
            kmf.plot(ax=ax)
from lifelines import KaplanMeierFitter
# Built-in helper that produces durations and event flags for survival analysis
from lifelines.utils import datetimes_to_durations

# End of the observation window: employees without a quit date were still
# employed on this day.
observation_end = pd.Timestamp('2015-12-13')

join_dates = pd.to_datetime(data['join_date'])
quit_dates = pd.to_datetime(data['quit_date'])

# Compute durations BEFORE filling the missing quit dates. A missing end date
# is how datetimes_to_durations recognises a censored row; if the dates are
# filled first, every employee is reported as an observed quit (E all True)
# and the survival curve is biased downwards.
T, E = datetimes_to_durations(join_dates, quit_dates, fill_date=observation_end)
print('T (durations): ', T)
print('E (event_observed): ', E)

# The stored columns keep the shape they had before: parsed datetimes, with
# missing quit dates replaced by the end of observation.
data['quit_date'] = quit_dates.fillna(observation_end)
data['join_date'] = join_dates
start_date = data['join_date']
end_date = data['quit_date']

# NOTE: data2 is an alias of data, not a copy; the new columns appear on both.
data2 = data
data2['duration'], data2['observed'] = T, E

kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)
kmf.survival_function_.plot()
plt.title('Survival function of employee churn')
def test_datetimes_to_durations_censor():
    """``None`` and empty-string end dates are flagged as not observed."""
    begin = ["2013-10-10", "2013-10-09", "2012-10-10"]
    finish = ["2013-10-13", None, ""]

    # only the censoring flags are checked; the durations are ignored
    _, observed = utils.datetimes_to_durations(begin, finish, freq="Y")

    npt.assert_almost_equal(observed, np.array([True, False, False]))
import pandas as pd
from lifelines.utils import datetimes_to_durations
from lifelines import KaplanMeierFitter

df = pd.read_csv('data/parl_data.csv')
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
# decade key: the first three characters of the start year (1987 -> '198')
df['decade'] = df['start_date'].map(lambda d: str(d.year)[:3])

# T holds the durations, C the observed flags
T, C = datetimes_to_durations(df['start_date'], df['end_date'])
df['T'] = T
df['C'] = C

kmf = KaplanMeierFitter()
# NOTE(review): `subplot` is not imported in this excerpt; presumably it comes
# from a pylab-style import elsewhere in the file. Confirm.
ax = subplot(111)
for decade in df['decade'].unique():
    ix = df['decade'] == decade
    # `.ix` was removed in pandas 1.0; `.loc` with the boolean mask and a
    # column label selects the same values in one step
    kmf.fit(df.loc[ix, 'T'], df.loc[ix, 'C'], label=decade)
    if decade not in ('200', '199'):
        # every decade except '199' and '200': thin grey curve, no confidence band
        kmf.plot(ax=ax, c='#777777', ci_show=False, alpha=0.5)
    else:
        # '199' and '200' are highlighted with a thick line
        kmf.plot(ax=ax, lw=4)
df[fecha] = pd.to_datetime(df[fecha]) # Remove rows with ilogical dates: df.drop(df[df.fecha_egreso < df.fecha_admision].index, inplace=True) # df[df['fecha_admision'] > df['fecha_egreso']] # check # Check too large waiting times df['t_admin_egreso'] = df['fecha_egreso'] - df['fecha_admision'] df.sort_values(by='t_admin_egreso', ascending=False) df = df[df['t_admin_egreso'].dt.days < 1] df #%% Lifelines analysis from lifelines.utils import datetimes_to_durations T, E = datetimes_to_durations(start_times=df.fecha_admision, end_times=df.fecha_egreso, freq='h') from lifelines import KaplanMeierFitter kmf = KaplanMeierFitter() kmf.fit(T, event_observed=E) # or, more succiently, kmf.fit(T, E) kmf.survival_function_ kmf.median_ kmf.plot() plt.show() #%% Lifelines analysis from lifelines.utils import datetimes_to_durations T, E = datetimes_to_durations(start_times=df.fecha_admision, end_times=df.fecha_egreso, freq='h') from lifelines import KaplanMeierFitter kmf = KaplanMeierFitter() kmf.fit(T, event_observed=E) # or, more succiently, kmf.fit(T, E) kmf.survival_function_
def test_datetimes_to_durations_censor():
    """``None`` and empty-string end dates are flagged as not observed."""
    begin = ['2013-10-10', '2013-10-09', '2012-10-10']
    finish = ['2013-10-13', None, '']

    # only the censoring flags are checked; the durations are ignored
    _, observed = utils.datetimes_to_durations(begin, finish, freq='Y')

    npt.assert_almost_equal(observed, np.array([True, False, False]))
def test_datetimes_to_durations_custom_censor():
    """End dates listed in ``na_values`` are flagged as not observed."""
    missing_markers = ["NaT", ""]
    begin = ['2013-10-10', '2013-10-09', '2012-10-10']
    finish = ['2013-10-13', "NaT", '']

    # only the censoring flags are checked; the durations are ignored
    _, observed = utils.datetimes_to_durations(
        begin, finish, freq='Y', na_values=missing_markers
    )

    npt.assert_almost_equal(observed, np.array([True, False, False]))
# Reorder and select columns
cols_chars = ['company_id', 'vertical', 'incorporation_date']
cols_payments = [x for x in df if 'payments' in x]
# .copy() makes the subset an independent frame, so adding columns below does
# not raise SettingWithCopyWarning
df = df[cols_chars + cols_payments].copy()

# Calculate the death date over all rows
df['death_date'] = df[cols_payments].apply(get_death_time_v2, axis=1)

# Create duration and churn status; rows without a death date are filled with
# obs_time
start_times = df['incorporation_date']
end_times = df['death_date']
obs_time = datetime.datetime(2015, 1, 1)
T, E = datetimes_to_durations(start_times, end_times, freq='M', fill_date=obs_time)
df['T'] = T  # duration (in months)
df['E'] = E  # churn status

kmf = KaplanMeierFitter()
# vertical_type = 'gym/fitness'
vertical_types = np.unique(df['vertical'])
ax = None
for _type in vertical_types:
    # T and E are NumPy arrays: index them with a plain boolean array so the
    # selection is positional and independent of the frame's index labels
    ix = (df['vertical'] == _type).values
    kmf.fit(T[ix], E[ix], label=_type)
    # the first curve creates the axes, later ones are drawn onto them
    if ax is None:
        ax = kmf.plot()
    else:
        kmf.plot(ax=ax)