Example #1
0
def test_datetimes_to_durations_with_different_frequencies():
    """Check the durations returned for days (no ``freq`` given), years and hours.

    Every subject in every case has an explicit end date, and the event
    flags are expected to be all True.
    """
    all_observed = np.array([1, 1, 1], dtype=bool)

    # (extra keyword arguments, start dates, end dates, expected durations)
    cases = [
        (
            {},  # days
            ["2013-10-10 0:00:00", "2013-10-09", "2012-10-10"],
            ["2013-10-13", "2013-10-10 0:00:00", "2013-10-15"],
            np.array([3, 1, 5 + 365]),
        ),
        (
            {"freq": "Y"},  # years
            ["2013-10-10", "2013-10-09", "2012-10-10"],
            ["2013-10-13", "2013-10-10", "2013-10-15"],
            np.array([0, 0, 1]),
        ),
        (
            {"freq": "h"},  # hours
            ["2013-10-10 17:00:00", "2013-10-09 0:00:00", "2013-10-10 23:00:00"],
            ["2013-10-10 18:00:00", "2013-10-10 0:00:00", "2013-10-11 2:00:00"],
            np.array([1, 24, 3]),
        ),
    ]

    for extra_kwargs, starts, ends, expected_durations in cases:
        durations, observed = utils.datetimes_to_durations(starts, ends, **extra_kwargs)
        npt.assert_almost_equal(durations, expected_durations)
        npt.assert_almost_equal(observed, all_observed)
Example #2
0
def plot_lifetimes_for_diagnosis(df, diagnosis, current_time=50, subset_size=80):
    """Plot individual lifetimes for the rows of ``df`` with the given diagnosis.

    Rows are filtered on the ``diagnosis`` column. When at least
    ``subset_size`` rows remain, a fixed-seed random sample of exactly
    ``subset_size`` rows is plotted instead. Durations are computed with
    ``freq='M'`` from the ``start_date`` and ``end_date`` columns. The
    figure is shown and then saved as ``paper/img/image_<n>.pdf``, where
    ``<n>`` is the next value drawn from the module-level ``GEN``.

    :param df: data frame with ``diagnosis``, ``start_date`` and ``end_date`` columns
    :param diagnosis: value matched in the ``diagnosis`` column; also the key
        used to look up the plot title in ``diagnosis_list``
    :param current_time: not used by this function
    :param subset_size: largest number of rows that will be plotted
    :return: None
    """
    matches_diagnosis = df['diagnosis'] == diagnosis
    df = df.loc[matches_diagnosis]

    # bare annotations: type hints for IDE completion only
    fig: plt.Figure
    ax: plt.Axes
    fig, ax = plt.subplots()

    # cap the number of plotted subjects with a reproducible sample
    if len(df) >= subset_size:
        df = df.sample(n=subset_size, random_state=1)

    actual_lifetimes, death_observed = datetimes_to_durations(
        df['start_date'], df['end_date'], freq='M'
    )
    plot_lifetimes(durations=actual_lifetimes, event_observed=death_observed, ax=ax)

    # title and axis labels
    ax.set_title(diagnosis_list[diagnosis])
    ax.set_xlabel('Čas od začátku sledování po vznik události v měsících')
    ax.set_ylabel('Sledovaná osoba')

    # display first, then write the PDF
    fig.show()
    fig.savefig(f'paper/img/image_{next(GEN)}.pdf', format='pdf')
Example #3
0
def test_datetimes_to_durations_hours():
    """With ``freq='h'`` the expected durations are 1, 24 and 3, all observed."""
    begin = ['2013-10-10 17:00:00', '2013-10-09 0:00:00', '2013-10-10 23:00:00']
    finish = ['2013-10-10 18:00:00', '2013-10-10 0:00:00', '2013-10-11 2:00:00']
    expected_durations = np.array([1, 24, 3])
    expected_observed = np.array([1, 1, 1], dtype=bool)

    durations, observed = utils.datetimes_to_durations(begin, finish, freq='h')

    npt.assert_almost_equal(durations, expected_durations)
    npt.assert_almost_equal(observed, expected_observed)
Example #4
0
def test_datetimes_to_durations_years():
    """With ``freq='Y'`` the expected durations are 0, 0 and 1, all observed."""
    begin = ['2013-10-10', '2013-10-09', '2012-10-10']
    finish = ['2013-10-13', '2013-10-10', '2013-10-15']
    expected_durations = np.array([0, 0, 1])
    expected_observed = np.array([1, 1, 1], dtype=bool)

    durations, observed = utils.datetimes_to_durations(begin, finish, freq='Y')

    npt.assert_almost_equal(durations, expected_durations)
    npt.assert_almost_equal(observed, expected_observed)
Example #5
0
def test_datetimes_to_durations_days():
    """Without ``freq`` the expected durations are 3, 1 and 370, all observed.

    The inputs mix date-only strings with strings carrying a midnight time.
    """
    begin = ['2013-10-10 0:00:00', '2013-10-09', '2012-10-10']
    finish = ['2013-10-13', '2013-10-10 0:00:00', '2013-10-15']
    expected_durations = np.array([3, 1, 5 + 365])
    expected_observed = np.array([1, 1, 1], dtype=bool)

    durations, observed = utils.datetimes_to_durations(begin, finish)

    npt.assert_almost_equal(durations, expected_durations)
    npt.assert_almost_equal(observed, expected_observed)
Example #6
0
def test_datetimes_to_durations_years():
    """Yearly frequency: expect durations of 0, 0 and 1 with every event observed."""
    dates = {
        'start': ['2013-10-10', '2013-10-09', '2012-10-10'],
        'end': ['2013-10-13', '2013-10-10', '2013-10-15'],
    }

    durations, observed = utils.datetimes_to_durations(dates['start'], dates['end'], freq='Y')

    npt.assert_almost_equal(durations, np.array([0, 0, 1]))
    npt.assert_almost_equal(observed, np.array([1, 1, 1], dtype=bool))
Example #7
0
def test_datetimes_to_durations_will_handle_dates_above_multi_fill_date():
    """Check event flags and durations when ``fill_date`` is given per subject.

    The second subject has no end date and the third subject's end date is
    later than its own fill date; both are expected to be flagged as not
    observed. Expected durations in days are 2, 3 and 4.
    """
    starts = ["2013-10-08", "2013-10-09", "2013-10-10"]
    ends = ["2013-10-10", None, "2013-10-20"]
    per_subject_cutoff = ["2013-10-10", "2013-10-12", "2013-10-14"]

    durations, observed = utils.datetimes_to_durations(
        starts, ends, freq="D", fill_date=per_subject_cutoff
    )

    npt.assert_almost_equal(observed, np.array([1, 0, 0], dtype=bool))
    npt.assert_almost_equal(durations, np.array([2, 3, 4]))
Example #8
0
def test_datetimes_to_durations_days():
    """Default frequency: expect durations of 3, 1 and 370 with every event observed."""
    dates = {
        'start': ['2013-10-10 0:00:00', '2013-10-09', '2012-10-10'],
        'end': ['2013-10-13', '2013-10-10 0:00:00', '2013-10-15'],
    }

    durations, observed = utils.datetimes_to_durations(dates['start'], dates['end'])

    npt.assert_almost_equal(durations, np.array([3, 1, 5 + 365]))
    npt.assert_almost_equal(observed, np.array([1, 1, 1], dtype=bool))
Example #9
0
def test_datetimes_to_durations_will_handle_dates_above_fill_date():
    """An end date later than ``fill_date`` is expected to be flagged as not observed."""
    starts = ['2013-10-08', '2013-10-09', '2013-10-10']
    ends = ['2013-10-10', '2013-10-12', '2013-10-15']
    cutoff = '2013-10-12'

    # only the event flags are checked; the durations are discarded
    _, observed = utils.datetimes_to_durations(starts, ends, freq='Y', fill_date=cutoff)

    npt.assert_almost_equal(observed, np.array([1, 1, 0], dtype=bool))
Example #10
0
def test_datetimes_to_durations_will_handle_dates_above_fill_date():
    """Only the subject whose end date exceeds ``fill_date`` should come back unobserved."""
    call_kwargs = {"freq": "Y", "fill_date": "2013-10-12"}
    starts = ["2013-10-08", "2013-10-09", "2013-10-10"]
    ends = ["2013-10-10", "2013-10-12", "2013-10-15"]

    result = utils.datetimes_to_durations(starts, ends, **call_kwargs)
    _, observed = result  # durations are not part of this check

    npt.assert_almost_equal(observed, np.array([1, 1, 0], dtype=bool))
Example #11
0
def test_datetimes_to_durations_custom_censor():
    """End dates listed in ``na_values`` are expected to be flagged as not observed."""
    missing_markers = ["NaT", ""]
    starts = ["2013-10-10", "2013-10-09", "2012-10-10"]
    ends = ["2013-10-13", "NaT", ""]

    # only the event flags are checked; the durations are discarded
    _, observed = utils.datetimes_to_durations(
        starts, ends, freq="Y", na_values=missing_markers
    )

    npt.assert_almost_equal(observed, np.array([1, 0, 0], dtype=bool))
Example #12
0
def test_datetimes_to_durations_custom_censor():
    """A custom missing-value marker and an empty string should both come back unobserved.

    ``na_values`` is passed here as a single string, not as a list.
    """
    starts = ['2013-10-10', '2013-10-09', '2012-10-10']
    ends = ['2013-10-13', "NaT", '']
    expected_observed = np.array([1, 0, 0], dtype=bool)

    # only the event flags are checked; the durations are discarded
    _, observed = utils.datetimes_to_durations(starts, ends, freq='Y', na_values="NaT")

    npt.assert_almost_equal(observed, expected_observed)
def survival_info_adding(df: pd.DataFrame):
    """Add survival columns to ``df`` in place and return the same frame.

    Columns written:

    - ``Last Event``: ``Last FU``, replaced by ``Date of Death`` on rows
      whose ``Status`` is ``'Dead'``.
    - ``Event``: True where ``Status`` equals ``'Dead'``.
    - ``Survival Time``: the duration from ``RT End`` to ``Last Event``
      divided by 365.
    - ``High_Risk``: True where ``Survival Time`` is at most 4.

    The event flags returned by ``datetimes_to_durations`` are not used.
    """
    is_dead = df['Status'] == 'Dead'

    df['Last Event'] = df['Last FU']
    df['Event'] = is_dead
    # for deceased subjects the last event is the death itself
    df.loc[is_dead, 'Last Event'] = df.loc[is_dead, 'Date of Death']

    durations, _ = datetimes_to_durations(df['RT End'], df['Last Event'])
    df['Survival Time'] = durations / 365
    df['High_Risk'] = df['Survival Time'] <= 4
    return df
Example #14
0
def test_datetimes_to_durations_hours():
    """Hourly frequency: expect durations of 1, 24 and 3 with every event observed."""
    # (start, end) pairs, one per subject
    intervals = [
        ('2013-10-10 17:00:00', '2013-10-10 18:00:00'),
        ('2013-10-09 0:00:00', '2013-10-10 0:00:00'),
        ('2013-10-10 23:00:00', '2013-10-11 2:00:00'),
    ]
    starts = [begin for begin, _ in intervals]
    ends = [finish for _, finish in intervals]

    durations, observed = utils.datetimes_to_durations(starts, ends, freq='h')

    npt.assert_almost_equal(durations, np.array([1, 24, 3]))
    npt.assert_almost_equal(observed, np.array([1, 1, 1], dtype=bool))
Example #15
0
def test_datetimes_to_durations_with_different_frequencies():
    """Run the days, years and hours scenarios back to back.

    In all three scenarios every event is expected to be observed.
    """
    everyone_observed = np.array([1, 1, 1], dtype=bool)

    # days (no freq argument)
    durations, observed = utils.datetimes_to_durations(
        ['2013-10-10 0:00:00', '2013-10-09', '2012-10-10'],
        ['2013-10-13', '2013-10-10 0:00:00', '2013-10-15'],
    )
    npt.assert_almost_equal(durations, np.array([3, 1, 5 + 365]))
    npt.assert_almost_equal(observed, everyone_observed)

    # years
    durations, observed = utils.datetimes_to_durations(
        ['2013-10-10', '2013-10-09', '2012-10-10'],
        ['2013-10-13', '2013-10-10', '2013-10-15'],
        freq='Y',
    )
    npt.assert_almost_equal(durations, np.array([0, 0, 1]))
    npt.assert_almost_equal(observed, everyone_observed)

    # hours
    durations, observed = utils.datetimes_to_durations(
        ['2013-10-10 17:00:00', '2013-10-09 0:00:00', '2013-10-10 23:00:00'],
        ['2013-10-10 18:00:00', '2013-10-10 0:00:00', '2013-10-11 2:00:00'],
        freq='h',
    )
    npt.assert_almost_equal(durations, np.array([1, 24, 3]))
    npt.assert_almost_equal(observed, everyone_observed)
Example #16
0
def kaplan_meier_analysis(df, rows=2, columns=3):
    """
    Kaplan-Meier estimates for individual diagnoses

    One subplot is drawn per entry of ``diagnosis_list``; the grid is filled
    row by row. The figure is shown and then saved as
    ``paper/img/image_<n>.pdf``, where ``<n>`` is the next value drawn from
    the module-level ``GEN``.

    :param df: Pandas dataframe with data
    :param rows: number of rows for subplots
    :param columns: number of columns for subplots
    :return: None; plots the data
    :raises ValueError: if ``diagnosis_list`` has more entries than the
        ``rows`` x ``columns`` grid has cells
    """
    if len(diagnosis_list) > rows * columns:
        raise ValueError(
            f'{len(diagnosis_list)} diagnoses do not fit into a '
            f'{rows}x{columns} grid of subplots'
        )

    # squeeze=False keeps ``axes`` two-dimensional even for a single row or column
    fig, axes = plt.subplots(rows, columns, figsize=[10, 6], squeeze=False)

    # time grid passed to every fit; it does not depend on the diagnosis
    timeline = np.linspace(0, 12)

    for pos, diagnosis in enumerate(diagnosis_list):
        # initialize start and end times
        start_times, end_times = start_end_for_diagnosis(df, diagnosis)

        # get data in the right format - t is time_span, e is event (1 is death)
        t, e = datetimes_to_durations(start_times, end_times, freq='M')  # M - months, D - days

        # fit a fresh Kaplan-Meier estimator for this diagnosis
        kmf = KaplanMeierFitter()
        kmf.fit(t, event_observed=e, timeline=timeline)

        # Row-major position in the grid. Taking ``pos % rows`` and
        # ``pos % columns`` independently only yields distinct cells when
        # rows and columns are coprime (e.g. a 2x2 grid would reuse cells).
        grid_row, grid_col = divmod(pos, columns)
        ax: plt.Axes = axes[grid_row, grid_col]

        # plot Kaplan-Meier
        kmf.plot(ax=ax)

        # legend entry showing the sample size
        legend_elements = [Line2D([0], [0], color='b', lw=1, label=f'n = {len(start_times)}')]

        # format plot
        ax.set_ylim(0, 1)
        ax.set_xlabel('Time in months')
        ax.set_title('\n'.join(wrap(diagnosis_list[diagnosis], 30)))
        ax.legend(handles=legend_elements, loc='lower right')
        ax.margins(y=50)

    # show plot
    fig.show()
    fig.savefig(f'paper/img/image_{next(GEN)}.pdf', format='pdf')
Example #17
0
def plot_kp_time_to_next_treatment(lot, line, metric, cohort_enhanced, censoring_date=None,
                                   displayed_regimen=None, timeline='M'):
    """
    Plot one Kaplan-Meier curve per regimen for the rows whose
    'line_number' equals ``line``.

    Inputs:
    - lot: data frame with 'line_number', 'regimen_name' and 'start_date'
      columns, plus 'end_date' for 'time_to_next_treatment' or 'person_id'
      for 'time_to_last_activity'
    - line: value of 'line_number' to select
    - metric: 'time_to_next_treatment', 'time_to_last_activity'
    - cohort_enhanced: data frame left-merged onto ``lot`` by 'person_id';
      only used for 'time_to_last_activity', where the merged frame must
      contain a 'last_activity_date' column
    - censoring_date: passed to datetimes_to_durations as ``fill_date``
    - displayed_regimen: regimen names to plot; by default every regimen
      present in the selected rows
    - timeline: passed to datetimes_to_durations as ``freq``; 'M', 'D' and
      'Y' get a spelled-out x-axis label, any other code a generic one

    Raises:
    - ValueError: if ``metric`` is not one of the two supported values
    """
    # Resolve the metric first so an unsupported value fails with a clear
    # message before any figure is created (previously it surfaced later as
    # an unbound-variable error).
    if metric == 'time_to_last_activity':
        title = "Time to last activity for different regimens"
        end_date = 'last_activity_date'
    elif metric == 'time_to_next_treatment':
        title = "Time to next treatment for different regimens"
        end_date = 'end_date'
    else:
        raise ValueError(
            "metric must be 'time_to_next_treatment' or 'time_to_last_activity', "
            "got {!r}".format(metric)
        )

    # Unknown frequency codes get a generic label instead of leaving x_label
    # undefined after all curves have already been drawn.
    x_labels = {
        'M': 'Duration in months',
        'D': 'Duration in days',
        'Y': 'Duration in years',
    }
    x_label = x_labels.get(timeline, 'Duration ({})'.format(timeline))

    kmf = KaplanMeierFitter()
    figsize(10, 8)
    ax = plt.subplot(111)

    if metric == 'time_to_last_activity':
        lot = lot.merge(cohort_enhanced, how='left', on='person_id')

    df = lot[lot['line_number'] == line]
    groups = df['regimen_name']

    if displayed_regimen is None:
        labels = groups.unique().tolist()
    else:
        labels = displayed_regimen

    for label in labels:
        in_group = (groups == label)
        start_dates = df[in_group]['start_date']
        end_dates = df[in_group][end_date]
        T, E = datetimes_to_durations(start_dates, end_dates, fill_date=censoring_date, freq=timeline)
        # the same fitter is refitted per regimen; each fit is drawn on the shared axes
        kmf.fit(T, event_observed=E, label=label)
        kmf.plot(ax=ax)

    plt.ylim(0, 1)
    plt.xlim(0, 20)
    plt.xlabel(x_label)
    plt.title(title)
Example #18
0
def run_two_churn_defs(df):
    """Overlay Kaplan-Meier curves for the two death-date definitions.

    For ``get_death_time_v1`` and then ``get_death_time_v2``, the function
    is applied row-wise to the ``cols_payments`` columns to produce
    ``death_date``. Durations in months and event flags are then derived
    from ``incorporation_date`` and ``death_date`` with 2015-01-01 as the
    fill date, and a Kaplan-Meier curve is fitted and plotted. The second
    curve is drawn on the axes created by the first.

    ``df`` is modified in place: after the call its ``death_date``, ``T``
    and ``E`` columns hold the values from the second definition.
    """
    obs_time = datetime.datetime(2015, 1, 1)

    for idx, death_time_fn in enumerate((get_death_time_v1, get_death_time_v2)):
        # death date for every row under the current definition
        df['death_date'] = df[cols_payments].apply(lambda row: death_time_fn(row), axis=1)

        # duration (in months) and churn status
        T, E = datetimes_to_durations(
            df['incorporation_date'],
            df['death_date'],
            freq='M',
            fill_date=obs_time,
        )
        df['T'] = T
        df['E'] = E

        kmf = KaplanMeierFitter()
        kmf.fit(T, event_observed=E)

        if idx:
            # later curves reuse the axes created for the first one
            kmf.plot(ax=ax)
        else:
            ax = kmf.plot()
# Record which employees have no quit date BEFORE the gaps are filled below.
# They had not quit within the data, so they are censored observations, not
# churn events.
still_employed = data['quit_date'].isna()

# NOTE(review): '2015-12-13' is presumably the end of the observation
# window; confirm against the data source.
data['quit_date'] = data.quit_date.fillna('2015-12-13')

data['quit_date'] = pd.to_datetime(data['quit_date'])
data['join_date'] = pd.to_datetime(data['join_date'])

# lifelines has a helper that derives durations suitable for survival analysis
from lifelines.utils import datetimes_to_durations

start_date = data['join_date']
end_date = data['quit_date']
T, E = datetimes_to_durations(start_date, end_date)
# The filled-in quit dates are placeholders: without this correction every
# employee would be reported as an observed event and the survival curve
# would be biased downwards.
E = E & ~still_employed.values
print('T (durations): ', T)
print('E (event_observed): ', E)

# data2 is another name for the same frame (no copy is made), so the two
# columns below also appear on `data`.
data2 = data
data2['duration'] = T
data2['observed'] = E

from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()

# T and E were computed above
kmf.fit(T, event_observed=E)
kmf.survival_function_.plot()
plt.title('Survival function of employee churn')
Example #20
0
def test_datetimes_to_durations_censor():
    """``None`` and empty-string end dates are expected to be flagged as not observed."""
    starts = ["2013-10-10", "2013-10-09", "2012-10-10"]
    ends = ["2013-10-13", None, ""]
    expected_observed = np.array([1, 0, 0], dtype=bool)

    # only the event flags are checked; the durations are discarded
    _, observed = utils.datetimes_to_durations(starts, ends, freq="Y")

    npt.assert_almost_equal(observed, expected_observed)
import pandas as pd
from lifelines.utils import datetimes_to_durations
from lifelines import KaplanMeierFitter


df = pd.read_csv('data/parl_data.csv')
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
# The first three digits of the start year label the decade, e.g. 1994 -> '199'.
df['decade'] = df['start_date'].map(lambda d: str(d.year)[:3])
T, C = datetimes_to_durations(df['start_date'], df['end_date'])
df['T'] = T
df['C'] = C

kmf = KaplanMeierFitter()

# NOTE(review): `subplot` is not imported in the visible part of this script;
# presumably it comes from a pylab-style environment. Confirm.
ax = subplot(111)
for decade in df['decade'].unique():
    ix = df['decade'] == decade
    # .loc replaces DataFrame.ix, which was removed in pandas 1.0
    kmf.fit(df.loc[ix, 'T'], df.loc[ix, 'C'], label=decade)
    if decade not in ('200', '199'):
        # all other decades: thin grey curves without confidence intervals
        kmf.plot(ax=ax, c='#777777', ci_show=False, alpha=0.5)
    else:
        # decades '199' and '200' are emphasised with a thick line
        kmf.plot(ax=ax, lw=4)
Example #22
0
    df[fecha] = pd.to_datetime(df[fecha])

# Remove rows with illogical dates: drop, in place, every row whose
# fecha_egreso is earlier than its fecha_admision.
df.drop(df[df.fecha_egreso < df.fecha_admision].index, inplace=True)
# df[df['fecha_admision'] > df['fecha_egreso']] # check

# Check for overly long waiting times
df['t_admin_egreso'] = df['fecha_egreso'] - df['fecha_admision']
# NOTE: the sorted frame is not assigned, so df itself stays unsorted.
df.sort_values(by='t_admin_egreso', ascending=False)
# Keep only rows whose difference has a whole-day component below 1.
df = df[df['t_admin_egreso'].dt.days < 1]

# Bare expression: has no effect when run as a plain script.
df

#%% Lifelines analysis
from lifelines.utils import datetimes_to_durations
# freq='h' requests the durations in hours.
T, E = datetimes_to_durations(start_times=df.fecha_admision, end_times=df.fecha_egreso, freq='h')
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)  # or, more succinctly, kmf.fit(T, E)
kmf.survival_function_
kmf.median_
kmf.plot()
plt.show()

#%% Lifelines analysis
# This cell repeats the import, duration computation and fit of the cell above.
from lifelines.utils import datetimes_to_durations
T, E = datetimes_to_durations(start_times=df.fecha_admision, end_times=df.fecha_egreso, freq='h')
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)  # or, more succinctly, kmf.fit(T, E)
kmf.survival_function_
Example #23
0
def test_datetimes_to_durations_censor():
    """Missing end dates (``None`` or ``''``) should come back with a False event flag."""
    # (start, end) pairs, one per subject; only the first has a real end date
    subjects = [
        ('2013-10-10', '2013-10-13'),
        ('2013-10-09', None),
        ('2012-10-10', ''),
    ]
    starts = [begin for begin, _ in subjects]
    ends = [finish for _, finish in subjects]

    _, observed = utils.datetimes_to_durations(starts, ends, freq='Y')

    npt.assert_almost_equal(observed, np.array([1, 0, 0], dtype=bool))
Example #24
0
def test_datetimes_to_durations_will_handle_dates_above_fill_date():
    """The subject ending after ``fill_date`` is expected to be flagged as not observed."""
    cutoff = "2013-10-12"
    # (start, end) pairs; only the last end date lies after the cutoff
    subjects = [
        ("2013-10-08", "2013-10-10"),
        ("2013-10-09", "2013-10-12"),
        ("2013-10-10", "2013-10-15"),
    ]
    starts = [begin for begin, _ in subjects]
    ends = [finish for _, finish in subjects]

    _, observed = utils.datetimes_to_durations(starts, ends, freq="Y", fill_date=cutoff)

    npt.assert_almost_equal(observed, np.array([1, 1, 0], dtype=bool))
Example #25
0
def test_datetimes_to_durations_custom_censor():
    """End dates matching an entry of ``na_values`` should come back unobserved."""
    call_kwargs = {'freq': 'Y', 'na_values': ["NaT", ""]}
    starts = ['2013-10-10', '2013-10-09', '2012-10-10']
    ends = ['2013-10-13', "NaT", '']

    result = utils.datetimes_to_durations(starts, ends, **call_kwargs)
    _, observed = result  # durations are not part of this check

    npt.assert_almost_equal(observed, np.array([1, 0, 0], dtype=bool))
Example #26
0
def test_datetimes_to_durations_censor():
    """Only the subject with a real end date is expected to be flagged as observed."""
    starts = ['2013-10-10', '2013-10-09', '2012-10-10']
    ends = ['2013-10-13', None, '']

    result = utils.datetimes_to_durations(starts, ends, freq='Y')
    observed = result[1]  # second element holds the event flags

    npt.assert_almost_equal(observed, np.array([1, 0, 0], dtype=bool))
Example #27
0
def test_datetimes_to_durations_will_handle_dates_above_fill_date():
    """With ``fill_date='2013-10-12'`` the expected event flags are True, True, False."""
    starts = ['2013-10-08', '2013-10-09', '2013-10-10']
    ends = ['2013-10-10', '2013-10-12', '2013-10-15']
    expected_observed = np.array([1, 1, 0], dtype=bool)

    result = utils.datetimes_to_durations(starts, ends, freq='Y', fill_date='2013-10-12')
    observed = result[1]  # second element holds the event flags

    npt.assert_almost_equal(observed, expected_observed)
Example #28
0
    # Reorder and select columns
    cols_chars = ['company_id', 'vertical', 'incorporation_date']
    cols_payments = [x for x in df if 'payments' in x]
    df = df[cols_chars + cols_payments]

    # Calculate the death date over all rows
    df['death_date'] = df[cols_payments].apply(lambda x: get_death_time_v2(x),
                                               axis=1)

    # Create duration and churn status
    start_times = df['incorporation_date']
    end_times = df['death_date']
    obs_time = datetime.datetime(2015, 1, 1)
    T, E = datetimes_to_durations(start_times,
                                  end_times,
                                  freq='M',
                                  fill_date=obs_time)
    df['T'] = T  # duration (in months)
    df['E'] = E  # churn status

    kmf = KaplanMeierFitter()

    # vertical_type = 'gym/fitness'
    vertical_types = np.unique(df['vertical'])
    for i, _type in enumerate(vertical_types):
        ix = (df['vertical'] == _type)
        kmf.fit(T[ix], E[ix], label=_type)
        if i == 0:
            ax = kmf.plot()
        else:
            kmf.plot(ax=ax)