def fit(self, X, y, **fit_params):
        """Fit a ``CoxPHFitter`` on ``X`` augmented with the target columns of ``y``.

        The duration column (and the event column, when ``self.event_col`` is
        not None) is copied from ``y`` into a copy of ``X``, so ``X`` itself is
        left untouched. The fitter is constructed from ``self.get_params()``
        and stored on ``self.estimator`` after fitting.

        Extra keyword arguments are forwarded to ``CoxPHFitter.fit``.

        Returns
        -------
        self
        """
        duration = self.duration_column
        event = self.event_col

        # Work on a copy so the caller's feature frame is not modified.
        frame = X.copy()
        frame[duration] = y[duration]
        if event is not None:
            frame[event] = y[event]

        fitter = CoxPHFitter(**self.get_params())
        fitter.fit(
            frame,
            duration_col=duration,
            event_col=event,
            initial_beta=self.initial_beta,
            include_likelihood=self.include_likelihood,
            strata=self.strata,
            **fit_params
        )

        self.estimator = fitter
        return self
Beispiel #2
0
def cox_regression(clean_df):
    """Fit a Cox model on ``clean_df`` and return rounded summary statistics.

    The model uses the 'time' column as duration and 'event' as the event
    indicator. For every name in the module-level ``stats_of_interest`` the
    matching summary column is rounded: 'p' goes through ``round_dic_eng``,
    every other statistic through ``round_dic``.

    Returns a dict mapping statistic name to its rounded column.
    """
    fitter = CoxPHFitter()
    fitter.fit(clean_df, 'time', event_col='event')
    summary_columns = fitter.summary.to_dict()

    return {
        stat: (round_dic_eng if stat == 'p' else round_dic)(summary_columns[stat])
        for stat in stats_of_interest
    }
Beispiel #3
0
def estCoxPHTE(df, treatment_col='treated', duration_col='dx', event_col='disease', covars=None):
    """Estimate treatment efficacy using a proportional hazards (Cox) model.

    Parameters
    ----------
    df : pandas.DataFrame

    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list, optional
        Other columns to include in the Cox model as covariates.
        Defaults to no extra covariates.

    Returns
    -------
    pandas.Series
        Indexed by ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']:

        TE : 1 - exp(coef) of the treatment column
        UB : 1 - exp(upper bound of the coefficient's confidence interval)
        LB : 1 - exp(lower bound of the coefficient's confidence interval)
        pvalue : first entry of the fitted model's p-values
        logrank_pvalue : p-value of a log-rank test between the rows with
            treatment == 0 and the rows with treatment == 1
        model : the fitted CoxPHFitter instance
    """
    # A None default avoids sharing one mutable list between calls; list()
    # additionally lets callers pass a tuple or other iterable of column names.
    covars = [] if covars is None else list(covars)

    coxphf = CoxPHFitter()

    coxphf.fit(df[[treatment_col, duration_col, event_col] + covars], duration_col=duration_col, event_col=event_col)

    te = 1 - np.exp(coxphf.hazards_.loc['coef', treatment_col])
    # NOTE(review): 1 - exp(x) is decreasing in x, so the value stored under
    # 'UB' (built from the coefficient's upper bound) is numerically the
    # smaller of the two efficacy bounds. Labels kept as-is for compatibility.
    ci = 1 - np.exp(coxphf.confidence_intervals_[treatment_col].loc[['upper-bound', 'lower-bound']])
    # Takes the first p-value; presumably this is the treatment column because
    # it is the first covariate handed to fit -- confirm against lifelines.
    pvalue = coxphf._compute_p_values()[0]

    ind1 = df[treatment_col] == 0
    ind2 = df[treatment_col] == 1
    results = logrank_test(df[duration_col].loc[ind1], df[duration_col].loc[ind2], event_observed_A=df[event_col].loc[ind1], event_observed_B=df[event_col].loc[ind2])
    index = ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']
    return pd.Series([te, ci['upper-bound'], ci['lower-bound'], pvalue, results.p_value, coxphf], index=index)
Beispiel #4
0
 def test_coxph_plot_partial_effects_on_outcome_with_multiple_variables(
         self, block):
     """Plot partial effects on the outcome for two covariates varied together."""
     rossi = load_rossi()
     fitter = CoxPHFitter()
     fitter.fit(rossi, "week", "arrest")

     covariates = ["age", "prio"]
     # One [age, prio] pair per plotted curve.
     value_grid = [[10, 0], [50, 10], [80, 90]]
     fitter.plot_partial_effects_on_outcome(covariates, value_grid)

     self.plt.title(
         "test_coxph_plot_partial_effects_on_outcome_with_multiple_variables"
     )
     self.plt.show(block=block)
Beispiel #5
0
def _train_cph(x, t, e, folds, params):

    if params is None:
        l2 = 1e-3
    else:
        l2 = params['l2']

    fold_model = {}

    for f in set(folds):
        df = convert_to_data_frame(x[folds != f], t[folds != f], e[folds != f])
        cph = CoxPHFitter(penalizer=l2).fit(df,
                                            duration_col='T',
                                            event_col='E')
        fold_model[f] = copy.deepcopy(cph)

    return fold_model
Beispiel #6
0
def do_baseline(foldnum, train, valid, exp_code, model_str):
    """Fit a baseline Cox model on ``train`` and print its predictions for ``valid``.

    The training frame is built from ``train.x`` with ``train.y`` as the
    duration and an event flag that is 1 where ``train.c`` equals 0.
    Missing values are replaced by column means in both frames.
    ``foldnum``, ``exp_code`` and ``model_str`` are accepted but not used.
    """
    train_df = pd.DataFrame(train.x)
    print(train_df.shape)
    train_df['duration'] = train.y
    # The event flag is the inverse of train.c: c == 0 is recorded as an event.
    train_df['event'] = [1 if v == 0 else 0 for v in train.c]
    train_df = train_df.fillna(train_df.mean())

    model = CoxPHFitter()
    model.fit(train_df, 'duration', event_col="event")
    model.print_summary()

    valid_df = pd.DataFrame(valid.x)
    valid_df = valid_df.fillna(valid_df.mean())
    log_hazards = model.predict_log_partial_hazard(valid_df)
    print(log_hazards)
Beispiel #7
0
 def test_coxph_plot_covariate_groups_with_multiple_variables_and_strata(
         self, block):
     """Plot covariate groups for two covariates on a model stratified by a random label."""
     rossi = load_rossi()
     # Random two-level column used only to exercise the strata code path.
     rossi["strata"] = np.random.choice(["A", "B"], size=rossi.shape[0])

     fitter = CoxPHFitter()
     fitter.fit(rossi, "week", "arrest", strata="strata")

     covariates = ["age", "prio"]
     value_grid = [[10, 0], [50, 10], [80, 90]]
     fitter.plot_covariate_groups(covariates, value_grid)

     self.plt.title(
         "test_coxph_plot_covariate_groups_with_multiple_variables_and_strata"
     )
     self.plt.show(block=block)
Beispiel #8
0
def test_cross_validator_returns_fitters_k_results():
    """k-fold CV over a list of fitters returns one list of k scores per fitter."""
    cf = CoxPHFitter()
    fitters = [cf, cf]

    for k in (3, 5):
        results = utils.k_fold_cross_validation(
            fitters,
            load_regression_dataset(),
            duration_col="T",
            event_col="E",
            k=k,
        )
        assert len(results) == 2
        assert len(results[0]) == len(results[1]) == k
Beispiel #9
0
def CoxAnalysis(pd_data, pd_surval, tp):
    """Run univariate or multivariable Cox regression and save a table and a plot.

    Parameters
    ----------
    pd_data : pandas.DataFrame
        Covariates, one column per feature.
    pd_surval : pandas.DataFrame
        Survival information; must provide the 'OS' (duration) and 'status'
        (event) columns used by the fit.
    tp : str
        'univariate' fits one model per column of ``pd_data`` and stacks the
        summaries; 'multivariable' fits a single model on all columns after
        dropping rows with any missing value.

    Side effects
    ------------
    Writes the summary table to 'CoxRegress.txt' (tab separated) and the
    coefficient plot to 'CoxRegress.pdf'.

    Raises
    ------
    ValueError
        If ``tp`` is not one of the two supported modes, or if 'univariate'
        is requested for a frame without columns.
    """
    # Local import: pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    import pandas as pd

    cph = CoxPHFitter(penalizer=0.1)
    if tp == 'univariate':
        if pd_data.shape[1] == 0:
            raise ValueError("univariate analysis needs at least one covariate column")
        summaries = []
        for i in range(pd_data.shape[1]):
            # Survival columns first, then the single covariate under test.
            df = pd.concat([pd_surval, pd_data.iloc[:, i]], axis=1)
            cph.fit(df, 'OS', event_col='status')
            summaries.append(cph.summary)
        pd_out = pd.concat(summaries)
    elif tp == 'multivariable':
        df = pd.concat([pd_data, pd_surval], axis=1)
        df = df.dropna(axis=0, how='any')
        cph.fit(df, 'OS', event_col='status', step_size=0.1)
        pd_out = cph.summary
    else:
        raise ValueError(
            "tp must be 'univariate' or 'multivariable', got %r" % (tp,))

    pd_out.to_csv('CoxRegress.txt', sep='\t', header=True, index=True)
    plt.style.use('my-paper')
    fig, axe = plt.subplots(figsize=(25, 8))
    # NOTE(review): in 'univariate' mode cph holds only the last fitted
    # covariate, so the plot shows that single model -- confirm this is intended.
    cph.plot(ax=axe)
    axe.set_ylim(-0.2, 3.2)
    axe.set_xlim(-2.5, 2.1)
    plt.savefig('CoxRegress.pdf')
def COXPH_backward_elimination(COX_dataset,penalizer=0):
    """Backward elimination on a Cox model using the largest p-value.

    Starting from a model on every column of ``COX_dataset`` ('Days' is the
    duration, 'Vitality' the event), the covariate with the highest p-value
    is dropped and the model refitted, until the highest p-value is below
    0.05 or the iteration budget (number of columns minus two) is used up.

    Returns the last fitted ``CoxPHFitter``.
    """
    model = CoxPHFitter(penalizer=penalizer)
    model.fit(COX_dataset, duration_col='Days', event_col='Vitality')

    # Budget is fixed from the initial column count, before anything is dropped.
    max_rounds = COX_dataset.shape[1] - 2
    for _ in range(max_rounds):
        p_values = model.summary['p']
        worst_factor = p_values.idxmax()
        if p_values.max() < 0.05:
            break

        COX_dataset = COX_dataset.drop(worst_factor, axis=1)
        model.fit(COX_dataset, duration_col='Days', event_col='Vitality')

    return model
Beispiel #11
0
def cox(d_male, d_female):
    """Fit a Cox model comparing male and female durations and print its summary.

    Every duration is recorded as an observed event (event = 1); males are
    coded sex = 0 and females sex = 1. The two group sizes are printed
    before fitting.
    """
    def _group_frame(durations, sex_code):
        # One row per duration, all marked as observed events.
        return pd.DataFrame({
            "time": durations,
            "event": 1,
            "sex": sex_code,
        })

    df = pd.concat([_group_frame(d_male, 0), _group_frame(d_female, 1)])

    print(len(d_male), len(d_female))
    model = CoxPHFitter()
    model.fit(df, duration_col="time", event_col="event")
    model.print_summary()
    def _compute_likelihood_ratio_test(self):
        """
        Likelihood ratio test of the fitted model against the trivial model
        that has no covariates.

        The null log-likelihood comes from CoxPHFitter._trivial_log_likelihood,
        evaluated on the last row of each group (grouped by the first index
        level) of the start/stop/event data and of the weights.

        Returns
        -------
        test_stat, degrees_freedom, log(p_value)
        """
        last_rows = self.start_stop_and_events.groupby(level=0).last()
        last_weights = self.weights.groupby(level=0).last()
        trivial_dataset = last_rows[["event", "stop"]].join(last_weights)

        stop_times = trivial_dataset["stop"].values
        events = trivial_dataset["event"].values
        weights = trivial_dataset["__weights"].values
        ll_null = CoxPHFitter._trivial_log_likelihood(stop_times, events, weights)

        test_stat = 2 * (self._log_likelihood - ll_null)
        degrees_freedom = self.hazards_.shape[1]
        _, p_value = chisq_test(test_stat, degrees_freedom=degrees_freedom, alpha=0.0)
        return test_stat, degrees_freedom, np.log(p_value)
def survival_analyze(dataframe,
                     lifetime_col,
                     dead_col,
                     strata_cols,
                     covariate_col=None):
    """Fit a stratified Cox model and plot its coefficients.

    ``lifetime_col`` and ``dead_col`` are passed positionally to
    ``CoxPHFitter.fit`` after the data frame, and ``strata_cols`` as
    ``strata``. When ``covariate_col`` is truthy, covariate-group curves are
    also plotted for the values 0 and 1.
    """
    # Based on notebook here. https://github.com/CamDavidsonPilon/lifelines/tree/master/examples
    import pandas as pd
    from matplotlib import pyplot as plt
    from lifelines import CoxPHFitter

    fitter = CoxPHFitter()
    cph = fitter.fit(dataframe, lifetime_col, dead_col, strata=strata_cols)

    # NOTE(review): `ax` is not defined in this function; presumably a
    # module-level sequence of matplotlib axes -- confirm, otherwise this
    # line raises NameError.
    target_axes = ax[1]
    cph.plot(ax=target_axes)

    if not covariate_col:
        return
    cph.plot_covariate_groups(covariate_col, values=[0, 1])
def Cox_Model(train, test):
    '''
    Fit a penalized Cox model on ``train`` and score it on both data sets.

    train: training data frame
    test: test data frame

    Both frames must contain the duration column '生存时间(天)' and the
    event column '是否死亡'.

    Returns (train concordance index, test concordance index, fitted model).
    '''
    duration_col = '生存时间(天)'
    event_col = '是否死亡'

    cph = CoxPHFitter(penalizer=15)
    cph.fit(train,
            duration_col=duration_col,
            event_col=event_col,
            show_progress=True,
            step_size=1)

    def _c_index(frame):
        # The partial hazard is negated before being handed to concordance_index.
        return concordance_index(frame[duration_col],
                                 -cph.predict_partial_hazard(frame),
                                 frame[event_col])

    Cox_train_Cindex = _c_index(train)
    Cox_test_Cindex = _c_index(test)
    return Cox_train_Cindex, Cox_test_Cindex, cph
Beispiel #15
0
def fitcoxmodel(classification, T, E, pid, verbose=True):
    """Fit a single-covariate Cox model and return its coefficient statistics.

    ``classification`` becomes the covariate 'Cov', ``T`` the duration,
    ``E`` the event indicator and ``pid`` the index of the fitted frame.
    When ``verbose`` is true the model summary is printed.

    Returns (coef, [lower 0.95, upper 0.95], p) for the covariate.
    """
    frame = pd.DataFrame(data={'T': T, 'E': E, 'Cov': classification},
                         index=pid)

    model = CoxPHFitter()
    model.fit(frame, duration_col='T', event_col='E')

    if verbose:
        model.print_summary()

    summary = model.summary

    def _cov_stat(column):
        # Value of one summary column for the single covariate.
        return summary[column]['Cov']

    coef = _cov_stat('coef')
    CI = [_cov_stat('lower 0.95'), _cov_stat('upper 0.95')]
    p = _cov_stat('p')
    return coef, CI, p
def main():
    """Merge the mother and rainfall data, fit a Cox model and print its summary.

    Input paths come from ``commandLineParser()`` (``rainfall_data`` and
    ``DHS_data``). The frames are left-joined on 'DHSID' and 'Year' and
    indexed by 'IDHSPID'; helper columns are dropped and the percentile
    flags are converted to 0/1 before fitting on 'Event Time' /
    'Event Occured'.
    """
    cmd_args = commandLineParser()

    rainfall_df = pd.read_csv(cmd_args.rainfall_data)
    mother_df = pd.read_csv(cmd_args.DHS_data)

    merged = pd.merge(mother_df, rainfall_df, on=['DHSID', 'Year'], how='left')
    merged.set_index('IDHSPID', inplace=True)

    # Drop only those helper columns that are actually present.
    drop_columns = ['DHSID', 'Year', r'%-ile', 'Total Rainfall (mm)']
    present = [column for column in merged.columns if column in drop_columns]
    merged.drop(present, axis=1, inplace=True)

    # Boolean flags become ones and zeros.
    for flag in [r'<5%-ile', r'<10%-ile', r'<15%-ile']:
        merged[flag] = (merged[flag] == True).astype(int)

    model = CoxPHFitter()
    model.fit(merged, 'Event Time', event_col='Event Occured')
    model.print_summary()
def coxph_smoke():
    """Smoke test: the H2O Cox model on the Rossi data agrees with lifelines.

    Checks that the H2O model gets an id, reports the expected formula,
    predicts one row per input row, and that its concordance is within
    0.001 of the value computed for the lifelines model.
    """
    rossi = load_rossi()

    lifelines_model = CoxPHFitter()
    lifelines_model.fit(rossi, duration_col='week', event_col='arrest')
    lifelines_model.print_summary()

    predictors = ["age", "fin", "race", "wexp", "mar", "paro", "prio"]
    rossiH2O = h2o.H2OFrame(rossi)
    cphH2O = H2OCoxProportionalHazardsEstimator(stop_column="week")
    cphH2O.train(x=predictors, y="arrest", training_frame=rossiH2O)

    assert cphH2O.model_id != ""
    assert cphH2O.formula() == "Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio", \
        "Expected formula to be 'Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio' but it was " + cphH2O.formula()

    predictions = cphH2O.predict(test_data=rossiH2O)
    assert len(predictions) == len(rossi)

    h2o_metrics = cphH2O.model_performance(rossiH2O)
    py_concordance = concordance_for_lifelines(lifelines_model)

    assert abs(py_concordance - h2o_metrics.concordance()) < 0.001
Beispiel #18
0
def cox_regression_experiment():
    """Fit a heavily penalized Cox model on the visit data and cross-validate it.

    Features and labels are loaded from two .npy files, flattened into a
    94-column table (also saved as 'allPatient_now.csv') and fitted with
    column '0' as the duration and column '93' as the event. The scores of
    a 5-fold cross-validation, their mean and their standard deviation are
    printed.
    """
    dynamic_features = np.load('pick_5_visit_features_merge_1.npy')[
        0:2100, :, :-2]
    # NOTE(review): the astype result is discarded, so the array keeps its
    # original dtype -- confirm whether the cast was meant to be assigned.
    dynamic_features.astype(np.int32)

    visits_per_patient = dynamic_features.shape[1]
    labels = np.load('pick_5_visit_labels_merge_1.npy')[:, :, -4].reshape(
        -1, visits_per_patient, 1)

    stacked = np.concatenate((dynamic_features, labels), axis=2)
    data_set = pd.DataFrame(stacked.reshape(-1, 94))
    # Column labels are stringified so they can be addressed as '0' ... '93'.
    data_set.columns = [str(x) for x in list(data_set.columns.values)]

    np.savetxt('allPatient_now.csv', data_set, delimiter=',')
    print(list(data_set.columns.values))

    cph = CoxPHFitter(penalizer=100)
    cph.fit(data_set, duration_col='0', event_col='93', show_progress=True)
    cph.print_summary()

    scores = k_fold_cross_validation(cph, data_set, '0', event_col='93', k=5)
    print(scores)
    print(np.mean(scores))
    print(np.std(scores))
Beispiel #19
0
def cox_Proportional_hazard_model():
    """Fit a standard Cox proportional hazards model and print its c-index.

    Uses the module-level ``data_train`` and ``data_test`` frames with
    'duration_d' as the duration and 'CVD' as the event column. The
    concordance index is printed for the training set and then for the
    test set.
    """
    rule = '---------------------------------------'
    print(rule)
    print('Standard Cox proportional hazards model')
    print(rule)

    cph = CoxPHFitter()
    cph.fit(data_train,
            duration_col='duration_d',
            event_col='CVD',
            show_progress=True)

    def _report(prefix, frame):
        # Discrimination of the fitted model on one data set.
        prediction = cph.predict_partial_hazard(frame)
        print(prefix + str(
            concordance_index(frame.duration_d, -prediction, frame.CVD)))

    _report("\ntrain data c-index = ", data_train)
    _report("\ntest data c-index = ", data_test)
Beispiel #20
0
    def create_data_df(self):
        """Build the output table of per-level Cox statistics and reducer values.

        One output row is created per ``(names, index)`` entry found in the
        ``list_index`` of every config in ``self.v_stack.configs``. The
        columns are ``self.header`` followed by "hr", "lcl", "ucl" and
        "p_value"; further columns are filled from ``self.h_stack.configs``.

        The work happens in two passes:

        1. For each vertical config and each of its ``args`` columns, the
           column is one-hot encoded (first level dropped), a Cox model is
           fitted on the dummies plus "_observed" / "_time_to_observation",
           and the exponentiated results are stored per level, with the
           dropped level added back as a row of ones.
        2. For each output row, the header cells are written and every
           horizontal config either blanks its column (kind "space") or
           writes the value of its reducer over the matching rows.

        Returns
        -------
        pandas.DataFrame
            The populated output table.
        """
        row_names_index = [(names, index) for v_conf in self.v_stack.configs
                           for (names, index) in v_conf.list_index.items()]
        col_names = self.header + ["hr", "lcl", "ucl", "p_value"]
        # Start from an all-zero frame with one row per (names, index) pair.
        output_df = pd.DataFrame(columns=col_names,
                                 data=0,
                                 index=range(len(row_names_index)))

        #  loop over vconf
        #     create sub df with cols as [1:] of rows in vconf + time value + other value
        #     apply methods
        # get results
        # use results to populate outputs

        # Running output-row counter; incremented before each write below.
        i = -1
        for v_conf in self.v_stack.configs:
            col_names = set([x for x in v_conf.args])
            name_df_dict = {}
            for col_name in col_names:
                one_hot = pd.get_dummies(self.df[col_name], drop_first=True)

                # Identify the level that drop_first removed: the one non-null
                # value of the column without a dummy column.
                non_dropped = [x for x in one_hot.columns]
                dropped = [
                    x for x in self.df[col_name].unique()
                    if x not in non_dropped and not pd.isnull(x)
                ]
                assert len(dropped) == 1
                dropped = dropped[0]

                one_hot["_observed"] = self.df["_observed"]
                one_hot["_time_to_observation"] = self.df[
                    "_time_to_observation"]
                cph = CoxPHFitter()
                cph.fit(
                    one_hot,
                    duration_col="_time_to_observation",
                    event_col="_observed",
                    show_progress=False,
                )

                res = cph.confidence_intervals_
                res["hr"] = cph.params_
                res["lcl"] = res["95% lower-bound"]
                res["ucl"] = res["95% upper-bound"]
                res["p_value"] = cph._compute_p_values()
                res = res.drop(columns=["95% lower-bound", "95% upper-bound"])
                # NOTE(review): exp is applied to every remaining column,
                # including "p_value" -- confirm that exponentiating the
                # p-values is intended.
                res = res.apply(lambda x: np.exp(x))

                # Add the dropped level as a row of ones and list it first.
                old_index = res.index
                res.at[dropped, :] = 1
                res = res.reindex(index=[dropped] + list(old_index))
                name_df_dict[col_name] = res

            for (names, index) in v_conf.list_index.items():
                i += 1
                # join on names
                # names[0] selects the per-column result frame, names[1] the
                # level row within it.
                res_df = name_df_dict[names[0]]
                line = [x for x in names] + [x for x in res_df.loc[names[1]]]
                output_df.at[i, :] = line
            # for (names, index) in v_conf.list_index.items():
            #    (cat, group, sample_size, label) = names
            #    if group in non_dropped:
            #        cph[group] =

            # output.append([])

        for header in self.header:
            output_df[header] = output_df[header].astype(str)

        for i, (row_names, row_index) in enumerate(row_names_index):
            # write the header cells for this row
            for header, name in zip(self.header, row_names):
                output_df.at[i, header] = name
            for h_conf in self.h_stack.configs:
                if h_conf.kind == "space":
                    # Spacer column: named by the first key of name_index and
                    # blanked for every row.
                    name = list(h_conf.name_index.keys())[0]
                    output_df[name] = output_df[name].astype(str)
                    output_df[name] = ""
                else:
                    for col_name, col_index in h_conf.name_index.items():
                        # Rows selected by both the row index and the column index.
                        idx = row_index & col_index
                        new_df = self.df.iloc[idx]
                        series_reducer = h_conf.reducer
                        reducer = getattr(Reducers(), series_reducer)
                        # NOTE(review): this check sits inside the branch taken
                        # when kind != "space", so it appears unreachable.
                        if h_conf.kind == "space":
                            output_df[name] = output_df[name].astype(str)

                        try:

                            out_val = reducer(new_df[h_conf.name])
                            output_df.at[i, col_name] = out_val
                        except KeyError:
                            # On KeyError the column is made object-typed and
                            # the cell is marked with the string "ERROR".
                            output_df[col_name] = output_df[col_name].astype(
                                object)
                            print(h_conf.name)
                            output_df.at[i, col_name] = "ERROR"

        return output_df
Beispiel #21
0
# -*- coding: utf-8 -*-
# cox regression

if __name__ == "__main__":
    # Timing script: fit a stratified Cox model with a spline baseline on the
    # Rossi data set, print its score and the elapsed wall-clock time.
    import pandas as pd
    import time
    import numpy as np

    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi, load_regression_dataset

    # Number of times the data set is stacked on itself to enlarge the problem.
    reps = 1
    df = load_rossi()
    # df['s'] = "a"
    df = pd.concat([df] * reps)
    print(df.shape)

    cph = CoxPHFitter(baseline_estimation_method="spline",
                      n_baseline_knots=3,
                      strata=["wexp"])
    # The timed section covers both the fit and the score call below.
    start_time = time.time()
    cph.fit(df,
            duration_col="week",
            event_col="arrest",
            show_progress=True,
            timeline=np.linspace(1, 60, 100))
    print(cph.score(df))
    print("--- %s seconds ---" % (time.time() - start_time))
Beispiel #22
0
# -*- coding: utf-8 -*-
# cox regression

if __name__ == "__main__":
    # Timing script: fit a Cox model with batch_mode=True on 16 stacked copies
    # of the Rossi data set and print the elapsed wall-clock time of the fit.
    import pandas as pd
    import time
    import numpy as np

    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi

    df = load_rossi()
    # Stack the data set 16 times to enlarge the problem.
    df = pd.concat([df] * 16)
    # df = df.reset_index()
    # df['week'] = np.random.exponential(1, size=df.shape[0])
    cp = CoxPHFitter()
    # Only the fit is timed; the summary is printed after the measurement.
    start_time = time.time()
    cp.fit(df, duration_col="week", event_col="arrest", batch_mode=True)
    print("--- %s seconds ---" % (time.time() - start_time))
    cp.print_summary()
Beispiel #23
0
    ts     = ts[ np.random.choice(N, set_size, replace=True) ]
    es     = np.random.binomial(1, (1-censor_rate), set_size)

    # Create a data-frame for R:
    df = pd.DataFrame({
            'time'   : ts,
            'status' : es,
            'x1'     : np.random.uniform(-1.0, 1.0, set_size)})


    # Normalize:
    df['x1'] = (df['x1'] - df['x1'].mean()) / df['x1'].std()

    # Compute likelihood with R:
    r_out  = rfunc( df )
    preds, r_lik  = np.asarray(r_out[0]), np.negative(np.round(r_out[1][0],4))
    tf_lik_r = K.eval( efron_estimator_tf(K.variable(ts), K.variable(es), K.variable(preds)) )

    # Compute ll with Lifelines:
    cp = CoxPHFitter()
    cp.fit(df, 'time', 'status', initial_beta=np.ones((1,1))*0.543, step_size=0.0)
    preds = cp.predict_log_partial_hazard(df.drop(['time', 'status'], axis=1)).values[:, 0]
    tf_lik_lifelines = K.eval( efron_estimator_tf(K.variable(ts), K.variable(es), K.variable(preds)) )

    print( 'TensorFlow w/ R: ', tf_lik_r )
    print( 'R-survival : ', r_lik )
    print( 'TensorFlow w/ lifelines: ', tf_lik_lifelines )
    print( 'Lifelines : ', np.negative(cp._log_likelihood), end='\n\n')

# done.
Beispiel #24
0
                name.append("zhongliu__f," + zhongliu_3DT1[0][id % 1781])
        # print("name:",name[0],"\n",name[1],"\n",name[2],"\n",name[3],"\n",name[4],"\n")

        # COX
        data = {
            'T': y_train[:, 0],
            'E': y_train[:, 1],
            '%s' % name[0]: X_selected[:, 0],
            '%s' % name[1]: X_selected[:, 1],
            '%s' % name[2]: X_selected[:, 2],
            '%s' % name[3]: X_selected[:, 3],
            '%s' % name[4]: X_selected[:, 4],
        }
        df = pd.DataFrame(data)
        # cph = CoxPHFitter(penalizer=0.1, l1_ratio=1.0)
        cph = CoxPHFitter()
        cph.fit(df, duration_col='T', event_col='E')
        # c_index = COX(X_selected,y_train,name)
        train_c_index.append(cph.concordance_index_)
        print("Train_c_index:", cph.concordance_index_)
        # 预测c_index
        # data_test
        data_test = {
            'T': y_test[:, 0],
            'E': y_test[:, 1],
            '%s' % name[0]: X_test[:, ID[0] - 1],
            '%s' % name[1]: X_test[:, ID[1] - 1],
            '%s' % name[2]: X_test[:, ID[2] - 1],
            '%s' % name[3]: X_test[:, ID[3] - 1],
            '%s' % name[4]: X_test[:, ID[4] - 1],
            #  '%s' % name[0]: X_test[:, ID[0]+1],
Beispiel #25
0
def test_cross_validator_with_stratified_cox_model():
    """Smoke test: k-fold cross-validation runs with a Cox model stratified on 'race'."""
    stratified_fitter = CoxPHFitter(strata=["race"])
    rossi = load_rossi()
    utils.k_fold_cross_validation(
        stratified_fitter,
        rossi,
        duration_col="week",
        event_col="arrest",
    )
from lifelines.datasets import generate_regression_dataset
regression_dataset = generate_regression_dataset()
from lifelines import AalenAdditiveFitter, CoxPHFitter
# Fit a Cox proportional hazards model and an Aalen additive model on the
# same data ('T' is the duration column, 'E' the event column).
cf = CoxPHFitter()
cf.fit(regression_dataset, duration_col='T', event_col='E')
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')
# Covariates only. `columns - [...]` is no longer a set difference in pandas,
# so the duration/event columns are removed with Index.difference instead.
x = regression_dataset[regression_dataset.columns.difference(['E', 'T'])]
# `.ix` was removed from pandas; `.loc` keeps the label-based slice, which
# includes both end labels (rows labelled 10 through 12).
aaf.predict_survival_function(x.loc[10:12]).plot()
aaf.plot()
import pandas as pd
from lifelines import WeibullAFTFitter, CoxPHFitter

# This is an implementation of https://uwspace.uwaterloo.ca/bitstream/handle/10012/10265/Cook_Richard-10265.pdf
# NOTE(review): `np` and `weibull_min` are used below but are not imported in
# the visible lines -- presumably imported elsewhere; confirm.

# Simulation settings: sample size, success probability of Z, and the
# coefficients of X and Z on the log-time scale.
N = 50000
p = 0.5
bX = np.log(0.5)
bZ = np.log(4)

# Z and X are binary draws; X_ is a normal draw centred at 20000 with scale 10.
Z = np.random.binomial(1, p, size=N)
X = np.random.binomial(1, 0.5, size=N)
X_ = 20000 + 10 * np.random.randn(N)

W = weibull_min.rvs(1, scale=1, loc=0, size=N)

# Log survival time depends on X and Z; X_ does not enter the outcome.
Y = bX * X + bZ * Z + np.log(W)
T = np.exp(Y)

#######################################

# Z is left out of the modelling frame although it enters Y above.
df = pd.DataFrame({"T": T, "x": X, "x_": X_})


wf = WeibullAFTFitter().fit(df, "T")
wf.print_summary(4)


cph = CoxPHFitter().fit(df, "T", show_progress=True, step_size=1.0)
cph.print_summary(4)
Beispiel #28
0
# Convert to data frame
# (duration, not_censor, age and college are defined outside the visible lines)
data = pd.DataFrame({'duration': duration, 'event': not_censor, 'age': age, 'college': college})

# Plot observations with censoring
# plot_lifetimes(duration, event_observed = not_censor)

# Kaplan Meier Summary for Simulated Data
from lifelines import KaplanMeierFitter
kmf =  KaplanMeierFitter()
kmf.fit(duration, event_observed = not_censor)
kmf.survival_function_.plot()

# Cox-PH Model Regression
from lifelines import CoxPHFitter
cf = CoxPHFitter()
cf.fit(data, 'duration', event_col = 'event')
cf.print_summary()

## Get Predictions from Model ##

# 24 year old college grad
#college_24 = pd.DataFrame({'age':[24], 'college':[1]})
#cf.predict_survival_function(college_24).plot()

# 65 year old high school grad
#hs_65 = pd.DataFrame({'age':[65], 'college':[0]})
#cf.predict_survival_function(hs_65).plot()

# Covariate profiles for a 24yr-old College Grad, a 65yr-old HS Grad and an
# "Average" subject; the descriptive labels are stored in a column named 'index'.
mixed = pd.DataFrame({'age':[24, 65,42], 'college':[1,0,.4], 'index': ['24yr old College Grad','65yr old HS Grad','Average']})
# Boolean mask: patients with a history of neoadjuvant treatment.
tx = df['history_of_neoadjuvant_treatment']=='Yes'
ax = plt.subplot(111)

# Kaplan-Meier curve for the treated group. `.ix` was removed from pandas;
# `.loc` performs the same boolean-mask / column-label selection.
kmf1 = KaplanMeierFitter(alpha=0.95)
kmf1.fit(durations=df.loc[tx, survival_col], event_observed=df.loc[tx, censor_col], label=['Tx==Yes'])
kmf1.plot(ax=ax, show_censors=True,  ci_show=False)


# Kaplan-Meier curve for the untreated group, drawn on the same axes.
kmf2 = KaplanMeierFitter(alpha=0.95)
kmf2.fit(durations=df.loc[~tx, survival_col], event_observed=df.loc[~tx, censor_col], label=['Tx==No'])
kmf2.plot(ax=ax, show_censors=True,  ci_show=False )

add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.title ('Acute myeloid leukemia survival analysis with Tx and without Tx')
plt.xlabel(survival_col)
plt.savefig('km.png')

# Log-rank test between the treated and untreated groups.
results = logrank_test(df.loc[tx, survival_col], df.loc[~tx, survival_col], df.loc[tx, censor_col], df.loc[~tx, censor_col], alpha=.99 )
results.print_summary()

# Cox model on age at diagnosis only, restricted to rows where age is known.
cox = CoxPHFitter(normalize=False)
df_age = df[[survival_col, censor_col, 'age_at_initial_pathologic_diagnosis']]
df_age = df_age[pd.notnull(df_age['age_at_initial_pathologic_diagnosis'])]
cox = cox.fit(df_age, survival_col, event_col=censor_col, include_likelihood=True)
cox.print_summary()

scores = k_fold_cross_validation(cox, df_age, survival_col, event_col=censor_col, k=10)
# print() calls instead of Python 2 print statements, which are a syntax
# error under Python 3.
print(scores)
print('Mean score', np.mean(scores))
print('Std', np.std(scores))
 
Beispiel #30
0
def multivariate(df):
    """Fit a Cox model on ``df`` ('time' / 'status' columns) and print its summary.

    Fitting progress is shown while the model converges. The results are
    printed, not returned.
    """
    from lifelines import CoxPHFitter

    model = CoxPHFitter()
    model.fit(
        df,
        duration_col='time',
        event_col='status',
        show_progress=True,
    )
    model.print_summary()
#Survival Regression

from lifelines.datasets import load_regression_dataset
regression_dataset = load_regression_dataset()

regression_dataset.head()

from lifelines import AalenAdditiveFitter, CoxPHFitter

# Using Cox Proportional Hazards model
cf = CoxPHFitter()
cf.fit(regression_dataset, 'T', event_col='E')
cf.print_summary()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, 'T', event_col='E')

# Covariates only. `columns - [...]` is no longer a set difference in pandas,
# so the duration/event columns are removed with Index.difference instead.
x = regression_dataset[regression_dataset.columns.difference(['E', 'T'])]
# `.ix` was removed from pandas; `.loc` keeps the label-based slice, which
# includes both end labels (rows labelled 10 through 12).
aaf.predict_survival_function(x.loc[10:12]).plot()
Beispiel #32
0
    viz.plot_survival_curves(experiment_name = 'RSF', output_file=output_file, **rec_dict)

if __name__ == '__main__':
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    args = parse_args()
    print("Arguments:",args)

    # Load Dataset
    print("Loading datasets: " + args.dataset)
    datasets = utils.load_datasets(args.dataset)

    # Train CPH model
    print("Training CPH Model")
    train_df = utils.format_dataset_to_df(datasets['train'], DURATION_COL, EVENT_COL)
    cf = CoxPHFitter()
    results = cf.fit(train_df, duration_col=DURATION_COL, event_col=EVENT_COL, 
        include_likelihood=True)
    cf.print_summary()
    print("Train Likelihood: " + str(cf._log_likelihood))

    if 'valid' in datasets:
        metrics = evaluate_model(cf, datasets['valid'])
        print("Valid metrics: " + str(metrics))

    if 'test' in datasets:
        metrics = evaluate_model(cf, datasets['test'], bootstrap=True)
        print("Test metrics: " + str(metrics))

    print("Saving Visualizations")
    if 'test' in datasets and args.treatment_idx is not None:
Beispiel #33
0
 def __init__(self):
     """Seed ``random`` with 0 and initialise the base class with a fresh CoxPHFitter.

     The second argument passed to the base initialiser is this instance's
     class name.
     """
     random.seed(0)
     fitter = CoxPHFitter()
     class_name = self.__class__.__name__
     super(CoxPH, self).__init__(fitter, class_name)
Beispiel #34
0
def _plot_kmf_single(df,
                     condition_col,
                     survival_col,
                     censor_col,
                     threshold,
                     title,
                     xlabel,
                     ylabel,
                     ax,
                     with_condition_color,
                     no_condition_color,
                     with_condition_label,
                     no_condition_label,
                     color_map,
                     label_map,
                     color_palette,
                     ci_show,
                     print_as_title):
    """
    Helper function to produce a single KM survival plot, among observations in df by groups defined by condition_col.

    All inputs are required - this function is intended to be called by `plot_kmf`.

    Groups are derived from `condition_col` in one of three ways: by comparing
    against `threshold` (a number, or the string "median") when a threshold is
    given; by category for object/categorical columns; or directly for boolean
    columns. Any other dtype raises ValueError.

    Returns the result object of the group comparison (log-rank test for two
    groups, `NullSurvivalResults` for one group, fitted Cox model otherwise),
    with `survival_data_series`, `event_data_series` and `desc` attached.
    """
    # make color inputs consistent hex format
    if colors.is_color_like(with_condition_color):
        with_condition_color = colors.to_hex(with_condition_color)
    if colors.is_color_like(no_condition_color):
        no_condition_color = colors.to_hex(no_condition_color)
    ## prepare data to be plotted; producing 3 outputs:
    # - `condition`, series containing category labels to be plotted
    # - `label_map` (mapping condition values to plot labels)
    # - `color_map` (mapping condition values to plotted colors)
    if threshold is not None:
        # Numeric split: rows above the threshold form the "with condition" group.
        is_median = threshold == "median"
        if is_median:
            threshold = df[condition_col].median()
        label_suffix = float_str(threshold)
        condition = df[condition_col] > threshold
        default_label_no_condition = "%s ≤ %s" % (condition_col, label_suffix)
        # The " (median)" marker is appended after the "≤" label was built,
        # so only the ">" label carries it.
        if is_median:
            label_suffix += " (median)"
        default_label_with_condition = "%s > %s" % (condition_col, label_suffix)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    elif df[condition_col].dtype == 'O' or df[condition_col].dtype.name == "category":
        # Categorical split: one group per distinct value.
        condition = df[condition_col].astype("category")
        if not label_map:
            label_map = dict()
            # List comprehension used only for its side effect of filling label_map.
            [label_map.update({condition_value: '{} = {}'.format(condition_col,
                                                        condition_value)})
                     for condition_value in condition.unique()]
        if not color_map:
            # One palette color per label, converted to hex.
            rgb_values = sb.color_palette(color_palette, len(label_map.keys()))
            hex_values = [colors.to_hex(col) for col in rgb_values]
            color_map = dict(zip(label_map.keys(), hex_values))
    elif df[condition_col].dtype == 'bool':
        # Boolean split: the column itself is the condition.
        condition = df[condition_col]
        default_label_with_condition = "= {}".format(condition_col)
        default_label_no_condition = "¬ {}".format(condition_col)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    else:
        raise ValueError('Don\'t know how to plot data of type\
                         {}'.format(df[condition_col].dtype))

    # produce kmf plot for each category (group) identified above
    # A single fitter instance is re-fitted and plotted once per group.
    kmf = KaplanMeierFitter()
    grp_desc = list()
    grp_survival_data = dict()
    grp_event_data = dict()
    grp_names = list(condition.unique())
    for grp_name, grp_df in df.groupby(condition):
        grp_survival = grp_df[survival_col]
        grp_event = (grp_df[censor_col].astype(bool))
        grp_label = label_map[grp_name]
        grp_color = color_map[grp_name]
        kmf.fit(grp_survival, grp_event, label=grp_label)
        desc_str = "# {}: {}".format(grp_label, len(grp_survival))
        grp_desc.append(desc_str)
        grp_survival_data[grp_name] = grp_survival
        grp_event_data[grp_name] = grp_event
        # Reuse the given axes when there is one; otherwise the first plot
        # call supplies the axes used for the remaining groups.
        if ax:
            ax = kmf.plot(ax=ax, show_censors=True, ci_show=ci_show, color=grp_color)
        else:
            ax = kmf.plot(show_censors=True, ci_show=ci_show, color=grp_color)

    ## format the plot
    # Set the y-axis to range 0 to 1
    ax.set_ylim(0, 1)
    # Relabel the existing ticks as integer percentages.
    y_tick_vals = ax.get_yticks()
    ax.set_yticklabels(["%d" % int(y_tick_val * 100) for y_tick_val in y_tick_vals])
    # plot title
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title(' | '.join(grp_desc))
    else:
        # No title requested: print the group descriptions instead.
        [print(desc) for desc in grp_desc]
    # axis labels
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    
    ## summarize analytical version of results
    ## again using same groups as are plotted
    if len(grp_names) == 2:
        # use log-rank test for 2 groups
        results = logrank_test(grp_survival_data[grp_names[0]],
                               grp_survival_data[grp_names[1]],
                               event_observed_A=grp_event_data[grp_names[0]],
                               event_observed_B=grp_event_data[grp_names[1]])
    elif len(grp_names) == 1:
        # no analytical result for a single group
        # NOTE(review): zero groups are not matched here and would fall
        # through to the Cox branch below -- confirm whether `<= 1` was meant.
        results = NullSurvivalResults()
    else:
        # cox PH fitter for >2 groups
        cf = CoxPHFitter()
        # Design matrix built by patsy from the condition, survival and
        # censor columns; the intercept column is removed before fitting.
        cox_df = patsy.dmatrix('+'.join([condition_col, survival_col,
                                         censor_col]),
                               df, return_type='dataframe')
        del cox_df['Intercept']
        results = cf.fit(cox_df, survival_col, event_col=censor_col)
        results.print_summary()
    # add metadata to results object so caller can print them
    results.survival_data_series = grp_survival_data
    results.event_data_series = grp_event_data
    results.desc = grp_desc
    return results
Beispiel #35
0
def main(filter_gap_days, rp_vs_radiation):
    """Assemble the survival cohort, fit a Cox PH model and export the data.

    The function reads the pre-processed CSV/pickle inputs under ``../data``,
    selects the patient groups to compare, joins the outcome columns with the
    comorbidity and PSA features, standardizes the continuous covariates,
    fits a ``CoxPHFitter`` and finally writes the modelling frame to CSV.

    Parameters
    ----------
    filter_gap_days : int or float
        Upper bound, in days, on the gap after biopsy. It is applied to
        ``pr_date_minus_biopsy_date`` (radiation file, which must also be
        strictly positive) and to ``rp_date_minus_biopsy_date_in_days``
        (outcome file).
    rp_vs_radiation : bool
        When truthy, the cohort is RP-only patients plus radiation-only
        patients (patients in both groups and those listed in the multi-RP
        file are excluded). Otherwise the cohort is treated patients
        (RP or radiation) plus the remaining patients without an RP date,
        and the ``rp_indicator`` column is renamed to ``treated``.

    Returns
    -------
    None
        Results are printed (column sums and the lifelines summary) and the
        final frame is written to
        ``../data/df_cox_data_death_causal_inference_rp_vs_radiation.csv`` or
        ``../data/df_cox_data_death_causal_inference.csv``.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only point this
    # at trusted, locally generated files.
    # Mapping EMPI -> biopsy date (the original comment calls it the biopsy
    # date; the values are subtracted from parsed timestamps below).
    with open('../data/empi_to_date_oi_dic.pkl', 'rb') as handle:
        empi_to_date_oi_dic = pickle.load(handle)

    # pre-process the data using radiation only patients / multiple RP patients
    df_first_date_rads = pd.read_csv(
        '../data/processed_data/first_date_rads.csv')
    df_first_date_rads.set_index('EMPI', inplace=True)
    gap_col = 'pr_date_minus_biopsy_date'
    # Make sure the column exists even when no EMPI has a known biopsy date;
    # NaN gaps fail both comparisons below and are therefore filtered out.
    if gap_col not in df_first_date_rads.columns:
        df_first_date_rads[gap_col] = np.nan
    for empi, pr_date in zip(df_first_date_rads.index,
                             df_first_date_rads.prdate_parsed.values):
        if empi in empi_to_date_oi_dic:
            df_first_date_rads.at[empi, gap_col] = (
                pd.to_datetime(pr_date) - empi_to_date_oi_dic[empi]).days
    gap_days = df_first_date_rads[gap_col]
    df_first_date_rads_filtered = df_first_date_rads.loc[
        (gap_days > 0) & (gap_days <= filter_gap_days)]
    radiation_empis = set(df_first_date_rads_filtered.index)

    df_multirp = pd.read_csv('../data/processed_data/multirp.csv')
    df_multirp.set_index('EMPI', inplace=True)
    multirp_empis = set(df_multirp.index)

    # get relevant data
    df_merged_comorb = pd.read_csv(
        '../data/merged/df_merged_biopsy_based_C.csv')
    df_merged_comorb.set_index(df_merged_comorb.columns[0], inplace=True)
    df_merged_comorb.index.name = 'EMPI'

    df_merged_psa_prior = pd.read_csv(
        '../data/merged/df_merged_biopsy_based_E.csv')
    df_merged_psa_prior.set_index(df_merged_psa_prior.columns[0], inplace=True)
    df_merged_psa_prior.index.name = 'EMPI'

    df_outcome_oi = pd.read_csv(
        '../data/df_outcome_final_biopsy_based_clean.csv')
    df_outcome_oi.set_index('EMPI', inplace=True)

    df_outcome_oi_filtered_rp_postive = df_outcome_oi.loc[
        df_outcome_oi.rp_date_minus_biopsy_date_in_days <= filter_gap_days]
    df_outcome_oi_rp_negative = df_outcome_oi.loc[
        df_outcome_oi.rp_date.isnull()]
    rp_negative_empis = set(df_outcome_oi_rp_negative.index)

    # Label sets are converted to lists before being handed to ``.loc``:
    # pandas no longer accepts a set as an indexer.
    if rp_vs_radiation:
        # exclude both RP and Radiation
        rp_positive_empis = set(df_outcome_oi_filtered_rp_postive.index)
        both_rp_radiation_empis = rp_positive_empis & radiation_empis
        df_outcome_oi_filtered_rp_postive = (
            df_outcome_oi_filtered_rp_postive.loc[list(
                rp_positive_empis - both_rp_radiation_empis)])
        # Parentheses spell out the precedence the original expression
        # already had ("-" binds tighter than "&").
        df_outcome_oi_rp_negative = df_outcome_oi_rp_negative.loc[list(
            rp_negative_empis & (radiation_empis - multirp_empis))]
        df_outcome_oi_filtered_total = pd.concat(
            [df_outcome_oi_filtered_rp_postive, df_outcome_oi_rp_negative])
    else:  # treated (radiation and rp) vs AS
        df_outcome_oi_treated_postive = pd.concat([
            df_outcome_oi_filtered_rp_postive,
            df_outcome_oi_rp_negative.loc[list(rp_negative_empis
                                               & radiation_empis)]
        ])
        df_outcome_oi_treated_negative = df_outcome_oi_rp_negative.loc[list(
            rp_negative_empis - radiation_empis)]
        df_outcome_oi_filtered_total = pd.concat(
            [df_outcome_oi_treated_postive, df_outcome_oi_treated_negative])

    # filter out negative time to death
    df_outcome_oi_filtered_final = df_outcome_oi_filtered_total.loc[
        df_outcome_oi_filtered_total.time_to_death_in_month > 0]

    # merge comorbidity data and psa prior data
    # A single list is reused for both lookups so the two pieces share the
    # same row order when concatenated column-wise.
    empi_common = list(
        set(df_merged_comorb.index) & set(df_merged_psa_prior.index))
    df_merged_psa_comorb = pd.concat([
        df_merged_comorb.loc[empi_common],
        df_merged_psa_prior.loc[empi_common].psa_prior_to_rp
    ],
                                     axis=1)
    df_merged_psa_comorb.drop(columns=['wscore_agg'], inplace=True)

    # merge outcome and feature dfs
    empi_common_outcome = list(
        set(df_outcome_oi_filtered_final.index)
        & set(df_merged_psa_comorb.index))
    outcome_cols_oi = ['death_ind', 'time_to_death_in_month']
    df_cox = pd.concat([
        df_merged_psa_comorb.loc[empi_common_outcome],
        df_outcome_oi_filtered_final.loc[empi_common_outcome][outcome_cols_oi]
    ],
                       axis=1)

    if not rp_vs_radiation:
        # Flag every treated patient (RP or radiation), then expose the
        # column under the name "treated".
        df_cox.loc[df_cox.index.isin(df_outcome_oi_treated_postive.index),
                   'rp_indicator'] = 1
        df_cox.rename(columns={'rp_indicator': 'treated'}, inplace=True)

    # drop uninformative and extreme minority features
    drop_cols = ['benign', 'Unknown/other']  # pre-filtering
    # remove metastatic cancer patients
    df_cox = df_cox.loc[df_cox.metacanc_agg == 0]
    if rp_vs_radiation:
        drop_cols_non_informative = [
            'metacanc_agg', 'rheumd_agg', 'copd_agg', 'mld_agg', 'diabwc_agg',
            'hp_agg', 'rend_agg', 'aids_agg'
        ]
    else:
        drop_cols_non_informative = [
            'metacanc_agg', 'cevd_agg', 'rheumd_agg', 'mld_agg', 'diabwc_agg',
            'hp_agg', 'aids_agg'
        ]
    df_cox_final = df_cox.drop(columns=drop_cols + drop_cols_non_informative)

    # standardize age, auxiiliary_mci_score and psa_prior_to_rp
    # (z-score using the population standard deviation, np.std default)
    standardize_cols = ['Age at RP', 'auxiiliary_mci_score', 'psa_prior_to_rp']
    for col in standardize_cols:
        df_cox_final[col] = (df_cox_final[col].values - np.mean(
            df_cox_final[col])) / np.std(df_cox_final[col].values)

    print('Final cox df stats : ')
    print(df_cox_final.sum())

    cph = CoxPHFitter(penalizer=0.00, l1_ratio=0)
    cph.fit(df_cox_final,
            'time_to_death_in_month',
            'death_ind',
            show_progress=False,
            step_size=0.1)
    cph.print_summary()

    # rename columns for downstream task compatibility
    df_cox_final.rename(columns={
        'death_ind': 'death',
        'rp_indicator': 'rp',
        'time_to_death_in_month': 'survtime'
    },
                        inplace=True)
    if rp_vs_radiation:
        df_cox_final.to_csv(
            '../data/df_cox_data_death_causal_inference_rp_vs_radiation.csv')
    else:
        df_cox_final.to_csv('../data/df_cox_data_death_causal_inference.csv')
Beispiel #36
0
    def run_coxph(self,
                  vcf_path,
                  phenotype_path,
                  output_path,
                  id_column,
                  covar_columns,
                  event_column,
                  time_column,
                  chrom=None,
                  start=None,
                  end=None):
        """Fit one Cox PH model per variant and append the results as CSV.

        For every variant yielded by the VCF (optionally restricted to a
        region), the numeric genotype is merged with the phenotype table on
        the sample id, rows with missing values are dropped, and a
        ``CoxPHFitter`` is fitted. The ``genotype`` row of the fit summary is
        written as one CSV line.

        Parameters
        ----------
        vcf_path : str
            Path passed to ``VCF``.
        phenotype_path : str
            CSV file read with ``pandas.read_csv``.
        output_path : str
            File opened in append mode; a header line is written on every
            call, followed by one line per successfully fitted variant.
        id_column : str
            Phenotype column holding the sample id (becomes the index).
        covar_columns : list of str
            Covariate columns kept alongside the event and time columns.
        event_column : str
            Event indicator column name.
        time_column : str
            Duration column name.
        chrom, start, end : optional
            Region restriction. It is only applied when all three are
            truthy; otherwise every variant is visited.

        Returns
        -------
        None

        Notes
        -----
        A ``ValueError`` raised while fitting or reading the summary of one
        variant is reported with ``print`` and that variant is skipped.
        """
        def parse_phenotypes(path, id_column, covar_columns, event_column,
                             time_column):
            # Load the phenotype table and keep only the modelling columns,
            # indexed by sample id.
            df = pd.read_csv(path)
            df = df.set_index(id_column)
            columns_to_keep = covar_columns + [event_column, time_column]
            df = df[columns_to_keep]
            return df

        def gt_types_convert(gt_types):
            # Recode genotype classes to an alternate-allele dosage.
            gt_types = gt_types.astype(float)
            # gt_types is array of 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT
            gt_types[gt_types == 2] = np.nan  # missing
            gt_types[gt_types == 3] = 2  # hom_alt
            return gt_types

        df_pheno = parse_phenotypes(phenotype_path, id_column, covar_columns,
                                    event_column, time_column)

        cph = CoxPHFitter()
        vcf = VCF(vcf_path)
        sample_list = vcf.samples
        # TODO: handle partial information (e.g. chr only)
        region = ''
        if (chrom) and (start) and (end):
            region = f'{chrom}:{start}-{end}'
        hdr = 'rsid,chr,start,end,ref,alt,maf,hr,lower_ci,upper_ci,se,z,p'

        # The context manager guarantees the handle is closed even when an
        # exception other than the ValueError handled below escapes the loop.
        with open(output_path, "a") as f:
            f.write(hdr + '\n')

            for v in vcf(region):
                ref = v.REF
                alt = ''.join(v.ALT)  # assuming biallelics, which is a sin
                genotypes_numeric = gt_types_convert(v.gt_types)

                df_geno = pd.DataFrame({
                    'id': sample_list,
                    'genotype': genotypes_numeric
                })
                df_geno = df_geno.set_index('id')
                df = pd.merge(df_geno,
                              df_pheno,
                              left_index=True,
                              right_index=True)
                # Samples with a missing genotype or phenotype value are
                # excluded from this variant's fit.
                df = df.dropna()
                try:
                    fit = cph.fit(df,
                                  event_col=event_column,
                                  duration_col=time_column)
                    genotype_fit = fit.summary.loc['genotype']
                    f.write(','.join(
                        str(x) for x in [
                            v.ID, v.CHROM, v.start, v.end, ref, alt, v.aaf,
                            genotype_fit["exp(coef)"],
                            genotype_fit["lower 0.95"],
                            genotype_fit["upper 0.95"],
                            genotype_fit["se(coef)"], genotype_fit["z"],
                            genotype_fit["p"]
                        ]) + '\n')
                except ValueError as e:
                    print(f'{v.start} {v.end} {v.ID} failed, {str(e)}')
Beispiel #37
0
def test_cross_validator_with_predictor():
    """k-fold cross validation with an explicit predictor returns one result per fold."""
    fitter = CoxPHFitter()
    regression_df = load_regression_dataset()
    cv_kwargs = {
        "duration_col": "T",
        "event_col": "E",
        "k": 3,
        "predictor": "predict_expectation",
    }
    fold_results = utils.k_fold_cross_validation(fitter, regression_df,
                                                 **cv_kwargs)
    # One entry is expected for each of the three folds requested above.
    assert len(fold_results) == 3
from lifelines.datasets import load_rossi
from lifelines import CoxPHFitter

import pdb

# Fit a Cox proportional hazards model on the rossi example dataset, using
# 'week' as the duration column and 'arrest' as the event column.
rossi_dataset = load_rossi()
cph = CoxPHFitter()
cph.fit(rossi_dataset, duration_col='week', event_col='arrest')

# Drops into the interactive debugger so the fitted `cph` can be inspected.
# NOTE(review): this blocks any non-interactive run — confirm it is intended.
pdb.set_trace()
    return one_hot_df


# Columns handed to `to_one_hot` for encoding.
to_encode = ['edema', 'stage']

# Encode the train / validation / test frames with the same column list.
# `to_one_hot`, `df_train`, `df_val` and `df_test` are defined elsewhere in
# the file.
one_hot_train = to_one_hot(df_train, to_encode)
one_hot_val = to_one_hot(df_val, to_encode)
one_hot_test = to_one_hot(df_test, to_encode)

# Show which columns the encoded validation frame ends up with.
print(one_hot_val.columns.tolist())
print(f"There are {len(one_hot_val.columns)} columns")

print(one_hot_train.shape)
# The result of head() is discarded here; it only displays something when
# this runs as the last expression of a notebook cell.
one_hot_train.head()

# Fit a Cox proportional hazards model on the encoded training data with
# 'time' as duration and 'status' as event indicator.
cph = CoxPHFitter()
cph.fit(one_hot_train, duration_col='time', event_col='status', step_size=0.1)

cph.print_summary()

# Plot the fitted model for the 'edema_1.0' covariate set to 0 and to 1.
# NOTE(review): newer lifelines releases appear to have renamed this method
# (plot_partial_effects_on_outcome) — confirm against the pinned version.
cph.plot_covariate_groups('edema_1.0', values=[0, 1])


def hazard_ratio(case_1, case_2, cox_params):
    """Return the hazard ratio of ``case_1`` relative to ``case_2``.

    The ratio is ``exp(cox_params . (case_1 - case_2))``, i.e. the
    exponential of the dot product between the coefficients and the
    difference of the two covariate vectors.
    """
    covariate_delta = case_1 - case_2
    log_ratio = np.dot(cox_params, covariate_delta)
    return np.exp(log_ratio)


# NOTE(review): `i` is not read anywhere in the visible code — presumably an
# index used by a later cell; confirm before removing.
i = 1
# This hazard curve shows us that there is low hazard of someone leaving starting off, then it gets worse,
# once you stay for 500 days you stay at least a bit more, then exponentially it gets worse!

# SURVIVAL REGRESSION -- figuring out the influences of other aspects on whether or not someone survives
# Can't use regular linear regression. Want to use Cox's model or Aalen's additive model.

# Cox's Proportional Hazard model
# "The idea behind the model is that the log-hazard of an individual is a linear function of their static covariates
# and a population-level baseline hazard that changes over time" - from https://lifelines.readthedocs.io/en/latest/Survival%20Regression.html

from lifelines.datasets import load_rossi
from lifelines import CoxPHFitter

# Reference run: fit a Cox proportional hazards model on the rossi example
# dataset ('week' = duration, 'arrest' = event) and print the summary.
rossi_dataset = load_rossi()
cph = CoxPHFitter()
cph.fit(rossi_dataset,
        duration_col='week',
        event_col='arrest',
        show_progress=True)

cph.print_summary()
rossi_dataset.info()
# The sampled rows are not stored; this only shows output in a notebook cell.
rossi_dataset.sample(20)

# Try this with our data
# First have to make categorical columns into number columns and get rid of columns we don't want for regression
# `data2` is defined elsewhere in the file.
data2.head()
# Get rid of employee_id, join_date and quit_date. Any other columns
# (e.g. event_observed, if present) are kept by this drop.
# NOTE(review): the intent was for company_id and dept to be categorical so
# get_dummies encodes them — confirm their dtypes before this step.
data2 = data2.drop(['employee_id', 'join_date', 'quit_date'], axis=1)
data3 = pd.get_dummies(data2)
Beispiel #41
0
# Scatter log(T) against x for the three toy datasets (`toy_none`, `toy_lin`,
# `toy_sq` are defined elsewhere in the file), overlaid on the same axes.
plt.scatter(toy_none.x.values, np.log(toy_none['T'].values), alpha=0.25)
plt.scatter(toy_lin.x.values, np.log(toy_lin['T'].values), alpha=0.25)
plt.scatter(toy_sq.x.values, np.log(toy_sq['T'].values), alpha=0.25)
plt.show()

# *** Survival is random w.r.t. X input values ***

## run a set of CoxPH fits, generate a boxplot of the outcomes
# One score is collected per repetition (100 independent simulated datasets).
c_index_list = []
for j in range(100):
    # potential X value, but unrelated in this case
    x = np.random.poisson(lam=10, size=3000)

    toy_dataset = gen_survival(x, x_relation=None)

    cph = CoxPHFitter()
    cph.fit(toy_dataset,
            duration_col='T',
            event_col='event_obs',
            show_progress=False)
    # NOTE(review): `score_` is presumably the concordance index (per the
    # list name); newer lifelines versions may expose it under another name.
    c_index_list.append(cph.score_)

# *** Survival is _linear_ w.r.t. X input values ***

c_index_list_lin = []
for j in range(100):
    # X values drawn as above; here they are passed with x_relation='lin'
    x = np.random.poisson(lam=10, size=3000)

    toy_dataset = gen_survival(x, x_relation='lin')
Beispiel #42
0

"""

# print cancer['T'].unique()
# print cancer['E'].unique()
# cancer = cancer.dropna()


# the '-1' term
# refers to not adding an intercept column (a column of all 1s).
# It can be added to the Fitter class.

covMatrix = cancer.cov()

cf = CoxPHFitter()
cf.fit(covMatrix, "T", event_col="E")  # extra paramater for categorical , strata=catVar
cf.print_summary()

curve = cf.predict_survival_function(cancer)
curve.plot()
plt.show()
print "hazard coeff", cf.hazards_
print "baseline ", cf.baseline_hazard_

"""
scores = k_fold_cross_validation(cf, covMatrix, 'T', event_col='E', k=3)
print scores
print np.mean(scores)
print np.std(scores)