def comp_cph(endpoint, sex, df_events, df_info):
    """Prepare data and fit a Cox PH model for the given endpoint.

    Cases are the rows of ``df_events`` whose ``ENDPOINT`` equals
    ``endpoint``; the cohort is every row of ``df_info`` whose ``SEX`` equals
    ``sex``.  Cohort members with no matching case row are treated as
    censored at ``FU_END_AGE``; cases use ``ENDPOINT_AGE`` as their duration.

    Returns:
        ``(dfcox, cph)``: the two-column frame (``outcome``, ``duration``)
        given to the fitter, and the fitted ``CoxPHFitter``.

    Raises:
        NotEnoughCases: fewer than ``MIN_CASES`` case rows exist (counted
            before the restriction to ``sex``).
    """
    logger.info(f"{endpoint} - {sex} - Computing cumulative incidence")
    logger.debug(f"{endpoint} - {sex} - Assigning cases and controls")

    # Cases
    is_case_row = df_events.ENDPOINT == endpoint
    cases = df_events.loc[is_case_row, ["FINNGENID", "ENDPOINT_AGE"]]
    if len(cases) < MIN_CASES:
        raise NotEnoughCases(f"Not enough cases (< {MIN_CASES}).")

    # Everyone of the requested sex (this also handles sex-specific endpoints)
    cohort = df_info.loc[df_info.SEX == sex, ["FINNGENID", "FU_END_AGE"]]
    cohort = cohort.merge(cases, how="left", on="FINNGENID")

    # After the left merge ENDPOINT_AGE is NaN exactly for the controls
    cohort["outcome"] = cohort.ENDPOINT_AGE.notna()
    cohort["duration"] = cohort.FU_END_AGE
    had_event = cohort.outcome
    cohort.loc[had_event, "duration"] = cohort.loc[had_event, "ENDPOINT_AGE"]

    # Keep only the columns cph.fit() should see
    dfcox = cohort.loc[:, ["outcome", "duration"]]

    logger.debug(f"{endpoint} - Fitting Cox model")
    cph = CoxPHFitter()
    cph.fit(dfcox, duration_col="duration", event_col="outcome")
    return dfcox, cph
def main(data_df):
    """Binarise the thresholded covariates, then fit and print a Cox model.

    Every column named in ``th_dict`` is replaced, in place in ``data_df``,
    by a 0/1 indicator of ``value >= th_dict[column]``.  Missing values are
    filled with 0 first, except in columns whose name contains ``"HU"``.
    The model uses ``Duration`` / ``Death`` as duration and event columns and
    a combined ``V-HU`` covariate (sum of the two binarised indicators, so it
    takes the values 0, 1 or 2).

    Args:
        data_df: frame holding the ``th_dict`` columns plus the covariates
            listed below; it is modified in place.
    """
    for key, cutoff in th_dict.items():
        # ``str.find`` returns 0 for a name that *starts* with "HU", so the
        # former ``not key.find("HU") > 0`` test let such columns through;
        # a membership test excludes every HU column as intended.
        if "HU" not in key:
            data_df[key] = data_df[key].fillna(0)
        data_df[key] = data_df[key].map(
            lambda value: 1 if value >= cutoff else 0)

    covariates = [
        "Duration",
        "Death",
        "Age",
        "Blood_Oxygen",
        "C-Reactive_protein",
        "Lymphocyte_count",
        "Cerebrovascular_Disease",
        "Sex",
        "Lactic_dehydrogenase",
    ]
    # White_blood_cell_count, Neutrophil_count and D-dimer are currently
    # left out of the model.
    combinations_df = data_df[covariates].copy()
    # 0, 1 or 2 depending on how many of the two indicators are set
    combinations_df["V-HU"] = (
        data_df["HU_of_consolidation"]
        + data_df["Volume_of_total_pneumonia_infection"]
    )

    cph = CoxPHFitter()
    cph.fit(combinations_df, "Duration", event_col="Death", step_size=0.01)
    cph.print_summary()
def f(train,threshold,test):
    """Cox-regress test-set survival on a one-component PCA gene score.

    Genes are scored with ``h(train)``; those scoring above 1 are
    standardised and reduced to one principal component, with both the
    scaler and the PCA fitted on ``train`` only.  The component score of
    each ``test`` row is stored in a ``w`` column of a copy of ``test`` and
    used as the only covariate of a Cox model fitted on those rows.

    NOTE(review): ``threshold`` is accepted but never read -- the cut-off is
    the literal 1 below; confirm whether it was meant to be used.

    Returns:
        Whatever ``cph.print_summary()`` returns.
    """
    gene_scores = pd.DataFrame(h(train), index=np.array(range(1, 21149)))
    passing_ids = gene_scores.index[gene_scores.iloc[:, 0] > 1].tolist()
    # Columns of the selected genes are named V<id>
    candidate_genes = ['V{0}'.format(gene_id) for gene_id in passing_ids]

    scaler = preprocessing.StandardScaler()
    scaled_train = scaler.fit_transform(train.loc[:, candidate_genes])
    scaled_test = scaler.transform(test.loc[:, candidate_genes])

    pca = sklearnPCA(n_components=1)
    train_component = pca.fit_transform(scaled_train)
    test_component = pca.transform(scaled_test)
    component_variance = pca.explained_variance_  # eigenvalue; not used below

    # ``assign`` returns a new frame, so the caller's ``test`` is untouched
    placeholder = pd.Series(np.ones(len(test.patient_id)))
    test = test.assign(w=placeholder)
    test['w'] = test_component
    testset_surv = test[['event_free_survival_time_days', 'death', 'w']]

    # Cox proportional hazards regression on the PCA score
    cph = CoxPHFitter()
    cph.fit(testset_surv, 'event_free_survival_time_days', event_col='death')
    return cph.print_summary()
def coxcalc(df, x, survivaltime, status):
    """Fit a single-covariate Cox model and return its summary table.

    Args:
        df: source frame; it is not modified.
        x: name of the covariate column (coerced with ``pd.to_numeric``).
        survivaltime: name of the duration column.
        status: name of the event-indicator column.

    Returns:
        ``cph.summary`` of the model fitted on the rows with no missing
        value in any of the three columns.
    """
    # Work on an explicit copy: assigning into the bare column selection is
    # chained assignment and raises pandas' SettingWithCopyWarning.
    model_df = df[[status, survivaltime, x]].copy()
    model_df[x] = pd.to_numeric(model_df[x])
    model_df = model_df.dropna()

    cph = CoxPHFitter()
    cph.fit(model_df, duration_col=survivaltime, event_col=status,
            show_progress=False)
    return cph.summary
def test_proportional_hazard_test_with_weights():
    """
    Weighted Cox fit; the ``rank`` statistic is compared with R's cox.zph.

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='rank')
    """
    observations = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "w": [1, 0.5, 2, 1, 1],
    }
    # Every subject has an observed event
    df = pd.DataFrame(observations).assign(E=True)

    fitter = CoxPHFitter()
    fitter.fit(df, "T", "E", weights_col="w")

    transforms = ["km", "rank", "log", "identity"]
    results = stats.proportional_hazard_test(
        fitter, df, time_transform=transforms)
    results.print_summary(5)

    rank_statistic = results.summary.loc["var1", "rank"]["test_statistic"]
    npt.assert_allclose(rank_statistic, 0.108, rtol=1e-2)
def mpss_ph_lifelines(self):
    """
    Performs proportional hazards regression using lifelines package.

    ``self.x_train`` supplies the features and ``self.scores`` is used as the
    duration column; every row is marked as an observed event.  Feature
    columns that are entirely zero are dropped before fitting.

    :return: DataFrame with one row per covariate and the columns
        ``covariate``, ``coef``, ``p``, ``feature`` (copy of ``covariate``)
        and ``coef_abs``, sorted by ``coef_abs`` in descending order
    """
    x_train = pd.DataFrame(self.x_train)

    # Remove any feature columns that are all 0 values, otherwise cannot run
    # regression.  ``.copy()`` makes the new columns below land in an
    # independent frame instead of a slice (no SettingWithCopyWarning).
    has_nonzero = (x_train != 0).any(axis=0)
    lifelines_dataset = x_train.loc[:, has_nonzero].copy()

    # Reformat for lifelines package
    lifelines_dataset['scores'] = self.scores
    lifelines_dataset['event'] = 1

    # Run proportional hazards regression
    cph = CoxPHFitter(penalizer=5, alpha=1)
    cph.fit(lifelines_dataset, duration_col='scores', event_col='event')

    # Dataframe with coefficients, absolute value of coefficients, and p-values
    importance = cph.summary.reset_index()[['covariate', 'coef', 'p']].copy()
    importance['feature'] = importance['covariate']
    importance['coef_abs'] = importance['coef'].abs()

    # Sort feature importance
    return importance.sort_values(
        'coef_abs', ascending=False).reset_index(drop=True)
def _compute_likelihood_ratio_test(self):
    """
    Likelihood ratio test of the fitted model against the trivial model.

    The trivial (no-covariate) model is obtained by fitting a ``CoxPHFitter``
    on a frame that holds only ``event``, ``stop`` and ``__weights``, taken
    from the last row of each level-0 index group, so that class does most
    of the work.

    Returns the test statistic, its degrees of freedom (number of columns of
    ``self.hazards_``) and the natural log of the p-value.
    """
    last_events = self.start_stop_and_events.groupby(level=0).last()
    last_weights = self.weights.groupby(level=0).last()[["__weights"]]
    trivial_dataset = last_events[["event", "stop"]].join(last_weights)

    null_model = CoxPHFitter()
    null_model.fit(
        trivial_dataset,
        "stop",
        "event",
        weights_col="__weights",
        show_progress=False,
    )

    ll_null = null_model._log_likelihood
    ll_alt = self._log_likelihood
    test_stat = 2 * ll_alt - 2 * ll_null
    degrees_freedom = self.hazards_.shape[1]

    _, p_value = chisq_test(
        test_stat, degrees_freedom=degrees_freedom, alpha=0.0)
    return test_stat, degrees_freedom, np.log(p_value)
def c_index_multiple_from_python(matrix, isdead, nbdays, matrix_test,
                                 isdead_test, nbdays_test, isfactor=False):
    """
    Fit a Cox model on the training data and score it on the test data.

    ``matrix`` / ``matrix_test`` hold the covariates; ``isdead*`` and
    ``nbdays*`` become the ``isdead`` (event) and ``nbdays`` (duration)
    columns.  Warnings raised while fitting are silenced.  ``isfactor`` is
    accepted but not used.

    Returns the concordance index of the fitted model on the test frame, or
    ``np.nan`` (after printing the exception) when fitting fails.
    """
    def _survival_frame(features, dead, days):
        # Covariates plus the two survival columns the fitter expects
        table = pd.DataFrame(features)
        table["isdead"] = dead
        table["nbdays"] = days
        return table

    frame = _survival_frame(matrix, isdead, nbdays)
    frame_test = _survival_frame(matrix_test, isdead_test, nbdays_test)

    cph = CoxPHFitter()
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cph.fit(frame, "nbdays", "isdead")
    except Exception as e:
        print(e)
        return np.nan

    return cph.score(frame_test, scoring_method="concordance_index")
def DoFeatureSelectionCPH(self, x, c, s, xnames, fold, sel_f_num, dev_index):
    """Rank features by the p-value of a univariate Cox model.

    Features whose standard deviation is not above 0.15 are discarded; each
    remaining feature is fitted alone in a Cox model with ``c`` as the event
    column (``E``) and ``s`` as the duration column (``S``).

    ``fold``, ``sel_f_num`` and ``dev_index`` are accepted but not used.

    Returns:
        ``(sort_idx, f_name_sort, f_score_sort)``: positions (within the
        retained features) ordered by ascending p-value, and the feature
        names and p-values in that order.
    """
    variance_th = 0.15  # compared against the per-column standard deviation
    xdf = pd.DataFrame(x, columns=xnames)
    sel_idx = xdf.std() > variance_th  # boolean mask over the columns
    xdf = xdf.loc[:, sel_idx]
    xnames = xnames[sel_idx]
    x = xdf.values

    # Materialise the names once; rebuilding the list on every iteration
    # made the loop quadratic in the number of features.
    feature_names = xnames.tolist()

    gene_p_value = []
    for i in tqdm(range(x.shape[1])):
        stacked = np.column_stack((x[:, i:i + 1], c, s))
        cph_train_df = pd.DataFrame(
            stacked, columns=[feature_names[i], 'E', 'S'])
        cph = CoxPHFitter()
        cph.fit(cph_train_df, duration_col='S', event_col='E',
                step_size=0.1, show_progress=False)
        # The model has a single covariate, so the first p-value is its own
        gene_p_value.append(cph.summary['p'].values[0])

    gene_p_value = np.asarray(gene_p_value)
    sort_idx = np.argsort(gene_p_value)
    f_name_sort = np.asarray(xnames)[sort_idx]
    f_score_sort = gene_p_value[sort_idx]
    return sort_idx, f_name_sort, f_score_sort
def test_proportional_hazard_test_with_weights_and_strata():
    """
    Weighted, stratified Cox fit; the ``identity`` statistic is compared
    with R's cox.zph.

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1),
      "s" = c(1, 1, 0, 0, 0)
    )

    c = coxph(formula=Surv(T, E) ~ var1 + strata(s), data=df, weights=w)
    cz = cox.zph(c, transform='identity')
    """
    observations = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "w": [1, 0.5, 2, 1, 1],
        "s": [1, 1, 0, 0, 0],
    }
    # Every subject has an observed event
    df = pd.DataFrame(observations).assign(E=True)

    fitter = CoxPHFitter()
    fitter.fit(df, "T", "E", weights_col="w", strata="s", robust=True)

    results = stats.proportional_hazard_test(
        fitter, df, time_transform="identity")
    statistic = results.summary.loc["var1"]["test_statistic"]
    npt.assert_allclose(statistic, 0.0283, rtol=1e-3)
def test_coxph_plot_covariate_groups_with_multiple_variables(self, block):
    """Plot covariate groups for (age, prio) pairs on the Rossi data."""
    rossi = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi, "week", "arrest")

    covariates = ["age", "prio"]
    value_rows = [[10, 0], [50, 10], [80, 90]]  # one [age, prio] per curve
    fitter.plot_covariate_groups(covariates, value_rows)

    self.plt.title("test_coxph_plot_covariate_groups_with_multiple_variables")
    self.plt.show(block=block)
def test_proportional_hazard_test_with_kmf_with_some_censorship():
    """
    Cox fit with one censored subject; the default-transform statistic is
    compared with R's cox.zph(transform='km').

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 0, 1)
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df)
    cox.zph(c, transform='km')
    """
    observations = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "E": [1, 1, 1, 0, 1],
    }
    df = pd.DataFrame(observations)

    fitter = CoxPHFitter()
    fitter.fit(df, "T", "E")

    results = stats.proportional_hazard_test(fitter, df)
    statistic = results.summary.loc["var1"]["test_statistic"]
    npt.assert_allclose(statistic, 1.013802, rtol=1e-3)
def test_proportional_hazard_test_with_kmf_with_some_censorship_and_weights():
    """
    Weighted Cox fit with one censored subject; the default-transform
    statistic is compared with R's cox.zph(transform='km').

    Reference R session:

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 0, 1),
      "w" = c(1, 0.5, 2, 1, 1),
    )

    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='km')
    """
    # NOTE(review): the R session above uses w = (1, 0.5, 2, 1, 1) while the
    # frame below uses a third weight of 5 -- confirm which one produced the
    # expected value.
    observations = {
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "E": [1, 1, 1, 0, 1],
        "w": [1, 0.5, 5, 1, 1],
    }
    df = pd.DataFrame(observations)

    fitter = CoxPHFitter()
    # Fitting with fractional weights must emit the non-integer warning
    with pytest.warns(StatisticalWarning, match="weights are not integers"):
        fitter.fit(df, "T", "E", weights_col="w")
        results = stats.proportional_hazard_test(fitter, df)
        statistic = results.summary.loc["var1"]["test_statistic"]
        npt.assert_allclose(statistic, 0.916, rtol=1e-2)
def getHazardRatio(df_col, os, event, genename, value, binary=False,
                   age=None, return_sign=False):
    """Hazard ratio of ``df_col`` from a Cox model on ``os`` / ``event``.

    The model is fitted on a frame with the columns ``Gene`` (``df_col``),
    ``Duration`` (``os``), ``Flag`` (``event``) and, when ``age`` is given,
    ``Age``.  With ``binary=True`` a hazard ratio below 1 is inverted and
    ``value`` is set to 1.

    Returns:
        * ``(genename, value, nan, df_col.sum())`` when fitting raises
          ``ValueError``;
        * ``(genename, value, hazard_ratio, df_col.sum())`` when
          ``return_sign`` is true;
        * otherwise the hazard ratio as the array produced by ``np.exp``.
    """
    columns = {'Gene': df_col, 'Duration': os, 'Flag': event}
    os_data = pd.DataFrame(columns)
    if age is not None:
        os_data['Age'] = age

    cph = CoxPHFitter()
    try:
        cph.fit(os_data, 'Duration', 'Flag', show_progress=False)
    except ValueError:
        print('Not working, returning nans')
        return genename, value, np.nan, df_col.sum()

    hazard_ratio = np.exp(cph.hazards_['Gene'].values)
    if binary and hazard_ratio < 1:
        # Report the ratio in the >= 1 direction and flag the inversion
        hazard_ratio = 1/hazard_ratio
        value = 1

    if not return_sign:
        return hazard_ratio
    return genename, value, hazard_ratio[0], df_col.sum()
def survival(row, phenotype_df, duration_col = 'T', event_col = 'E', other_cols = []):
    """
    Cox regression of survival on one row of values plus optional covariates.

    row: Series of values; its name becomes the covariate name.  The name is
        rewritten in place so that it is usable in a patsy formula.
    phenotype_df: frame that is transposed before use and must then contain
        duration_col, event_col and other_cols
    duration_col: survival time
    event_col: whether an event (death or other) has occurred or not.
        0 for no, 1 for yes
    other_cols: other variables to consider in the regression

    Returns the row of the Cox model summary that belongs to ``row``.
    """
    # Spaces, dots and dashes conflict with patsy formula syntax
    patsy_safe = str.maketrans(' .-', '___')

    def _clean(label):
        return label.translate(patsy_safe)

    phenotype_df = phenotype_df.T.join(row.astype(float))
    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    duration_col = _clean(duration_col)
    event_col = _clean(event_col)
    other_cols = [_clean(col) for col in other_cols]
    row.name = _clean(row.name)
    phenotype_df.columns = [_clean(col) for col in phenotype_df.columns]

    # Duration and event are listed too so they pass through the design matrix
    formula = ' + '.join([row.name, duration_col, event_col] + other_cols)
    X = patsy.dmatrix(formula_like = formula, data = phenotype_df,
                      return_type = 'dataframe')
    X = X.drop(['Intercept'], axis = 1)

    cph = CoxPHFitter()
    cph.fit(X, duration_col = duration_col, event_col = event_col)
    return cph.summary.loc[row.name]
def test_coxph_plot_partial_effects_on_outcome_with_multiple_variables(self, block):
    """Plot partial effects for (age, prio) pairs on the Rossi data."""
    rossi = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi, "week", "arrest")

    covariates = ["age", "prio"]
    value_rows = [[10, 0], [50, 10], [80, 90]]  # one [age, prio] per curve
    fitter.plot_partial_effects_on_outcome(covariates, value_rows)

    self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_multiple_variables")
    self.plt.show(block=block)
def test_coxph_plot_partial_effects_on_outcome_with_cumulative_hazard(self, block):
    """Plot partial effects of age as cumulative hazards on the Rossi data."""
    rossi = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi, "week", "arrest")

    ages = [10, 50, 80]
    fitter.plot_partial_effects_on_outcome("age", ages, y="cumulative_hazard")

    self.plt.title("test_coxph_plot_partial_effects_on_outcome")
    self.plt.show(block=block)
def test_coxph_plotting_with_subset_of_columns(self, block):
    """Plot only the var1 and var2 coefficients of a fitted Cox model."""
    regression_data = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(regression_data, "T", "E")

    shown_columns = ["var1", "var2"]
    fitter.plot(columns=shown_columns)

    self.plt.title("test_coxph_plotting_with_subset_of_columns")
    self.plt.show(block=block)
def test_coxph_plot_covariate_groups_with_single_strata(self, block):
    """Plot covariate groups of age for a model stratified on ``paro``."""
    rossi = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi, "week", "arrest", strata="paro")

    ages = [10, 50, 80]
    fitter.plot_covariate_groups("age", ages)

    self.plt.title("test_coxph_plot_covariate_groups_with_strata")
    self.plt.show(block=block)
def test_coxph_plotting(self, block):
    """Plot the coefficients of a Cox model fitted on the regression data."""
    regression_data = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(regression_data, "T", "E")
    fitter.plot()

    self.plt.title("test_coxph_plotting")
    self.plt.show(block=block)
def test_coxph_plotting_with_hazards_ratios(self, block):
    """Plot a fitted Cox model with ``hazard_ratios=True``."""
    regression_data = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(regression_data, "T", "E")
    fitter.plot(hazard_ratios=True)

    self.plt.title("test_coxph_plotting")
    self.plt.show(block=block)
def test_coxph_plot_covariate_groups(self, block):
    """Plot covariate groups of age for a Cox model on the Rossi data."""
    rossi = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi, "week", "arrest")

    ages = [10, 50, 80]
    fitter.plot_covariate_groups("age", ages)

    self.plt.title("test_coxph_plot_covariate_groups")
    self.plt.show(block=block)
def main():
    """Fit a Cox model on the full pre-processed data set and print it.

    The covariates come from the ``x`` array of the ``.npz`` file named by
    the hyperparameters, with ``time`` / ``event`` as duration and event
    columns.  One extra indicator column is added for each of the first ten
    ``TYPE == 1`` and first ten ``TYPE == 0`` rows of the index-code table:
    the row-wise maximum of ``data['codes'] == INDEX_CODE``.
    """
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    data = np.load('../' + hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz')

    print('Use all data for model fitting...')
    x, time, event = data['x'], data['time'], data['event']
    cols_list = load_obj('../' + hp.data_pp_dir + 'cols_list.pkl')

    df = pd.DataFrame(x, columns=cols_list)
    df['TIME'] = time
    df['EVENT'] = event

    ###################################################################

    print('Add additional columns...')
    all_codes = feather.read_dataframe(
        '../' + hp.results_dir + 'hr_addcodes_' + hp.gender + '.feather')
    # First ten rows of TYPE 1, followed by the first ten rows of TYPE 0
    first_ten_per_type = [
        all_codes[all_codes['TYPE'] == code_type].head(10)
        for code_type in (1, 0)
    ]
    df_index_code = pd.concat(first_ten_per_type, sort=False)

    for _, code_row in df_index_code.iterrows():
        description = code_row['DESCRIPTION']
        print(description)
        code_present = (data['codes'] == code_row['INDEX_CODE']).max(axis=1)
        df[description] = code_present
        cols_list = cols_list + [description]

    ###################################################################

    print('Fitting...')
    cph = CoxPHFitter()
    cph.fit(df, duration_col='TIME', event_col='EVENT', show_progress=True,
            step_size=0.5)
    cph.print_summary()
    print('done')
def _fit_cox(self):
    """
    private method to fit Cox model

    Stacks the rows of ``self.survival0.df`` and ``self.survival1.df`` into
    one frame with a 0/1 indicator column named ``self.survival1.label``
    (0 for ``survival0`` rows, 1 for ``survival1`` rows) and fits a
    ``CoxPHFitter`` on it.  The second frame's time and event columns are
    renamed to the first frame's names when they differ.  The fitted model
    is cached in ``self._cf``; the method returns immediately when a model
    is already cached.
    """
    if self._cf is not None:
        return

    cox_df1 = pd.DataFrame(self.survival0.df,
                           columns=[self.time_col1, self.event_col1])
    cox_df1[self.survival1.label] = 0

    cox_df2 = pd.DataFrame(self.survival1.df,
                           columns=[self.time_col2, self.event_col2])
    if self.time_col1 != self.time_col2:
        cox_df2 = cox_df2.rename(columns={self.time_col2: self.time_col1})
    if self.event_col1 != self.event_col2:
        cox_df2 = cox_df2.rename(
            columns={self.event_col2: self.event_col1})
    cox_df2[self.survival1.label] = 1

    # DataFrame.append was removed in pandas 2.0; concat is its equivalent
    cox_df = pd.concat([cox_df1, cox_df2], ignore_index=True)

    cox_fitted = CoxPHFitter(normalize=False)
    cox_fitted.fit(cox_df, self.time_col1, event_col=self.event_col1,
                   include_likelihood=False)
    self._cf = cox_fitted
def test_coxph_plot_partial_effects_on_outcome_with_single_strata(self, block):
    """Plot partial effects of age for a model stratified on ``paro``."""
    rossi = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi, "week", "arrest", strata="paro")

    ages = [10, 50, 80]
    fitter.plot_partial_effects_on_outcome("age", ages)

    self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_strata")
    self.plt.show(block=block)
def coxreg_single_run(xtr, ytr, penalty):
    """Fit a penalised Cox model and return the fitted ``CoxPHFitter``.

    ``ytr`` supplies the first two columns of the training frame, named
    ``status`` (event) and ``time`` (duration); the columns of ``xtr``
    follow as ``X1`` ... ``Xn``.  ``penalty`` is passed as ``penalizer``.
    """
    stacked = np.concatenate((ytr, xtr), axis=1)
    df_tr = pd.DataFrame(stacked)

    n_features = xtr.shape[1]
    covariate_names = [f'X{k}' for k in range(1, n_features + 1)]
    df_tr.columns = ['status', 'time'] + covariate_names

    cph = CoxPHFitter(penalizer=penalty)
    cph.fit(df_tr, duration_col='time', event_col='status')
    return cph
def test_spline_coxph_plot_partial_effects_on_outcome_with_strata(self, block):
    """Plot partial effects of age for a spline-baseline model stratified on ``wexp``."""
    rossi = load_rossi()
    fitter = CoxPHFitter(baseline_estimation_method="spline",
                         n_baseline_knots=2)
    fitter.fit(rossi, "week", "arrest", strata=["wexp"])

    ages = [10, 50, 80]
    fitter.plot_partial_effects_on_outcome("age", ages)

    self.plt.title("test_spline_coxph_plot_partial_effects_on_outcome_with_strata")
    self.plt.show(block=block)
def fit_cox(subset, name, duration_col='days_survival', event_col='vital_status', *args, **kwargs):
    '''
    Use lifelines to fit a CoxPHFitter model.

    subset: DataFrame
    name: name of the analysis (accepted but not used here)
    duration_col: column of subset with number of days sample survived
    event_col: column of subset with 0/1 whether the sample is alive or dead
    *args: to be passed to CoxPHFitter
    **kwargs: to be passed to CoxPHFitter

    Returns (summary, cph): the model summary extended with the columns
    "corrected_p" (Bonferroni-corrected p-values) and "-log2(corrected_p)",
    and the fitted model.  When fitting or the correction fails, the
    exception info is printed and (None, None) is returned.
    '''
    from lifelines import CoxPHFitter
    from statsmodels.stats.multitest import multipletests

    cph = CoxPHFitter(*args, **kwargs)
    try:
        cph.fit(subset, duration_col=duration_col, event_col=event_col)
        summary = cph.summary
        p_vals = multipletests(summary["p"], method="bonferroni")[1]
        summary["corrected_p"] = p_vals
        summary["-log2(corrected_p)"] = -np.log2(p_vals)
    except Exception:
        # Best effort by design, but KeyboardInterrupt/SystemExit must still
        # propagate, hence ``Exception`` instead of a bare ``except``.
        print(*sys.exc_info())
        return None, None
    return summary, cph
def test_coxph_plot_partial_effects_on_outcome_with_nonnumeric_strata(self, block):
    """Plot partial effects of age for a model stratified on a string column."""
    rossi = load_rossi()
    # Random "A"/"B" label per row serves as the non-numeric strata
    rossi["strata"] = np.random.choice(["A", "B"], size=rossi.shape[0])

    fitter = CoxPHFitter()
    fitter.fit(rossi, "week", "arrest", strata="strata")

    ages = [10, 50, 80]
    fitter.plot_partial_effects_on_outcome("age", ages)

    self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_single_strata")
    self.plt.show(block=block)
def test_coxph_plot_partial_effects_on_outcome_with_multiple_variables_and_strata(self, block):
    """Plot partial effects for (age, prio) pairs with string-valued strata."""
    rossi = load_rossi()
    # Random "A"/"B" label per row serves as the strata
    rossi["strata"] = np.random.choice(["A", "B"], size=rossi.shape[0])

    fitter = CoxPHFitter()
    fitter.fit(rossi, "week", "arrest", strata="strata")

    covariates = ["age", "prio"]
    value_rows = [[10, 0], [50, 10], [80, 90]]  # one [age, prio] per curve
    fitter.plot_partial_effects_on_outcome(covariates, value_rows)

    self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_multiple_variables_and_strata")
    self.plt.show(block=block)
def fit(self, X, y, **fit_params):
    """Fit a ``CoxPHFitter`` on ``X`` joined with the survival columns of ``y``.

    A copy of ``X`` receives ``y[self.duration_column]`` and, when
    ``self.event_col`` is set, ``y[self.event_col]``.  The fitter is built
    from ``self.get_params()``; extra ``fit_params`` are forwarded to its
    ``fit``.  The fitted model is stored in ``self.estimator``.

    Returns ``self``.
    """
    training = X.copy()
    training[self.duration_column] = y[self.duration_column]
    if self.event_col is not None:
        training[self.event_col] = y[self.event_col]

    estimator = CoxPHFitter(**self.get_params())
    estimator.fit(
        training,
        duration_col=self.duration_column,
        event_col=self.event_col,
        initial_beta=self.initial_beta,
        include_likelihood=self.include_likelihood,
        strata=self.strata,
        **fit_params
    )
    self.estimator = estimator
    return self
def cox_regression(clean_df):
    """Fit a Cox model on ``clean_df`` and return rounded summary statistics.

    ``time`` is the duration column and ``event`` the event column.  For
    every name in ``stats_of_interest`` the matching summary column is
    converted to a dict and rounded: ``'p'`` with ``round_dic_eng``, every
    other statistic with ``round_dic``.

    Returns a dict mapping each statistic name to its rounded dict.
    """
    cf = CoxPHFitter()
    cf.fit(clean_df, 'time', event_col='event')
    per_statistic = cf.summary.to_dict()

    return {
        stat: (round_dic_eng(per_statistic[stat]) if stat == 'p'
               else round_dic(per_statistic[stat]))
        for stat in stats_of_interest
    }
def estCoxPHTE(df, treatment_col='treated', duration_col='dx', event_col='disease', covars=[]):
    """Estimates treatment efficacy using proportional hazards (Cox model).

    Efficacy is computed as ``1 - exp(coef)`` of the treatment covariate; the
    confidence bounds are transformed the same way.  A log-rank test between
    the ``treatment_col == 0`` and ``treatment_col == 1`` rows is run as well.

    Parameters
    ----------
    df : pandas.DataFrame
    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list
        List of other columns to include in Cox model as covariates.

    Returns
    -------
    pandas.Series
        Indexed by ``TE`` (efficacy estimate), ``UB`` and ``LB`` (the
        transformed ``upper-bound`` / ``lower-bound`` entries), ``pvalue``
        (first p-value of the Cox model), ``logrank_pvalue`` and ``model``
        (the fitted ``CoxPHFitter``)."""
    model_columns = [treatment_col, duration_col, event_col] + covars
    coxphf = CoxPHFitter()
    coxphf.fit(df[model_columns], duration_col=duration_col,
               event_col=event_col)

    log_hazard_ratio = coxphf.hazards_.loc['coef', treatment_col]
    te = 1 - np.exp(log_hazard_ratio)
    bounds = coxphf.confidence_intervals_[treatment_col].loc[
        ['upper-bound', 'lower-bound']]
    ci = 1 - np.exp(bounds)
    pvalue = coxphf._compute_p_values()[0]

    untreated = df[treatment_col] == 0
    treated = df[treatment_col] == 1
    durations, events = df[duration_col], df[event_col]
    results = logrank_test(durations.loc[untreated],
                           durations.loc[treated],
                           event_observed_A=events.loc[untreated],
                           event_observed_B=events.loc[treated])

    index = ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']
    values = [te, ci['upper-bound'], ci['lower-bound'], pvalue,
              results.p_value, coxphf]
    return pd.Series(values, index=index)
# Kaplan-Meier curves for patients with and without neoadjuvant treatment.
# Rows are selected with ``.loc`` (``.ix`` no longer exists in pandas).
tx = df['history_of_neoadjuvant_treatment']=='Yes'
ax = plt.subplot(111)

kmf1 = KaplanMeierFitter(alpha=0.95)
kmf1.fit(durations=df.loc[tx, survival_col],
         event_observed=df.loc[tx, censor_col], label=['Tx==Yes'])
kmf1.plot(ax=ax, show_censors=True, ci_show=False)

kmf2 = KaplanMeierFitter(alpha=0.95)
kmf2.fit(durations=df.loc[~tx, survival_col],
         event_observed=df.loc[~tx, censor_col], label=['Tx==No'])
kmf2.plot(ax=ax, show_censors=True, ci_show=False)

add_at_risk_counts(kmf1, kmf2, ax=ax)
plt.title('Acute myeloid leukemia survival analysis with Tx and without Tx')
plt.xlabel(survival_col)
plt.savefig('km.png')

# Log-rank test between the two treatment groups
results = logrank_test(df.loc[tx, survival_col], df.loc[~tx, survival_col],
                       df.loc[tx, censor_col], df.loc[~tx, censor_col],
                       alpha=.99)
results.print_summary()

# Cox model with age at diagnosis as the only covariate; rows without an
# age are dropped first.
cox = CoxPHFitter(normalize=False)
df_age = df[[survival_col, censor_col, 'age_at_initial_pathologic_diagnosis']]
df_age = df_age[pd.notnull(df_age['age_at_initial_pathologic_diagnosis'])]
cox = cox.fit(df_age, survival_col, event_col=censor_col,
              include_likelihood=True)
cox.print_summary()

# 10-fold cross-validation scores of the same model
scores = k_fold_cross_validation(cox, df_age, survival_col,
                                 event_col=censor_col, k=10)
print(scores)
print('Mean score', np.mean(scores))
print('Std', np.std(scores))
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter

# Collect the simulated observations in one frame
observations = {
    'duration': duration,
    'event': not_censor,
    'age': age,
    'college': college,
}
data = pd.DataFrame(observations)

# Observations with censoring can be drawn with
# plot_lifetimes(duration, event_observed = not_censor)

# Kaplan-Meier estimate for the simulated data
kmf = KaplanMeierFitter()
kmf.fit(duration, event_observed = not_censor)
kmf.survival_function_.plot()

# Cox proportional hazards regression
cf = CoxPHFitter()
cf.fit(data, 'duration', event_col = 'event')
cf.print_summary()

## Predictions from the model ##
# A single profile can be plotted with, for example,
#   cf.predict_survival_function(pd.DataFrame({'age':[24], 'college':[1]})).plot()
#   cf.predict_survival_function(pd.DataFrame({'age':[65], 'college':[0]})).plot()

# Three profiles to predict for: a 24-year-old college graduate, a
# 65-year-old high-school graduate and an "Average" profile; the labels
# become the row names.
profile_labels = pd.Index(
    ['24yr old College Grad', '65yr old HS Grad', 'Average'], name='index')
mixed = pd.DataFrame({'age': [24, 65, 42], 'college': [1, 0, .4]},
                     index=profile_labels)
def multivariate(df):
    """Fit a Cox model on every covariate of ``df`` and print its summary.

    ``time`` is the duration column and ``status`` the event column; fitting
    progress is shown.  Nothing is returned.
    """
    from lifelines import CoxPHFitter

    model = CoxPHFitter()
    model.fit(df, duration_col='time', event_col='status', show_progress=True)
    # The same table is available programmatically as ``model.summary``
    model.print_summary()
from lifelines import AalenAdditiveFitter, CoxPHFitter
from lifelines.datasets import load_regression_dataset

regression_dataset = load_regression_dataset()
regression_dataset.head()

# Using Cox Proportional Hazards model
cf = CoxPHFitter()
cf.fit(regression_dataset, 'T', event_col='E')
cf.print_summary()

# Using Aalen's Additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, 'T', event_col='E')

# Covariates only.  ``columns - [...]`` no longer means set difference in
# pandas and ``.ix`` is gone, so use ``Index.difference`` and ``.loc``.
x = regression_dataset[regression_dataset.columns.difference(['E', 'T'])]
# Survival functions of the subjects labelled 10 to 12 (label slices
# include both ends)
aaf.predict_survival_function(x.loc[10:12]).plot()
from lifelines import AalenAdditiveFitter, CoxPHFitter
from lifelines.datasets import generate_regression_dataset

regression_dataset = generate_regression_dataset()

# Cox proportional hazards model
cf = CoxPHFitter()
cf.fit(regression_dataset, duration_col='T', event_col='E')

# Aalen's additive model
aaf = AalenAdditiveFitter(fit_intercept=False)
aaf.fit(regression_dataset, duration_col='T', event_col='E')

# Covariates only.  ``columns - [...]`` no longer means set difference in
# pandas and ``.ix`` is gone, so use ``Index.difference`` and ``.loc``.
x = regression_dataset[regression_dataset.columns.difference(['E', 'T'])]
# Survival functions of the subjects labelled 10 to 12 (label slices
# include both ends)
aaf.predict_survival_function(x.loc[10:12]).plot()
aaf.plot()
""" # print cancer['T'].unique() # print cancer['E'].unique() # cancer = cancer.dropna() # the '-1' term # refers to not adding an intercept column (a column of all 1s). # It can be added to the Fitter class. covMatrix = cancer.cov() cf = CoxPHFitter() cf.fit(covMatrix, "T", event_col="E") # extra paramater for categorical , strata=catVar cf.print_summary() curve = cf.predict_survival_function(cancer) curve.plot() plt.show() print "hazard coeff", cf.hazards_ print "baseline ", cf.baseline_hazard_ """ scores = k_fold_cross_validation(cf, covMatrix, 'T', event_col='E', k=3) print scores print np.mean(scores) print np.std(scores) """
# Script entry point: train a Cox PH model on the 'train' split and report
# metrics on the 'valid' and 'test' splits when they are present.
if __name__ == '__main__':
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    args = parse_args()
    print("Arguments:",args)

    # Load Dataset
    print("Loading datasets: " + args.dataset)
    datasets = utils.load_datasets(args.dataset)

    # Train CPH model
    print("Training CPH Model")
    train_df = utils.format_dataset_to_df(datasets['train'], DURATION_COL, EVENT_COL)
    cf = CoxPHFitter()
    # NOTE(review): `results` is not read again in the visible part of the script
    results = cf.fit(train_df, duration_col=DURATION_COL, event_col=EVENT_COL, include_likelihood=True)
    cf.print_summary()
    print("Train Likelihood: " + str(cf._log_likelihood))

    # Evaluate on the optional splits; only the test split is bootstrapped
    if 'valid' in datasets:
        metrics = evaluate_model(cf, datasets['valid'])
        print("Valid metrics: " + str(metrics))
    if 'test' in datasets:
        metrics = evaluate_model(cf, datasets['test'], bootstrap=True)
        print("Test metrics: " + str(metrics))

    print("Saving Visualizations")
    if 'test' in datasets and args.treatment_idx is not None:
        print("Calculating treatment recommendation survival curvs")
        # We use the test dataset because these experiments don't have a viz dataset
def _plot_kmf_single(df, condition_col, survival_col, censor_col, threshold,
                     title, xlabel, ylabel, ax, with_condition_color,
                     no_condition_color, with_condition_label,
                     no_condition_label, color_map, label_map, color_palette,
                     ci_show, print_as_title):
    """
    Helper function to produce a single KM survival plot, among observations
    in df by groups defined by condition_col.

    All inputs are required - this function is intended to be called by
    `plot_kmf`.

    Groups are derived from `condition_col` in one of three ways: by
    comparing against `threshold` (a number, or "median") when a threshold
    is given; by category for object/category columns; or directly for
    boolean columns.  Any other dtype raises ValueError.

    Returns a results object carrying the extra attributes
    `survival_data_series`, `event_data_series` and `desc`: a log-rank test
    for two groups, a `NullSurvivalResults` for a single group, and
    otherwise the fitted `CoxPHFitter` (whose summary is printed).
    """
    # make color inputs consistent hex format
    if colors.is_color_like(with_condition_color):
        with_condition_color = colors.to_hex(with_condition_color)
    if colors.is_color_like(no_condition_color):
        no_condition_color = colors.to_hex(no_condition_color)

    ## prepare data to be plotted; producing 3 outputs:
    # - `condition`, series containing category labels to be plotted
    # - `label_map` (mapping condition values to plot labels)
    # - `color_map` (mapping condition values to plotted colors)
    if threshold is not None:
        is_median = threshold == "median"
        if is_median:
            threshold = df[condition_col].median()
        label_suffix = float_str(threshold)
        condition = df[condition_col] > threshold
        default_label_no_condition = "%s ≤ %s" % (condition_col, label_suffix)
        # the "(median)" suffix is appended after the no-condition label was
        # built, so only the with-condition label carries it
        if is_median:
            label_suffix += " (median)"
        default_label_with_condition = "%s > %s" % (condition_col, label_suffix)
        # caller-supplied labels/maps win over the defaults
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label, True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color, True: with_condition_color}
    elif df[condition_col].dtype == 'O' or df[condition_col].dtype.name == "category":
        condition = df[condition_col].astype("category")
        if not label_map:
            # default label "<column> = <value>" for every observed value
            label_map = dict()
            [label_map.update({condition_value: '{} = {}'.format(condition_col, condition_value)}) for condition_value in condition.unique()]
        if not color_map:
            # one palette color per label, converted to hex
            rgb_values = sb.color_palette(color_palette, len(label_map.keys()))
            hex_values = [colors.to_hex(col) for col in rgb_values]
            color_map = dict(zip(label_map.keys(), hex_values))
    elif df[condition_col].dtype == 'bool':
        condition = df[condition_col]
        default_label_with_condition = "= {}".format(condition_col)
        default_label_no_condition = "¬ {}".format(condition_col)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label, True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color, True: with_condition_color}
    else:
        # NOTE(review): `\ ` is not a recognised escape sequence, so the
        # message contains a literal backslash -- confirm this is intended
        raise ValueError('Don\'t know how to plot data of type\ {}'.format(df[condition_col].dtype))

    # produce kmf plot for each category (group) identified above
    kmf = KaplanMeierFitter()
    grp_desc = list()
    grp_survival_data = dict()
    grp_event_data = dict()
    grp_names = list(condition.unique())
    for grp_name, grp_df in df.groupby(condition):
        grp_survival = grp_df[survival_col]
        grp_event = (grp_df[censor_col].astype(bool))
        grp_label = label_map[grp_name]
        grp_color = color_map[grp_name]
        # the same fitter is refitted for every group; each fit is plotted
        # before the next one replaces it
        kmf.fit(grp_survival, grp_event, label=grp_label)
        desc_str = "# {}: {}".format(grp_label, len(grp_survival))
        grp_desc.append(desc_str)
        grp_survival_data[grp_name] = grp_survival
        grp_event_data[grp_name] = grp_event
        # reuse the axes returned by the previous plot call so that all
        # groups are drawn on the same axes
        if ax:
            ax = kmf.plot(ax=ax, show_censors=True, ci_show=ci_show, color=grp_color)
        else:
            ax = kmf.plot(show_censors=True, ci_show=ci_show, color=grp_color)

    ## format the plot
    # Set the y-axis to range 0 to 1
    ax.set_ylim(0, 1)
    # relabel the ticks as integer percentages
    y_tick_vals = ax.get_yticks()
    ax.set_yticklabels(["%d" % int(y_tick_val * 100) for y_tick_val in y_tick_vals])
    # plot title
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title(' | '.join(grp_desc))
    else:
        # no title requested: print the group descriptions instead
        [print(desc) for desc in grp_desc]
    # axis labels
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    ## summarize analytical version of results
    ## again using same groups as are plotted
    if len(grp_names) == 2:
        # use log-rank test for 2 groups
        results = logrank_test(grp_survival_data[grp_names[0]],
                               grp_survival_data[grp_names[1]],
                               event_observed_A=grp_event_data[grp_names[0]],
                               event_observed_B=grp_event_data[grp_names[1]])
    elif len(grp_names) == 1:
        # no analytical result for 1 or 0 groups
        # NOTE(review): only the 1-group case is matched here; with 0 groups
        # control falls through to the Cox branch below -- confirm intended
        results = NullSurvivalResults()
    else:
        # cox PH fitter for >2 groups
        cf = CoxPHFitter()
        # patsy builds the design matrix; the survival and censor columns
        # are part of the formula so that they are carried into it
        cox_df = patsy.dmatrix('+'.join([condition_col, survival_col, censor_col]), df, return_type='dataframe')
        del cox_df['Intercept']
        results = cf.fit(cox_df, survival_col, event_col=censor_col)
        results.print_summary()

    # add metadata to results object so caller can print them
    results.survival_data_series = grp_survival_data
    results.event_data_series = grp_event_data
    results.desc = grp_desc
    return results