def evaluate_HR(survival_dict, scores, pids, patient_labels, hr_days=90, mode="continuous"):
    """Fit a univariate Cox model on the risk score and return its hazard ratio.

    Args:
        survival_dict: Mapping keyed by ``int(pid)`` giving each patient's
            survival time. It is compared against ``hr_days``, so it is
            presumably expressed in days -- confirm against the caller.
        scores: Iterable of risk scores, aligned with ``pids``.
        pids: Iterable of patient identifiers; each is looked up as ``int(pid)``.
        patient_labels: Iterable of event indicators aligned with ``pids``
            (``0`` is treated as censored).
        hr_days: Follow-up horizon. Patients surviving longer than this are
            treated as censored. Previously this argument was silently
            replaced by a hard-coded 90.
        mode: Unused; kept so existing callers keep working.

    Returns:
        A one-element list containing ``exp(coef)`` of the ``risk`` covariate.
    """
    patient_df = pd.DataFrame([
        {
            'risk': risk_score,
            'death': outcome,
            'days_survived': survival_dict[int(pid)],
        }
        for risk_score, pid, outcome in zip(scores, pids, patient_labels)
    ])

    # Censor everyone who outlived the horizon.
    patient_df.loc[patient_df['days_survived'] > hr_days, 'death'] = 0
    # Censored patients get one common time past the horizon. At the default
    # hr_days=90 this equals the historical constant of 100.
    censor_time = hr_days + 10
    patient_df.loc[patient_df['death'] == 0, 'days_survived'] = censor_time

    # Only the risk score may act as a covariate: every column other than the
    # duration/event columns is used by the fit, so the patient id must not
    # be part of the frame handed to it.
    cph = CoxPHFitter()
    m = cph.fit(patient_df[['risk', 'death', 'days_survived']],
                duration_col='days_survived',
                event_col='death')
    return [np.exp(m.hazards_['risk'][0])]
def test_proportional_hazard_test_with_weights_and_strata():
    """
    Compare the identity-transform PH test statistic against R's cox.zph.

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1),
      "s" = c(1, 1, 0, 0, 0)
    )
    c = coxph(formula=Surv(T, E) ~ var1 + strata(s), data=df, weights=w)
    cz = cox.zph(c, transform='identity')
    """
    covariate = [0.209325, 0.693919, 0.443804, 0.065636, 0.386294]
    durations = [5.269797, 6.601666, 7.335846, 11.684092, 12.678458]
    frame = pd.DataFrame({
        "var1": covariate,
        "T": durations,
        "w": [1, 0.5, 2, 1, 1],
        "s": [1, 1, 0, 0, 0],
    }).assign(E=True)  # every subject has an observed event

    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E", weights_col="w", strata="s", robust=True)

    ph_test = stats.proportional_hazard_test(fitter, frame, time_transform="identity")
    fitter.print_summary()

    observed = ph_test.summary.loc["var1"]["test_statistic"]
    npt.assert_allclose(observed, 0.0283, rtol=1e-3)
def test_coxph_plotting(self, block):
    """Smoke test: the default plot of a fitted CoxPHFitter renders."""
    regression_data = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(regression_data, duration_col="T", event_col="E")
    fitter.plot()

    pyplot = self.plt
    pyplot.title("test_coxph_plotting")
    pyplot.show(block=block)
def _compute_likelihood_ratio_test(self):
    """
    Likelihood ratio test of the fitted model against the model with no
    covariates.

    The null log-likelihood is obtained from CoxPHFitter's
    ``_trivial_log_likelihood_single`` helper, fed with the last
    (event, stop, weight) record of each index group, ordered by stop time.

    Returns a tuple ``(test_stat, degrees_freedom, -log2(p_value))``.
    """
    last_records = self.start_stop_and_events.groupby(level=0).last()
    last_weights = self.weights.groupby(level=0).last()
    null_data = last_records[["event", "stop"]].join(last_weights).sort_values("stop")

    stops = null_data["stop"].values
    events = null_data["event"].values
    obs_weights = null_data["__weights"].values
    ll_null = CoxPHFitter()._trivial_log_likelihood_single(stops, events, obs_weights)
    ll_alt = self._log_likelihood

    test_stat = 2 * (ll_alt - ll_null)
    degrees_freedom = self.hazards_.shape[0]
    p_value = chisq_test(test_stat, degrees_freedom=degrees_freedom)

    # log2 of a zero or NaN p-value should not emit floating point warnings.
    with np.errstate(invalid="ignore", divide="ignore"):
        neg_log2_p = -np.log2(p_value)
    return test_stat, degrees_freedom, neg_log2_p
def _fit_cox(self):
    """
    Private method to fit the Cox model comparing the two survival groups.

    Stacks the time/event columns of ``self.survival0`` and
    ``self.survival1`` into a single frame, with an indicator column named
    ``self.survival1.label`` (0 for the first group, 1 for the second), fits
    a ``CoxPHFitter`` on it and caches the result in ``self._cf``. Does
    nothing when a model is already cached.
    """
    if self._cf is not None:
        return

    group_col = self.survival1.label

    cox_df1 = pd.DataFrame(self.survival0.df,
                           columns=[self.time_col1, self.event_col1])
    cox_df1[group_col] = 0

    cox_df2 = pd.DataFrame(self.survival1.df,
                           columns=[self.time_col2, self.event_col2])
    # Align the second group's column names with the first group's.
    if self.time_col1 != self.time_col2:
        cox_df2 = cox_df2.rename(columns={self.time_col2: self.time_col1})
    if self.event_col1 != self.event_col2:
        cox_df2 = cox_df2.rename(columns={self.event_col2: self.event_col1})
    cox_df2[group_col] = 1

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on both old and new pandas.
    cox_df = pd.concat([cox_df1, cox_df2], ignore_index=True)

    cox_fitted = CoxPHFitter(normalize=False)
    cox_fitted.fit(cox_df,
                   self.time_col1,
                   event_col=self.event_col1,
                   include_likelihood=False)
    self._cf = cox_fitted
def test_coxph_plot_covariate_groups_with_single_strata(self, block):
    """Smoke test: covariate-group plot for a model stratified on ``paro``."""
    df = load_rossi()
    cp = CoxPHFitter()
    cp.fit(df, "week", "arrest", strata="paro")
    cp.plot_covariate_groups("age", [10, 50, 80])
    # The title identifies the figure during manual review, so it must match
    # this test's name (it previously read "..._with_strata").
    self.plt.title("test_coxph_plot_covariate_groups_with_single_strata")
    self.plt.show(block=block)
def test_proportional_hazard_test_with_weights():
    """
    Compare the rank-transform PH test statistic of a weighted fit against R.

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1)
    )
    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='rank')
    """
    frame = pd.DataFrame({
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "w": [1, 0.5, 2, 1, 1],
    }).assign(E=True)  # every subject has an observed event

    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E", weights_col="w")

    transforms = ["km", "rank", "log", "identity"]
    ph_test = stats.proportional_hazard_test(fitter, frame, time_transform=transforms)
    ph_test.print_summary(5)

    rank_statistic = ph_test.summary.loc["var1", "rank"]["test_statistic"]
    npt.assert_allclose(rank_statistic, 0.108, rtol=1e-2)
def main():
    """Fit a Cox PH model on the preprocessed arrays plus added-code indicators.

    Reads the ``x``, ``time``, ``event`` and ``codes`` arrays from the
    gender-specific ``.npz`` archive, builds a DataFrame from ``x`` using the
    pickled column list, adds one indicator column per selected index code
    (the first 10 rows with ``TYPE == 1`` and the first 10 with
    ``TYPE == 0`` of the ``hr_addcodes`` feather file), then fits the model
    and prints its summary.
    """
    # Load data
    print('Load data...')
    hp = Hyperparameters()
    # np.load returns a lazily-read NpzFile that keeps the archive open and
    # re-reads an array on every key access. Pull each array exactly once
    # and close the file.
    with np.load('../' + hp.data_pp_dir + 'data_arrays_' + hp.gender + '.npz') as data:
        print('Use all data for model fitting...')
        x = data['x']
        time = data['time']
        event = data['event']
        codes = data['codes']

    cols_list = load_obj('../' + hp.data_pp_dir + 'cols_list.pkl')
    df = pd.DataFrame(x, columns=cols_list)
    df['TIME'] = time
    df['EVENT'] = event

    print('Add additional columns...')
    df_index_code = feather.read_dataframe(
        '../' + hp.results_dir + 'hr_addcodes_' + hp.gender + '.feather')
    df_index_code = pd.concat(
        [df_index_code[df_index_code['TYPE'] == 1].head(10),
         df_index_code[df_index_code['TYPE'] == 0].head(10)],
        sort=False)
    for _, row in df_index_code.iterrows():
        print(row['DESCRIPTION'])
        # True when the index code appears anywhere in that row of `codes`.
        df[row['DESCRIPTION']] = (codes == row['INDEX_CODE']).max(axis=1)

    print('Fitting...')
    cph = CoxPHFitter()
    cph.fit(df, duration_col='TIME', event_col='EVENT', show_progress=True, step_size=0.5)
    cph.print_summary()
    print('done')
def getHazardRatio(df_col, os, event, genename, value, binary=False, age=None, return_sign=False):
    """Fit a Cox model with ``df_col`` as the ``Gene`` covariate and report its hazard ratio.

    Args:
        df_col: Values of the covariate of interest (stored as column ``Gene``);
            must support ``.sum()``.
        os: Durations (stored as column ``Duration``).
        event: Event indicators (stored as column ``Flag``).
        genename: Label passed through unchanged into the tuple result.
        value: Passed through into the tuple result; set to 1 when ``binary``
            is true and the hazard ratio was inverted.
        binary: When true, a hazard ratio below 1 is replaced by its reciprocal.
        age: Optional extra covariate added as column ``Age``.
        return_sign: Selects the return shape (see below).

    Returns:
        * If the fit raises ``ValueError``:
          ``(genename, value, nan, df_col.sum())`` regardless of ``return_sign``.
        * If ``return_sign`` is true:
          ``(genename, value, hazard_ratio[0], df_col.sum())``.
        * Otherwise: the hazard ratio as an array.

    NOTE(review): callers using ``return_sign=False`` receive an array on
    success but a 4-tuple on failure -- confirm this is intended.
    """
    cph = CoxPHFitter()
    os_data = pd.DataFrame({'Gene': df_col, 'Duration': os, 'Flag': event})
    if age is not None:
        os_data['Age'] = age
    try:
        cph.fit(os_data, 'Duration', 'Flag', show_progress=False)
    except ValueError:
        # Fit failed; report NaN as the hazard ratio.
        print('Not working, returning nans')
        return genename, value, np.nan, df_col.sum()
    hazard_ratio = np.exp(cph.hazards_['Gene'].values)
    if binary:
        # Report the ratio in the >= 1 direction and record that it was flipped.
        # NOTE(review): nesting of `value = 1` was reconstructed from a
        # whitespace-flattened source -- confirm it belongs inside this branch.
        if hazard_ratio < 1:
            hazard_ratio = 1/hazard_ratio
            value = 1
    if return_sign:
        return genename, value, hazard_ratio[0], df_col.sum()
    else:
        return hazard_ratio
def f(train,threshold,test):
    """Select genes by score, project them onto one principal component and
    print a Cox regression of the test set on that component.

    Genes whose score from ``h(train)`` exceeds 1 are kept (columns named
    ``V<index>``). A scaler and a 1-component PCA are fitted on the training
    columns and applied to the test columns; the resulting component is
    stored as column ``w`` of a copy of ``test`` and used as the single
    covariate. ``threshold`` is not used by this function.

    Returns whatever ``CoxPHFitter.print_summary()`` returns.
    """
    score_table = pd.DataFrame(h(train), index=np.array(range(1,21149)))
    passing = score_table.iloc[:,0]>1
    candidate_genes=['V{0}'.format(gene_id) for gene_id in score_table.index[passing].tolist()]

    # Standardise using statistics of the training set only.
    scaler = preprocessing.StandardScaler()
    train_scaled = scaler.fit_transform(train.loc[:,candidate_genes])
    test_scaled = scaler.transform(test.loc[:,candidate_genes])

    # Fit the projection on the training set, apply it to the test set.
    projector = sklearnPCA(n_components=1)
    projector.fit(train_scaled)
    test_component = projector.transform(test_scaled)

    # `assign` returns a new frame, so the caller's `test` is left untouched.
    scored = test.assign(w=pd.Series(np.ones(len(test.patient_id))))
    scored['w'] = test_component
    survival_frame = scored[['event_free_survival_time_days','death','w']]

    # Cox proportional hazards regression on the PCA score.
    model = CoxPHFitter()
    model.fit(survival_frame,'event_free_survival_time_days',event_col='death')
    return model.print_summary()
def _compute_likelihood_ratio_test(self):
    """
    Likelihood ratio test of the fitted model against the model with no
    covariates.

    The null model is obtained by fitting a fresh CoxPHFitter on the last
    (event, stop, weight) record of each index group, with no covariate
    columns present.

    Returns a tuple ``(test_stat, degrees_freedom, log(p_value))``.
    """
    last_records = self.start_stop_and_events.groupby(level=0).last()
    last_weights = self.weights.groupby(level=0).last()[["__weights"]]
    null_data = last_records[["event", "stop"]].join(last_weights)

    null_model = CoxPHFitter()
    null_model.fit(null_data,
                   "stop",
                   "event",
                   weights_col="__weights",
                   show_progress=False)

    ll_null = null_model._log_likelihood
    ll_alt = self._log_likelihood
    test_stat = 2 * ll_alt - 2 * ll_null
    degrees_freedom = self.hazards_.shape[1]

    _, p_value = chisq_test(test_stat,
                            degrees_freedom=degrees_freedom,
                            alpha=0.0)
    log_p = np.log(p_value)
    return test_stat, degrees_freedom, log_p
def comp_cph(endpoint, sex, df_events, df_info):
    """Build the case/control survival table for an endpoint and fit a Cox PH model.

    Cases are the rows of ``df_events`` whose ``ENDPOINT`` equals ``endpoint``.
    The cohort is every individual of the given ``sex`` in ``df_info``.
    Duration is ``ENDPOINT_AGE`` for cases and ``FU_END_AGE`` otherwise.

    Returns:
        ``(dfcox, cph)``: the two-column (``outcome``, ``duration``) frame
        and the fitted ``CoxPHFitter``.

    Raises:
        NotEnoughCases: if there are fewer than ``MIN_CASES`` case rows.
    """
    logger.info(f"{endpoint} - {sex} - Computing cumulative incidence")
    logger.debug(f"{endpoint} - {sex} - Assigning cases and controls")

    # Cases: one row per event of the requested endpoint.
    is_endpoint = df_events.ENDPOINT == endpoint
    df_cases = df_events.loc[is_endpoint, ["FINNGENID", "ENDPOINT_AGE"]]
    if len(df_cases) < MIN_CASES:
        raise NotEnoughCases(f"Not enough cases (< {MIN_CASES}).")

    # Whole cohort of that sex (this also handles sex-specific endpoints);
    # after the left merge, ENDPOINT_AGE is NaN for controls.
    same_sex = df_info.SEX == sex
    cohort = df_info.loc[same_sex, ["FINNGENID", "FU_END_AGE"]]
    cohort = cohort.merge(df_cases, how="left", on="FINNGENID")

    cohort["outcome"] = cohort.ENDPOINT_AGE.notna()
    cohort["duration"] = cohort.FU_END_AGE
    case_rows = cohort.outcome
    cohort.loc[case_rows, "duration"] = cohort.loc[case_rows, "ENDPOINT_AGE"]

    # Keep only what the model needs so cph.fit() sees no extra covariates.
    dfcox = cohort.loc[:, ["outcome", "duration"]]

    logger.debug(f"{endpoint} - Fitting Cox model")
    cph = CoxPHFitter()
    cph.fit(dfcox, duration_col="duration", event_col="outcome")
    return dfcox, cph
def main(data_df):
    """Binarise the thresholded columns of ``data_df`` in place, then fit and
    print a Cox model on a fixed set of covariates.

    Each column named in the module-level ``th_dict`` is replaced by 1 where
    its value is >= the corresponding threshold and 0 otherwise. The model
    uses ``Duration`` as the duration column and ``Death`` as the event column.

    NOTE(review): ``data_df`` is modified in place; callers that reuse the
    frame will see the binarised columns.
    """
    for key in th_dict.keys():
        # str.find returns the index of the match, so `> 0` is true only when
        # "HU" occurs at position 1 or later; such keys skip the NaN fill.
        # NOTE(review): a key that *starts* with "HU" (find == 0) is still
        # filled -- confirm whether `>= 0` was intended.
        if not key.find("HU") > 0:
            data_df[key] = data_df[key].fillna(0)
        # NOTE(review): nesting reconstructed from a whitespace-flattened
        # source; this line is assumed to run for every key -- confirm.
        data_df[key] = data_df[key].map(lambda input: 1 if input >= th_dict[key] else 0)
    add_DF = pd.DataFrame()
    # Sum of the two binarised indicators, giving a combined score.
    add_DF["V-HU"] = data_df['HU_of_consolidation'] + data_df[
        'Volume_of_total_pneumonia_infection']  #0,1,2
    # Duration/event columns followed by the covariates entering the model;
    # commented-out entries are covariates currently excluded.
    combinations_df = pd.concat(
        [
            data_df["Duration"],
            data_df["Death"],
            data_df["Age"],
            data_df["Blood_Oxygen"],
            data_df["C-Reactive_protein"],
            #data_df["White_blood_cell_count"] ,
            data_df["Lymphocyte_count"],
            data_df["Cerebrovascular_Disease"],
            data_df["Sex"],
            #data_df["Neutrophil_count"],
            #data_df["D-dimer"] ,
            data_df["Lactic_dehydrogenase"],
            add_DF["V-HU"],
        ],
        axis=1)
    cph = CoxPHFitter()
    cph.fit(combinations_df, "Duration", event_col="Death", step_size=0.01)
    cph.print_summary()
def coxcalc(df, x, survivaltime, status):
    """Fit a univariate Cox model of ``x`` and return the lifelines summary.

    Args:
        df: Source DataFrame; it is not modified.
        x: Name of the covariate column; converted with ``pd.to_numeric``.
        survivaltime: Name of the duration column.
        status: Name of the event indicator column.

    Returns:
        ``CoxPHFitter.summary`` of the fitted model. Rows with a missing
        value in any of the three columns are dropped before fitting.
    """
    # Take an explicit copy: assigning into the bare ``df[[...]]`` selection
    # raises pandas' SettingWithCopyWarning.
    model_df = df[[status, survivaltime, x]].copy()
    model_df[x] = pd.to_numeric(model_df[x])
    model_df = model_df.dropna()

    cph = CoxPHFitter()
    cph.fit(model_df, duration_col=survivaltime, event_col=status, show_progress=False)
    return cph.summary
def test_coxph_plotting_with_subset_of_columns(self, block):
    """Smoke test: plotting only a chosen subset of the fitted covariates."""
    regression_data = load_regression_dataset()
    fitter = CoxPHFitter()
    fitter.fit(regression_data, duration_col="T", event_col="E")

    shown_columns = ["var1", "var2"]
    fitter.plot(columns=shown_columns)

    pyplot = self.plt
    pyplot.title("test_coxph_plotting_with_subset_of_columns")
    pyplot.show(block=block)
def survival(row, phenotype_df, duration_col = 'T', event_col = 'E', other_cols = None):
    """
    Fit a Cox model for one feature row joined onto the phenotype table and
    return that feature's line of the lifelines summary.

    row: named Series holding the feature of interest; joined to
        ``phenotype_df`` on the index and cast to float. It is no longer
        modified (its ``name`` used to be overwritten in place).
    phenotype_df: DataFrame holding the duration, event and other columns
    duration_col: survival time
    event_col: whether an event (death or other) has occurred or not. 0 for no, 1 for yes
    other_cols: other variables to consider in the regression (default: none)

    Returns the row of ``CoxPHFitter.summary`` for the feature, indexed by
    the feature name with ' ', '.' and '-' replaced by '_'.
    """
    # Characters that conflict with patsy formula syntax are mapped to '_'.
    unsafe_chars = str.maketrans(' .-', '___')

    def clean(name):
        # Make a single column name safe to use inside a patsy formula.
        return name.translate(unsafe_chars)

    extra_cols = [] if other_cols is None else list(other_cols)

    # `join` returns a new frame, so the caller's phenotype_df is untouched.
    design_df = phenotype_df.join(row.astype(float))
    design_df[duration_col] = design_df[duration_col].astype(float)
    design_df[event_col] = design_df[event_col].astype(int)

    feature_col = clean(row.name)
    duration_col = clean(duration_col)
    event_col = clean(event_col)
    extra_cols = [clean(col) for col in extra_cols]
    design_df.columns = [clean(col) for col in design_df.columns]

    formula = ' + '.join([feature_col, duration_col, event_col] + extra_cols)
    X = patsy.dmatrix(formula_like = formula, data = design_df, return_type = 'dataframe')
    X = X.drop(['Intercept'], axis = 1)

    cph = CoxPHFitter()
    cph.fit(X, duration_col = duration_col, event_col = event_col)
    return cph.summary.loc[feature_col]
def test_coxph_plot_covariate_groups(self, block):
    """Smoke test: covariate-group plot for ``age`` on the rossi dataset."""
    rossi = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi, duration_col="week", event_col="arrest")

    age_values = [10, 50, 80]
    fitter.plot_covariate_groups("age", age_values)

    pyplot = self.plt
    pyplot.title("test_coxph_plot_covariate_groups")
    pyplot.show(block=block)
def test_coxph_plot_covariate_groups_with_multiple_variables(self, block):
    """Smoke test: covariate-group plot varying ``age`` and ``prio`` together."""
    rossi = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi, duration_col="week", event_col="arrest")

    covariates = ["age", "prio"]
    scenarios = [[10, 0], [50, 10], [80, 90]]  # one [age, prio] pair per curve
    fitter.plot_covariate_groups(covariates, scenarios)

    pyplot = self.plt
    pyplot.title("test_coxph_plot_covariate_groups_with_multiple_variables")
    pyplot.show(block=block)
def test_cross_validator_returns_k_results():
    """k-fold cross validation yields exactly one result per fold."""
    cf = CoxPHFitter()
    # The same fitter instance is reused across both runs, as before.
    for n_folds in (3, 5):
        fold_scores = utils.k_fold_cross_validation(
            cf,
            load_regression_dataset(),
            duration_col="T",
            event_col="E",
            k=n_folds,
        )
        assert len(fold_scores) == n_folds
def fit_cox(subset, name, duration_col='days_survival', event_col='vital_status', *args, **kwargs):
    '''
    Use lifelines to fit a CoxPHFitter model.
    Return the summary, extended with Bonferroni-corrected p-values, and the
    fitted model.

    subset: DataFrame
    name: name of the analysis (currently unused)
    duration_col: column of subset with number of days sample survived
    event_col: column of subset with 0/1 whether the sample is alive or dead
    *args: to be passed to CoxPHFitter
    **kwargs: to be passed to CoxPHFitter

    Returns (summary, cph). The summary gains the columns "corrected_p" and
    "-log2(corrected_p)". If fitting or the correction raises, the exception
    info is printed and (None, None) is returned.
    '''
    from lifelines import CoxPHFitter
    from statsmodels.stats.multitest import multipletests

    cph = CoxPHFitter(*args, **kwargs)
    try:
        cph.fit(subset, duration_col=duration_col, event_col=event_col)
        summary = cph.summary
        p_vals = multipletests(summary["p"], method="bonferroni")[1]
        summary["corrected_p"] = p_vals
        summary["-log2(corrected_p)"] = -np.log2(p_vals)
        return summary, cph
    except Exception:
        # Best-effort by design: report the failure and keep going. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        print(*sys.exc_info())
        return None, None
def test_proportional_hazard_test_with_kmf_with_some_censorship():
    """
    Compare the default-transform PH test statistic against R when one
    observation is censored.

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 0, 1)
    )
    c = coxph(formula=Surv(T, E) ~ var1 , data=df)
    cox.zph(c, transform='km')
    """
    frame = pd.DataFrame({
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "E": [1, 1, 1, 0, 1],  # fourth subject is censored
    })

    fitter = CoxPHFitter()
    fitter.fit(frame, "T", "E")

    ph_test = stats.proportional_hazard_test(fitter, frame)
    observed = ph_test.summary.loc["var1"]["test_statistic"]
    npt.assert_allclose(observed, 1.013802, rtol=1e-3)
def test_coxph_plot_partial_effects_on_outcome(self, block):
    """Smoke test: partial-effects plot for ``age`` on the rossi dataset."""
    rossi = load_rossi()
    fitter = CoxPHFitter()
    fitter.fit(rossi, duration_col="week", event_col="arrest")

    age_values = [10, 50, 80]
    fitter.plot_partial_effects_on_outcome("age", age_values)

    pyplot = self.plt
    pyplot.title("test_coxph_plot_partial_effects_on_outcome")
    pyplot.show(block=block)
def coxreg_single_run(xtr, ytr, penalty):
    """Fit one penalized Cox model on a training split.

    ``ytr`` and ``xtr`` are concatenated column-wise. The leading two columns
    are labelled ``status`` and ``time``, and the ``xtr`` columns are labelled
    ``X1`` .. ``Xp``.

    Returns the fitted ``CoxPHFitter`` (constructed with ``penalizer=penalty``).
    """
    stacked = np.concatenate((ytr, xtr), axis=1)
    feature_names = ['X' + str(position) for position in range(1, xtr.shape[1] + 1)]
    train_frame = pd.DataFrame(stacked, columns=['status', 'time'] + feature_names)

    model = CoxPHFitter(penalizer=penalty)
    model.fit(train_frame, duration_col='time', event_col='status')
    return model
def test_coxph_plot_partial_effects_on_outcome_with_strata_and_complicated_dtypes(
        self, block):
    """Smoke test: partial-effects plot on a categorical covariate of a
    stratified, formula-based fit of the Telco churn data (downloaded)."""
    # from https://github.com/CamDavidsonPilon/lifelines/blob/master/examples/Customer%20Churn.ipynb
    source_url = ("https://raw.githubusercontent.com/"
                  "treselle-systems/customer_churn_analysis/"
                  "master/WA_Fn-UseC_-Telco-Customer-Churn.csv")
    churn_data = (
        pd.read_csv(source_url)
        .set_index("customerID")
        .drop(["TotalCharges"], axis=1)
        # Collapse values such as "No internet service" into plain "No".
        .applymap(lambda x: "No" if str(x).startswith("No ") else x)
    )
    churn_data["Churn"] = churn_data["Churn"] == "Yes"

    strata_cols = ["InternetService"]
    model_formula = "gender + SeniorCitizen + Partner + Dependents + MultipleLines + OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + Contract + PaperlessBilling + PaymentMethod + MonthlyCharges"

    cph = CoxPHFitter().fit(
        churn_data,
        "tenure",
        "Churn",
        formula=model_formula,
        strata=strata_cols,
    )

    contract_levels = ["Month-to-month", "One year", "Two year"]
    cph.plot_partial_effects_on_outcome("Contract",
                                        values=contract_levels,
                                        plot_baseline=False)

    pyplot = self.plt
    pyplot.title(
        "test_coxph_plot_partial_effects_on_outcome_with_strata_and_complicated_dtypes"
    )
    pyplot.show(block=block)
def test_proportional_hazard_test_with_weights():
    """
    Compare the PH test statistic of a weighted fit against R, and check
    that non-integer weights emit a StatisticalWarning.

    library(survival)
    df <- data.frame(
      "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
      "T" = c(5.269797, 6.601666, 7.335846, 11.684092, 12.678458),
      "E" = c(1, 1, 1, 1, 1),
      "w" = c(1, 0.5, 2, 1, 1)
    )
    c = coxph(formula=Surv(T, E) ~ var1 , data=df, weights=w)
    cox.zph(c, transform='rank')
    """
    frame = pd.DataFrame({
        "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
        "T": [5.269797, 6.601666, 7.335846, 11.684092, 12.678458],
        "w": [1, 0.5, 2, 1, 1],
    }).assign(E=True)  # every subject has an observed event

    with pytest.warns(StatisticalWarning, match="weights are not integers"):
        fitter = CoxPHFitter()
        fitter.fit(frame, "T", "E", weights_col="w")

        ph_test = stats.proportional_hazard_test(fitter, frame)
        observed = ph_test.summary.loc["var1"]["test_statistic"]
        npt.assert_allclose(observed, 0.1083698, rtol=1e-3)
def test_coxph_plotting_with_hazards_ratios(self, block):
    """Smoke test: plot of a fitted CoxPHFitter with ``hazard_ratios=True``."""
    df = load_regression_dataset()
    cp = CoxPHFitter()
    cp.fit(df, "T", "E")
    cp.plot(hazard_ratios=True)
    # The title identifies the figure during manual review; it previously
    # duplicated the title used by test_coxph_plotting.
    self.plt.title("test_coxph_plotting_with_hazards_ratios")
    self.plt.show(block=block)
def DoFeatureSelectionCPH(self, x, c, s, xnames, fold, sel_f_num, dev_index):
    """Rank features by the p-value of a univariate Cox regression.

    Features whose standard deviation is not above 0.15 are discarded. For
    each remaining feature a Cox model is fitted on that single feature,
    with ``c`` as the event column (``E``) and ``s`` as the duration column
    (``S``).

    ``fold``, ``sel_f_num`` and ``dev_index`` are not used here.

    Returns:
        ``(sort_idx, f_name_sort, f_score_sort)``: the ascending-p-value
        ordering (indices into the filtered feature set), the feature names
        in that order and the p-values in that order.
    """
    variance_th = 0.15
    feature_frame = pd.DataFrame(x, columns=xnames)
    keep_mask = feature_frame.std() > variance_th  # boolean per feature
    feature_frame = feature_frame.loc[:, keep_mask]
    xnames = xnames[keep_mask]
    x = feature_frame.values

    p_values = []
    for col in tqdm(range(0, x.shape[1])):
        # One-feature design matrix followed by event and duration.
        design = np.column_stack((x[:, col:col + 1], c, s))
        column_names = xnames.copy().tolist()[col:col + 1] + ['E', 'S']
        train_frame = pd.DataFrame(design, columns=column_names)

        model = CoxPHFitter()
        model.fit(train_frame,
                  duration_col='S',
                  event_col='E',
                  step_size=0.1,
                  show_progress=False)
        p_values.append(pd.DataFrame(model.summary)['p'].values[0])

    gene_p_value = np.asarray(p_values)
    sort_idx = np.argsort(gene_p_value)
    f_name_sort = np.asarray(xnames)[sort_idx]
    f_score_sort = gene_p_value[sort_idx]
    return sort_idx, f_name_sort, f_score_sort
def test_coxph_plot_partial_effects_on_outcome_with_multiple_variables_and_strata(self, block):
    """Smoke test: partial-effects plot over ``age`` and ``prio`` for a model
    stratified on a randomly assigned two-level column."""
    rossi = load_rossi()
    rossi["strata"] = np.random.choice(["A", "B"], size=len(rossi))

    fitter = CoxPHFitter()
    fitter.fit(rossi, duration_col="week", event_col="arrest", strata="strata")

    covariates = ["age", "prio"]
    scenarios = [[10, 0], [50, 10], [80, 90]]  # one [age, prio] pair per curve
    fitter.plot_partial_effects_on_outcome(covariates, scenarios)

    pyplot = self.plt
    pyplot.title("test_coxph_plot_partial_effects_on_outcome_with_multiple_variables_and_strata")
    pyplot.show(block=block)
def test_cross_validator_with_specific_loss_function():
    """Cross validation runs with ``scoring_method="concordance_index"``.

    The test only checks that the call completes; the scores are not inspected.
    """
    fitter = CoxPHFitter()
    dataset = load_regression_dataset()
    utils.k_fold_cross_validation(
        fitter,
        dataset,
        scoring_method="concordance_index",
        duration_col="T",
        event_col="E",
    )
def test_coxph_plot_partial_effects_on_outcome_with_nonnumeric_strata(self, block):
    """Smoke test: partial-effects plot for a model stratified on a
    string-valued (non-numeric) column."""
    df = load_rossi()
    df["strata"] = np.random.choice(["A", "B"], size=df.shape[0])
    cp = CoxPHFitter()
    cp.fit(df, "week", "arrest", strata="strata")
    cp.plot_partial_effects_on_outcome("age", [10, 50, 80])
    # The title identifies the figure during manual review, so it must match
    # this test's name (it previously read "..._with_single_strata").
    self.plt.title("test_coxph_plot_partial_effects_on_outcome_with_nonnumeric_strata")
    self.plt.show(block=block)