def do_baseline(foldnum, train, valid, exp_code, model_str):
    """Fit a plain Cox proportional-hazards baseline on ``train`` and print
    the log-partial-hazard predictions for ``valid``.

    ``foldnum``, ``exp_code`` and ``model_str`` are accepted for interface
    compatibility but are not used inside this function.

    NOTE(review): assumes ``train.c == 0`` marks an observed event and
    nonzero marks censoring — confirm against the data loader.
    """
    fitter = CoxPHFitter()

    # Assemble the training frame: covariates plus duration/event columns.
    train_df = pd.DataFrame(train.x)
    print(train_df.shape)
    train_df['duration'] = train.y
    train_df['event'] = [int(censor_flag == 0) for censor_flag in train.c]
    # Mean-impute missing covariates before fitting.
    train_df = train_df.fillna(train_df.mean())

    fitter.fit(train_df, 'duration', event_col="event")
    fitter.print_summary()

    # Impute the validation covariates with their own column means
    # (original behaviour kept as-is) and print the predicted risks.
    holdout_df = pd.DataFrame(valid.x)
    holdout_df = holdout_df.fillna(holdout_df.mean())
    print(fitter.predict_log_partial_hazard(holdout_df))
# Fragment: cross-checks a TensorFlow Efron partial-likelihood estimator
# against R's survival package and lifelines on one synthetic data set.
# NOTE(review): `ts`, `N`, `set_size`, `censor_rate`, `rfunc`, `K` and
# `efron_estimator_tf` are defined outside this chunk — semantics assumed
# from usage; confirm against the enclosing function.

# Bootstrap-sample `set_size` event times (with replacement, so ties occur,
# which is exactly what the Efron correction is for).
ts = ts[ np.random.choice(N, set_size, replace=True) ]
# Event indicators: 1 = observed event with probability (1 - censor_rate).
es = np.random.binomial(1, (1-censor_rate), set_size)
# Create a data-frame for R:
df = pd.DataFrame({ 'time' : ts, 'status' : es, 'x1' : np.random.uniform(-1.0, 1.0, set_size)})
# Normalize:
df['x1'] = (df['x1'] - df['x1'].mean()) / df['x1'].std()
# Compute likelihood with R:
r_out = rfunc( df )
# r_out[0]: linear predictors; r_out[1][0]: log-likelihood (sign-flipped
# and rounded to 4 decimals to match the TF estimator's convention).
preds, r_lik = np.asarray(r_out[0]), np.negative(np.round(r_out[1][0],4))
tf_lik_r = K.eval( efron_estimator_tf(K.variable(ts), K.variable(es), K.variable(preds)) )
# Compute ll with Lifelines:
# step_size=0.0 freezes the optimizer at initial_beta so the likelihood is
# evaluated at a fixed coefficient (0.543) rather than at the MLE.
cp = CoxPHFitter()
cp.fit(df, 'time', 'status', initial_beta=np.ones((1,1))*0.543, step_size=0.0)
preds = cp.predict_log_partial_hazard(df.drop(['time', 'status'], axis=1)).values[:, 0]
tf_lik_lifelines = K.eval( efron_estimator_tf(K.variable(ts), K.variable(es), K.variable(preds)) )
# All four numbers below should agree (up to rounding) if the TF estimator
# is correct.
print( 'TensorFlow w/ R: ', tf_lik_r )
print( 'R-survival : ', r_lik )
print( 'TensorFlow w/ lifelines: ', tf_lik_lifelines )
print( 'Lifelines : ', np.negative(cp._log_likelihood), end='\n\n')
# done.
np.savetxt(time_elapsed_filename, np.array(elapsed).reshape(1, -1)) # --------------------------------------------------------------------- # evaluation # sorted_y_test = np.unique(y_test[:, 0]) surv_df = surv_model.predict_survival_function(X_test_std, sorted_y_test) surv = surv_df.values.T ev = EvalSurv(surv_df, y_test[:, 0], y_test[:, 1], censor_surv='km') cindex_td = ev.concordance_td('antolini') print('c-index (td):', cindex_td) linear_predictors = \ surv_model.predict_log_partial_hazard(X_test_std) cindex = concordance_index(y_test[:, 0], -linear_predictors, y_test[:, 1]) print('c-index:', cindex) time_grid = np.linspace(sorted_y_test[0], sorted_y_test[-1], 100) integrated_brier = ev.integrated_brier_score(time_grid) print('Integrated Brier score:', integrated_brier, flush=True) test_set_metrics = [cindex_td, integrated_brier] rng = np.random.RandomState(bootstrap_random_seed) bootstrap_dir = os.path.join( output_dir, 'bootstrap', '%s_%s_exp%d_test' % (survival_estimator_name, dataset, experiment_idx))
def evaluate_model(self, model, ids_train, ids_valid, ids_test, output_dir):
    """Evaluate an autoencoder's latent features as survival predictors.

    For each PCA dimensionality in ``self.pca_dims``: reduce the encoder
    features, fit a Cox model on the training cohort, predict per-slice
    log-partial-hazards for every patient, and write predictions, plots
    and model artifacts to ``output_dir``.

    Returns a dict mapping "predictions_pca_<dim>_comp" to a tuple of
    (prediction DataFrame, performance DataFrame).

    NOTE(review): assumes each patient id appears in exactly one of
    ids_train / ids_valid / ids_test — a ValueError is raised otherwise.
    """
    self._reconstruction_plots(
        model, ids_train, ids_valid, ids_test, output_dir=output_dir)

    # One DataFrame of encoder features per cohort
    # ("training"/"validation"/"test").
    cohort_dfs_encoder = self._get_encoder_features_as_df(
        model, ids_train, ids_valid, ids_test, output_dir)
    train_df = cohort_dfs_encoder["training"]

    # Now do PCA and apply normal cox models
    feat_cols = [c for c in train_df.columns if c.startswith("feat_")]
    # info_cols = [c for c in train_df.columns if c not in feat_cols]

    ret = dict()
    for pca_dim in self.pca_dims:
        # PCA is fit on the training cohort only to avoid leakage.
        pca = PCA(n_components=pca_dim)
        pca.fit(train_df[feat_cols].values)

        # apply pca transformations on all sets for further
        # dimensionality reduction
        pca_dfs = dict()
        for name, df in cohort_dfs_encoder.items():
            output_path = os.path.join(
                output_dir, "pca_{}comp_features_{}.csv".format(
                    pca_dim, name))
            pca_dfs[name] = apply_pca_transform(
                pca, df, feat_cols, output_path)

        # now we can combine the datasets to a huge one
        # containing pca features for training, validation and test patients
        pca_df_concat = pd.concat(
            list(pca_dfs.values()), axis=0, sort=False)

        # evaluate the concordance index of Cox models that use
        # PCA reduced features of the autoencoder
        print("\nPCA with {} components\n".format(pca_dim))
        drop_cols = [self.data_handler.id_col, "slice_idx", "cohort"]
        # only survival info is left
        cox_fitter = CoxPHFitter()
        try:
            cox_fitter.fit(
                pca_dfs["training"].drop(drop_cols, axis=1),
                duration_col=self.data_handler.time_col,
                event_col=self.data_handler.event_col,
                show_progress=False)
            cox_fitter.print_summary()
        except Exception as e:
            # Fit can fail (e.g. convergence); skip this pca_dim but
            # continue with the others.
            print("[W]: Fitting cox model failed! Reason: {}".format(e))
            continue

        # now create the prediction dataframe that we can then use
        # for computing ci and pvalues easily
        id_col = self.data_handler.id_col
        ids = np.unique(pca_df_concat[id_col].values)
        cohort = [None] * len(ids)
        slice_idx = [None] * len(ids)
        pred_risk_per_slice = [None] * len(ids)
        for i, pat in enumerate(ids):
            # find all slices for that patient
            if pat in ids_train:
                cohort[i] = "training"
            elif pat in ids_valid:
                cohort[i] = "validation"
            elif ids_test is not None and pat in ids_test:
                cohort[i] = "test"
            else:
                msg = "Patient {} could not be assigned to a cohort!".format(
                    pat)
                raise ValueError(msg)

            # All rows (image slices) belonging to this patient.
            pat_df = pca_df_concat[pca_df_concat[id_col] == pat]
            haz = cox_fitter.predict_log_partial_hazard(
                pat_df.drop(drop_cols, axis=1))
            hazard = haz.values.flatten()
            slice_idx[i] = pat_df.slice_idx.values.tolist()
            pred_risk_per_slice[i] = hazard

        # Per-patient predictions: mean over slices is the patient-level
        # risk; the variance captures slice-to-slice disagreement.
        pred_df = pd.DataFrame({
            id_col: ids,
            'cohort': cohort,
            'slice_idx': slice_idx,
            'pred_per_slice': pred_risk_per_slice,
            'pred_per_pat(mean)': [
                np.mean(slice_preds)
                for slice_preds in pred_risk_per_slice],
            'pred_variance': [
                np.var(slice_preds)
                for slice_preds in pred_risk_per_slice]
        })

        cis = compute_cis(
            pred_df, self.data_handler.outcome_dict, id_col=id_col)
        pvals = compute_pvals(
            pred_df, self.data_handler.outcome_dict, id_col=id_col)

        # One-row summary of this pca_dim's concordance indices and
        # p-values per cohort, at slice and patient level.
        performance_df = pd.DataFrame({
            'pca_dim': [pca_dim],
            'pca_explained_variance': [
                pca.explained_variance_ratio_.tolist()],
            'train_ci_slice': [cis['train_ci_slice']],
            'p_val_train_slice': [pvals['train_p_slice']],
            'train_ci_pat': [cis['train_ci_pat']],
            'p_val_train_pat': [pvals['train_p_pat']],
            'valid_ci_slice': [cis['valid_ci_slice']],
            'p_val_valid_slice': [pvals['valid_p_slice']],
            'valid_ci_pat': [cis['valid_ci_pat']],
            'p_val_valid_pat': [pvals['valid_p_pat']],
            'test_ci_slice': [cis['test_ci_slice']],
            'p_val_test_slice': [pvals['test_p_slice']],
            'test_ci_pat': [cis['test_ci_pat']],
            'p_val_test_pat': [pvals['test_p_pat']]})

        subexp_name = "predictions_pca_"+str(pca_dim)+"_comp"
        ret[subexp_name] = (pred_df, performance_df)
        subexp_path = os.path.join(output_dir, subexp_name)
        os.makedirs(subexp_path, exist_ok=True)

        # kaplan meier and risk_vs_survival plots!
        plot_km_and_scatter(
            pred_df, self.data_handler.outcome_dict,
            output_dir=subexp_path, id_col=id_col)

        # save the transformation matrix V and the training mean
        # such that pca_train = (enc_train-mean(enc_train)) * V.T
        # and we can later work with those models
        dump(pca, os.path.join(
            subexp_path, "PCA_" + str(pca_dim) + "comp.joblib"))

        cox_fitter.summary.to_csv(
            os.path.join(
                subexp_path,
                "cox_{}_pca-comp_summary.csv".format(
                    pca_dim)),
            index=False)
        cox_fitter.params_.to_csv(
            os.path.join(
                subexp_path,
                "cox_{}_pca-comp_coefs.csv".format(
                    pca_dim)),
            index=False)

    # we return a tuple of prediction_df, performance_df
    # for each run
    # of the PCA with different dimensionality
    return ret