def test_brier_times_too_large(nottingham_prognostic_index):
    """A requested time outside the test follow-up must raise ``ValueError``."""
    estimate, survival = nottingham_prognostic_index([1825])
    expected_message = "all times must be within follow-up time of test data:"

    with pytest.raises(ValueError, match=expected_message):
        brier_score(survival, survival, estimate, times=9999)
def test_brier_nottingham_many(nottingham_prognostic_index):
    """Scores at five time points match the reference values.

    The check is run once with ascending and once with descending ``times``;
    both calls must yield the same scores and the same returned time points.
    """
    times = [365, 730, 1095, 1460, 1825]
    pred, y = nottingham_prognostic_index(times)

    expected_score = numpy.array([
        0.0762922458520448,
        0.182536421174199,
        0.220017747254941,
        0.234133800146671,
        0.233822955042198,
    ])

    returned_times = []
    for requested in (times, times[::-1]):
        eval_times, score = brier_score(y, y, pred.squeeze(), times=requested)
        assert_array_almost_equal(score, expected_score)
        returned_times.append(eval_times)

    assert_array_equal(returned_times[0], returned_times[1])
def compute_Brier_scores(model, quantiles, DATA_te, DATA_tr, pretrain_state, risk=0):
    '''Compute Brier scores from the model's predicted CDFs.

    Inputs:
        model: trained model, forwarded to ``predict_cdf``
        quantiles: evaluation times; forwarded to ``predict_cdf`` and the
            first four entries are passed, one each, to ``brier_score``
        DATA_te: test data, a 3-item sequence whose 2nd and 3rd items are the
            times and the event indicators (the 1st item is not used here)
        DATA_tr: train data, same layout as DATA_te
        pretrain_state: forwarded to ``predict_cdf``
        risk: not used by this function

    Returns:
        A 4-tuple with the squeezed score array returned by ``brier_score``
        for each of the first four quantiles, in order.
    '''
    # NOTE(review): the predictions are exponentiated before scoring, so
    # predict_cdf presumably returns log-scale values -- confirm.
    raw_cdfs = predict_cdf(model, DATA_te, quantiles, pretrain_state)
    cdf_preds = [np.exp(item.numpy()) for item in raw_cdfs]

    _, t_valid, e_valid = DATA_te
    _, t_train, e_train = DATA_tr

    def _to_structured(events, durations):
        # Build an (event, time) record array; times are stored as integers.
        flags = events.astype('bool')
        records = [(flags[i], durations[i]) for i in range(len(flags))]
        return np.array(records, dtype=[('e', bool), ('t', int)])

    et1 = _to_structured(e_train, t_train)
    et2 = _to_structured(e_valid, t_valid)

    # brier_score returns a pair; only its second item (the scores) is kept.
    results = [
        brier_score(et1, et2, cdf_preds[k], quantiles[k])
        for k in range(4)
    ]
    return tuple(np.squeeze(result[1]) for result in results)
def test_brier_coxph():
    """Brier score of an Efron-ties Cox model on GBSG2 at t=1825 matches the reference."""
    horizon = 1825
    reference = 0.208817407492645

    X, y = load_gbsg2()
    # Replace the tumor-grade labels by the length of each label string.
    grade_as_int = X.loc[:, "tgrade"].map(len).astype(int)
    X.loc[:, "tgrade"] = grade_as_int
    Xt = OneHotEncoder().fit_transform(X)

    model = CoxPHSurvivalAnalysis(ties="efron").fit(Xt, y)
    surv_fns = model.predict_survival_function(Xt)
    preds = [surv_fn(horizon) for surv_fn in surv_fns]

    _, score = brier_score(y, y, preds, horizon)
    deviation = abs(score[0] - reference)
    assert round(deviation, 5) == 0
def train_model():
    """Cross-validate a Weibull Deep Survival Machines model and print metrics.

    Reads ``./new_survival_data.csv`` (column 0: time, column 1: event flag)
    and ``./new_features.csv``, assigns rows to two folds round-robin, and for
    each fold fits a model on the remaining rows and evaluates it on the
    held-out rows at the 25%, 50% and 75% quantiles of the observed event
    times.

    Prints the fold-averaged concordance index and Brier score per evaluation
    time. Returns nothing.
    """
    from dsm import DeepSurvivalMachines
    import numpy as np
    from sksurv.metrics import concordance_index_ipcw, brier_score

    def _as_structured(events, durations):
        # (event, time) record array in the layout passed to the metrics.
        # NOTE(review): times are stored as int, so fractional times are
        # truncated -- confirm the input times are whole numbers.
        return np.array(list(zip(events, durations)),
                        dtype=[('e', bool), ('t', int)])

    survival_data = np.loadtxt('./new_survival_data.csv', delimiter=',')
    features = np.loadtxt('./new_features.csv', delimiter=',')

    x = features
    t = survival_data[:, 0]
    e = survival_data[:, 1]

    # Evaluation horizons: quartiles of the times of rows with an event.
    times = np.quantile(t[e == 1], [0.25, 0.5, 0.75]).tolist()

    cv_folds = 2
    # Round-robin fold label for every row. Deriving the labels from len(x)
    # keeps the masks below the same length as the data for any input size
    # (a fixed-length repeated list is too short for large inputs).
    folds = np.arange(len(x)) % cv_folds

    cis = []
    brs = []
    for fold in range(cv_folds):
        print("On Fold:", fold)

        held_out = folds == fold
        x_train, t_train, e_train = x[~held_out], t[~held_out], e[~held_out]
        x_test, t_test, e_test = x[held_out], t[held_out], e[held_out]
        print(x_train.shape)

        model = DeepSurvivalMachines(distribution='Weibull', layers=[100])
        model.fit(x_train, t_train, e_train, iters=10, learning_rate=1e-3,
                  batch_size=10)

        et_train = _as_structured(e_train, t_train)
        et_test = _as_structured(e_test, t_test)

        out_risk = model.predict_risk(x_test, times)
        out_survival = model.predict_survival(x_test, times)

        # One concordance index per horizon, using the matching risk column.
        cis.append([
            concordance_index_ipcw(et_train, et_test, out_risk[:, i], horizon)[0]
            for i, horizon in enumerate(times)
        ])
        brs.append(brier_score(et_train, et_test, out_survival, times)[1])

    print("Concordance Index:", np.mean(cis, axis=0))
    print("Brier Score:", np.mean(brs, axis=0))
def test_brier_wrong_estimate_shape(nottingham_prognostic_index):
    """An estimate whose shape disagrees with ``times`` or the test data is rejected."""
    pred, y = nottingham_prognostic_index([720, 1825])

    # (expected error message, estimate passed in, requested times)
    cases = (
        ("expected estimate with 2 columns, but got 1",
         pred[:, :1], [720, 1825]),
        ("expected estimate with 3 columns, but got 2",
         pred, [720, 960, 1825]),
        ("expected estimate with 686 samples, but got 10",
         pred[:10], [720, 1825]),
    )

    for message, estimate, times in cases:
        with pytest.raises(ValueError, match=message):
            brier_score(y, y, estimate, times=times)
def test_brier_wrong_estimate_shape(nottingham_prognostic_index):
    """An estimate whose shape disagrees with ``times`` or the test data is rejected.

    Column mismatches are reported with an "expected estimate with ..."
    message; a row-count mismatch is reported with the "inconsistent numbers
    of samples" message.
    """
    pred, y = nottingham_prognostic_index([720, 1825])

    # (expected error pattern, estimate passed in, requested times)
    cases = (
        ("expected estimate with 2 columns, but got 1",
         pred[:, :1], [720, 1825]),
        ("expected estimate with 3 columns, but got 2",
         pred, [720, 960, 1825]),
        (r"Found input variables with inconsistent numbers of samples: \[686, 10\]",
         pred[:10], [720, 1825]),
    )

    for pattern, estimate, times in cases:
        with pytest.raises(ValueError, match=pattern):
            brier_score(y, y, estimate, times=times)
def test_brier_nottingham(brier_npi_data):
    """The first returned Brier score agrees with the fixture's reference to 6 decimals."""
    estimate, survival, eval_times, reference = brier_npi_data

    result = brier_score(survival, survival, estimate.squeeze(), times=eval_times)
    score = result[1]

    deviation = abs(score[0] - reference)
    assert round(deviation, 6) == 0
def stratified_brier_score(
    maximum_brier_eval_time,
    survival_data_train,
    survival_data_test,
    risk_score_train,
    risk_score_test,
    strata_train,
    strata_test,
    stratified_fitted=True,
    save_stratified_scores=True,
    minimum_brier_eval_time=None,
    eval_time_step=20,
):
    """Compute Brier scores per stratum from Breslow survival curves.

    For every stratum, test samples whose event time lies outside the range
    of that stratum's training event times are dropped. A common grid of
    evaluation times is built, survival probabilities on that grid are
    predicted with a ``BreslowEstimator`` fitted on the training risk scores,
    and ``brier_score`` is evaluated separately within each stratum.

    Parameters
    ----------
    maximum_brier_eval_time :
        Upper limit for the evaluation grid; the grid never extends past the
        smallest per-stratum maximum event time either.
    survival_data_train, survival_data_test :
        Arrays indexable by the fields ``"event_time"`` and
        ``"event_indicator"`` and by boolean masks.
    risk_score_train, risk_score_test :
        Risk scores aligned with the survival data; passed to the Breslow
        estimator.
    strata_train, strata_test :
        Stratum label of every train / test sample. Both must contain the
        same set of labels.
    stratified_fitted :
        If true, one Breslow estimator is fitted per stratum; otherwise a
        single estimator is fitted on all strata pooled together.
    save_stratified_scores :
        If true, return the average of the per-stratum scores weighted by the
        number of retained test samples; otherwise return the per-stratum
        scores and the group sizes unaggregated.
    minimum_brier_eval_time :
        Optional lower limit for the evaluation grid; only applied when it is
        larger than the largest per-stratum minimum event time.
    eval_time_step :
        Spacing of the evaluation grid. The default of 20 is the value that
        used to be hard-coded.

    Returns
    -------
    ``(eval_times, averaged_scores)`` when ``save_stratified_scores`` is
    true, else ``(eval_times, per_stratum_scores, group_sizes)``.

    Raises
    ------
    ValueError
        If train and test data do not contain the same strata.
    """
    strata_levels = np.unique(strata_train)
    test_levels = np.unique(strata_test)
    # Every step below pairs a train stratum with the test stratum of the
    # same label, so differing label sets cannot be evaluated. Fail early
    # with a clear message instead of deep inside NumPy or the estimator.
    if not np.array_equal(strata_levels, test_levels):
        raise ValueError(
            "strata_train and strata_test must contain the same strata, "
            f"got {strata_levels.tolist()} and {test_levels.tolist()}"
        )

    event_time_train = survival_data_train["event_time"]
    event_time_test = survival_data_test["event_time"]

    # Per-stratum time ranges. The test range only considers test samples
    # that do not fall below the train minimum (for the minimum) or above
    # the train maximum (for the maximum).
    min_strata_train = []
    max_strata_train = []
    min_strata_test = []
    max_strata_test = []
    for s in strata_levels:
        times_train = event_time_train[strata_train == s]
        times_test = event_time_test[strata_test == s]
        train_min = np.min(times_train)
        train_max = np.max(times_train)

        min_strata_train.append(train_min)
        max_strata_train.append(train_max)
        min_strata_test.append(np.min(times_test[~(train_min > times_test)]))
        max_strata_test.append(np.max(times_test[~(train_max < times_test)]))

    # Choose the maximum of the minimal values within the strata
    event_time_strata_min = np.max(min_strata_train + min_strata_test)
    # Choose the minimum of maximal values within the strata
    event_time_strata_max = np.min(max_strata_train + max_strata_test)

    if event_time_strata_max < maximum_brier_eval_time:
        pec_largest_eval_time = event_time_strata_max
    else:
        pec_largest_eval_time = maximum_brier_eval_time

    if (minimum_brier_eval_time is not None
            and event_time_strata_min < minimum_brier_eval_time):
        pec_smallest_eval_time = minimum_brier_eval_time
    else:
        pec_smallest_eval_time = event_time_strata_min

    # Final evaluation times for brier score
    eval_times_brier_score = np.arange(
        start=pec_smallest_eval_time,
        stop=pec_largest_eval_time - 1,
        step=eval_time_step,
    )

    survival_train_groups = []
    survival_test_groups = []
    risk_train_groups = []
    risk_test_groups = []
    strata_indicator_train_groups = []
    strata_indicator_test_groups = []
    for s in strata_levels:
        in_train = strata_train == s
        in_test = strata_test == s

        survival_strata_train = survival_data_train[in_train]
        survival_strata_test = survival_data_test[in_test]
        risk_score_strata_test = risk_score_test[in_test]
        strata_indicator_test = strata_test[in_test]

        # Keep only test samples whose time lies within the range of the
        # training times of this stratum.
        train_times = survival_strata_train["event_time"]
        test_times = survival_strata_test["event_time"]
        outside = (np.min(train_times) > test_times) | (np.max(train_times) < test_times)
        if outside.any():
            risk_score_strata_test = risk_score_strata_test[~outside]
            survival_strata_test = survival_strata_test[~outside]
            strata_indicator_test = strata_indicator_test[~outside]

        survival_train_groups.append(survival_strata_train)
        survival_test_groups.append(survival_strata_test)
        risk_train_groups.append(risk_score_train[in_train])
        risk_test_groups.append(risk_score_strata_test)
        strata_indicator_train_groups.append(strata_train[in_train])
        strata_indicator_test_groups.append(strata_indicator_test)

    predictions = []
    if stratified_fitted:
        for train_data, train_risk, test_risk in zip(
                survival_train_groups, risk_train_groups, risk_test_groups):
            # Fit Breslow Estimator on Training Data.
            breslow = BreslowEstimator()
            breslow.fit(
                train_risk,
                train_data["event_indicator"],
                train_data["event_time"],
            )
            # Predict Survival Probability on Test Data.
            surv_funcs = breslow.get_survival_function(test_risk)
            predictions.append([fn(eval_times_brier_score) for fn in surv_funcs])
    else:
        # One estimator fitted on all strata pooled together.
        pooled_train = np.concatenate(survival_train_groups)
        breslow = BreslowEstimator()
        breslow.fit(
            np.concatenate(risk_train_groups),
            pooled_train["event_indicator"],
            pooled_train["event_time"],
        )
        surv_funcs = breslow.get_survival_function(
            np.concatenate(risk_test_groups))
        predictions.append([fn(eval_times_brier_score) for fn in surv_funcs])

    # Re-assemble the filtered data; the concatenation order matches the
    # order in which the predictions were produced.
    kept_strata_train = np.concatenate(strata_indicator_train_groups)
    kept_strata_test = np.concatenate(strata_indicator_test_groups)
    kept_survival_train = np.concatenate(survival_train_groups)
    kept_survival_test = np.concatenate(survival_test_groups)
    predictions = np.concatenate(predictions)

    total_brier_scores = []
    group_sizes = []
    for s in np.unique(kept_strata_train):
        train_dat = kept_survival_train[kept_strata_train == s]
        test_dat = kept_survival_test[kept_strata_test == s]
        preds = predictions[kept_strata_test == s]
        # IPCW weights are included in imported function "brier_score"
        _, strata_brier_score = brier_score(
            train_dat,
            test_dat,
            preds,
            eval_times_brier_score,
        )
        total_brier_scores.append(strata_brier_score)
        group_sizes.append(len(preds))

    if save_stratified_scores:
        return eval_times_brier_score, np.average(
            np.stack(total_brier_scores), weights=group_sizes, axis=0)
    else:
        return eval_times_brier_score, total_brier_scores, group_sizes
def main(args):
    """Evaluate a trained model on the test set and export SHAP importances.

    Steps:
        1. Extract ``model.tar.gz`` and unpickle the model.
        2. Read the train and test feature CSVs; the first column of each is
           the label, whose sign encodes the event (positive = event) and
           whose absolute value is the duration.
        3. Predict on the test set and write a JSON report containing a
           classification report, accuracy, concordance index and Brier
           score.
        4. Read SHAP values from the debugger output, save them as CSV and
           save a summary plot.

    Args:
        args: parsed command-line arguments. The attributes read here are
            ``model_name`` (file name of the pickled model inside the
            archive), ``test_features`` and ``train_features`` (CSV file
            names), ``report_name`` (evaluation output file name),
            ``shap_name`` (SHAP CSV file name), ``threshold`` (cut-off
            applied to the predictions) and ``tau`` (truncation time passed
            to the concordance index). Defaults are defined by the argument
            parser, outside this function.
    """
    model_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")

    logger.info(f"Extracting model from path: {model_path}")
    # NOTE(security): extractall() and pickle.load() below must only ever be
    # used on trusted artifacts; both can execute or overwrite arbitrary
    # content if the archive comes from an untrusted source.
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    logger.info("Loading model")
    with open(args.model_name, "rb") as f:
        model = pickle.load(f)

    logger.info("Loading train and test data")
    test_features_data = os.path.join("/opt/ml/processing/test", args.test_features)
    train_features_data = os.path.join("/opt/ml/processing/train", args.train_features)

    X_test = pd.read_csv(test_features_data, header=0)
    X_train = pd.read_csv(train_features_data, header=0)

    y_test = X_test.iloc[:, 0]
    y_train = X_train.iloc[:, 0]

    # Reverse transfrom to event and duration columns
    y_test_df = pd.DataFrame(
        np.vstack((np.where(y_test > 0, 1, 0), np.abs(y_test))).T,
        columns=["event", "duration"],
    )
    y_train_df = pd.DataFrame(
        np.vstack((np.where(y_train > 0, 1, 0), np.abs(y_train))).T,
        columns=["event", "duration"],
    )

    # Remove the label column from each frame. Each frame must be indexed by
    # its OWN first column: once X_test has been modified, X_test.columns[0]
    # no longer names the label.
    X_test.drop(X_test.columns[0], axis=1, inplace=True)
    X_train.drop(X_train.columns[0], axis=1, inplace=True)

    logger.info("Running inference")
    # NOTE(review): one more leading column is skipped here although the
    # label has already been dropped above -- confirm that the CSV carries an
    # extra non-feature column after the label.
    predictions = model.predict(xgboost.DMatrix(X_test.values[:, 1:]), output_margin=False)

    logger.info("Creating evaluation report")

    # NOTE: technical evaluation is really not as a classifier
    # TO DO: Normalize to 0 to 1 scale
    predicted_event = predictions > args.threshold
    report_dict = classification_report(y_test_df["event"], predicted_event, output_dict=True)
    report_dict["accuracy"] = accuracy_score(y_test_df["event"], predicted_event)

    _, y_train_tuple = get_x_y(y_train_df, ["event", "duration"], pos_label=True)
    _, y_test_tuple = get_x_y(y_test_df, ["event", "duration"], pos_label=True)

    concordance_index = concordance_index_ipcw(
        y_train_tuple,
        y_test_tuple,
        predictions,
        tau=args.tau,  # default within 100 days
    )

    report_dict["concordance_index"] = {
        "cindex": float(concordance_index[0]),
        "concordant": int(concordance_index[1]),
        "discordant": int(concordance_index[2]),
        "tied_risk": int(concordance_index[3]),
        "tied_time": int(concordance_index[4]),
    }

    # Score at a single time point just inside the observed test follow-up.
    times, score = brier_score(y_train_tuple, y_test_tuple, predictions,
                               y_test_df["duration"].max() - 1)

    report_dict["brier_score"] = {
        "times": times.astype(np.int32).tolist(),
        "score": score.astype(np.float32).tolist(),
    }

    logger.info(f"Classification report:\n{report_dict}")

    evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", args.report_name)
    logger.info(f"Saving classification report to {evaluation_output_path}")
    logger.debug(report_dict)

    with open(evaluation_output_path, "w") as f:
        f.write(json.dumps(report_dict))

    # SHAP
    latest_job_debugger_artifacts_path = "/opt/ml/processing/debug/debug-output"
    trial = create_trial(latest_job_debugger_artifacts_path)

    shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)

    pd.DataFrame(shap_values).to_csv(
        os.path.join("/opt/ml/processing/evaluation", args.shap_name))

    # Drop the first row and the last column before plotting.
    # NOTE(review): presumably the last column is the base value and the
    # first row is not a data sample -- confirm against the debugger output.
    shap_no_base = shap_values[1:, :-1]
    feature_names = X_train.columns
    os.makedirs("/opt/ml/processing/plot/", exist_ok=True)
    # logging needs a format string as first argument; passing the shapes
    # directly makes the logging module fail while formatting the record.
    logger.info(
        "SHAP values shape: %s, without base: %s, train features shape: %s",
        shap_values.shape, shap_no_base.shape, X_train.shape,
    )
    shap.summary_plot(shap_no_base, features=X_train, feature_names=feature_names, show=False)
    plt.savefig("/opt/ml/processing/plot/feature_importance.png", bbox_inches="tight")
) for s in np.unique(strata): strata_train_dat = survival_data_train[strata_train == s] strata_test_dat = survival_data_test[strata_test == s] kaplan_preds = np.repeat( [kmf.predict(eval_times_brier_score).to_numpy()], strata_test_dat.shape[0], axis=0, ) times, km_score = brier_score( survival_train=strata_train_dat, survival_test=strata_test_dat, estimate=kaplan_preds, times=eval_times_brier_score, ) kaplan_brier_scores.append(km_score) kaplan_group_sizes.append(strata_test_dat.shape[0]) kmf_brier_scores = np.average(np.stack(kaplan_brier_scores), weights=kaplan_group_sizes, axis=0) lasso_strat_eval_times, lasso_strat_brier_scores = stratified_brier_score( MAX_EVAL_TIME_PEC, survival_data_train, survival_data_test, lasso_linear_predictor_strat_train.to_numpy(),