def get_counts_and_clasif(up_to_dataset_dict, timepoint):
    X_train, y_train, X_test, y_test, xgb_model, input_data = unpack_input_data(
        up_to_dataset_dict[timepoint]['input_file'],
        up_to_dataset_dict[timepoint]['model_file'])

    bin_X_test = X_test.applymap(lambda x: 1 if x > 1 else 0)
    cor_df = creat_count_df(xgb_model, X_test, y_test)

    # train classifier
    tprs, fprs, prs, rcs, thresholds = classifier(X_test, y_test)
    sorted_fprs, sorted_tprs, sorted_recall, sorted_precision, sorted_thresholds = prep_for_auc(
        fprs, tprs, prs, rcs, thresholds)
    auc, pr_auc = calc_auc(sorted_tprs, sorted_fprs, sorted_precision, sorted_recall)

    # chance level for the PR curve = prevalence of the positive class
    pr_chance = np.round(np.sum(y_test) / len(y_test), 2)

    return cor_df, sorted_fprs, sorted_tprs, sorted_recall, sorted_precision, auc, pr_auc, pr_chance
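# A minimal sketch of what calc_auc plausibly does (it is not defined in this
# file; the name calc_auc_sketch avoids shadowing the real helper). Assumptions:
# sklearn.metrics is imported as `metrics` as elsewhere in this file, and the
# sorted_* arrays from prep_for_auc are monotone in the x-coordinate so
# trapezoidal integration applies. Argument order mirrors the call above.
def calc_auc_sketch(sorted_tprs, sorted_fprs, sorted_precision, sorted_recall):
    auc = metrics.auc(sorted_fprs, sorted_tprs)             # area under the ROC curve
    pr_auc = metrics.auc(sorted_recall, sorted_precision)   # area under the PR curve
    return auc, pr_auc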
def calc_npv(y_true, y_pred):
    # negative predictive value: TN / (TN + FN)
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fn)


# -----------
# MAIN
# -----------
# %%
###
###    get model predictions
###
X_train, y_train, X_test, y_test, xgb_model, input_data = unpack_input_data(input_file, model_file)
metrics_results, metrics_df, model_params = validate_best_model(xgb_model, X_test, y_test)

y_pred, y_proba = get_preds(xgb_model, X_test)
pred_df = pd.DataFrame({'GRIDS': X_test.index.tolist(),
                        'y_true': y_test,
                        'y_pred': y_pred,
                        'y_proba': y_proba[:, 1]})

# %%
###
###    load risk fx
###
long_risk_df = harmonize_risk_fx(risk_file_dict, risk_cols_to_keep_dict)
long_risk_df['RISK_CAT_LABEL'] = long_risk_df.RISK_CATEGORY + ", " + long_risk_df.RISK_LABEL

wide_risk_df = long_risk_df.pivot(index='GRID', columns='RISK_LABEL', values='RISK_CATEGORY')
risk_cols = long_risk_df.RISK_LABEL.unique()
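# A minimal sketch of get_preds (assumed; it is not defined in this file, so the
# sketch is named get_preds_sketch to avoid shadowing the real helper). Given how
# its outputs are used above -- hard 0/1 labels plus an (n, 2) probability array
# indexed with [:, 1] -- it plausibly wraps the sklearn-style predict calls:
def get_preds_sketch(model, X):
    y_pred = model.predict(X)           # hard 0/1 class labels
    y_proba = model.predict_proba(X)    # shape (n_samples, 2); [:, 1] = P(y=1)
    return y_pred, y_proba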
# set up paths
timeseries = ['0_weeks', '13_weeks', '28_weeks', '35_weeks', '37_weeks']

roc_dict = dict()
pr_dict = dict()
f1_score = dict()

for timepoint in timeseries:
    results_dir = os.path.join(ICDCPT_DIR, f'{timepoint}_notwins_timeseries_v1')
    input_file = glob.glob(results_dir + "/input_data*.tsv")[0]
    model_file = glob.glob(results_dir + "/best_xgb_model*.pickle")[0]

    # load models and input files
    _, _, ehr_X_test, ehr_y_test, ehr_xgb_model, ehr_input_data = unpack_input_data(input_file, model_file)
    ehr_metrics_results, ehr_metrics_df, _ = validate_best_model(ehr_xgb_model, ehr_X_test, ehr_y_test)

    # ROC coordinates
    ehr_interp_fpr, ehr_interp_tpr, ehr_auc = get_auroc_coords(ehr_metrics_results)
    roc_dict[timepoint] = {'interp_fpr': ehr_interp_fpr,
                           'interp_tpr': ehr_interp_tpr,
                           'auc': ehr_auc}

    f1_score[timepoint] = {'f1_score': ehr_metrics_results['f1_score'],
                           'pr_score': ehr_metrics_results['pr_score'],
                           'rc_score': ehr_metrics_results['rc_score']}

    # precision-recall coordinates
    ehr_interp_rc, ehr_interp_pr, ehr_pr_auc, ehr_pos_prop = get_pr_coord(ehr_metrics_results, ehr_y_test)
    pr_dict[timepoint] = {'interp_rc': ehr_interp_rc,
                          'interp_pr': ehr_interp_pr,
                          'pr_auc': ehr_pr_auc,
                          'pos_prop': ehr_pos_prop}
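# A minimal sketch of get_auroc_coords (assumed; not defined in this file, hence
# the _sketch suffix). Given the returned names, it likely interpolates the ROC
# curve stored in metrics_results onto a fixed FPR grid so curves from different
# timepoints can be overlaid. The 'fpr'/'tpr' keys and grid size are assumptions.
def get_auroc_coords_sketch(metrics_results, n_points=100):
    fprs = metrics_results['fpr']
    tprs = metrics_results['tpr']
    interp_fpr = np.linspace(0, 1, n_points)
    interp_tpr = np.interp(interp_fpr, fprs, tprs)  # fprs from roc_curve is increasing
    interp_tpr[0] = 0.0                             # force the curve through the origin
    auc = metrics.auc(fprs, tprs)
    return interp_fpr, interp_tpr, auc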
###
###    MAIN
###

### define file paths
ehr_input_file = os.path.join(RF_DIR, 'input_data_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.tsv')
riskfx_input_file = os.path.join(CLIN_RISK_DIR, 'input_data_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.tsv')

ehr_model_file = os.path.join(RF_DIR, 'best_xgb_model_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.pickle')
riskfx_model_file = os.path.join(CLIN_RISK_DIR, 'best_xgb_model_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.pickle')

### load models and input files
ehr_X_train, ehr_y_train, ehr_X_test, ehr_y_test, ehr_xgb_model, ehr_input_data = unpack_input_data(
    ehr_input_file, ehr_model_file)
riskfx_X_train, riskfx_y_train, riskfx_X_test, riskfx_y_test, riskfx_xgb_model, riskfx_input_data = unpack_input_data(
    riskfx_input_file, riskfx_model_file)

# check that GRIDs match across the two datasets!
assert np.all(ehr_input_data.GRID == riskfx_input_data.GRID)

# %%
###
###    compare predictions
###
ehr_y_pred, ehr_y_proba = get_preds(ehr_xgb_model, ehr_X_test)
riskfx_y_pred, riskfx_y_proba = get_preds(riskfx_xgb_model, riskfx_X_test)

# measure PPV and NPV
ehr_ppv = metrics.precision_score(ehr_y_test, ehr_y_pred)
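# Sketch of the parallel PPV/NPV calls this comparison implies (an assumption
# about what follows, not the author's code; variable names follow the script,
# and calc_npv is the TN/(TN+FN) helper defined earlier in this file):
riskfx_ppv = metrics.precision_score(riskfx_y_test, riskfx_y_pred)
ehr_npv = calc_npv(ehr_y_test, ehr_y_pred)
riskfx_npv = calc_npv(riskfx_y_test, riskfx_y_pred)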
    # force the curve to end at (recall=1, precision=pos_prop)
    interp_rc = np.hstack((interp_rc, np.array([1])))
    interp_pr = np.hstack((interp_pr, np.array([pos_prop])))

    return interp_rc, interp_pr, pr_auc, pos_prop


# %%
###
###    main
###
THIS_MODEL = '28_weeks_icd9'

# load vu data
X_train, y_train, X_test, y_test, xgb_model, input_data = unpack_input_data(
    vu_dicts[THIS_MODEL]['input_file'], vu_dicts[THIS_MODEL]['model_file'])
vu_metrics_results, _, _ = validate_best_model(xgb_model, X_test, y_test)

# uc
uc_metrics = uc_dicts[THIS_MODEL]
interp_fpr, interp_tpr, auc = get_auroc_coords(uc_metrics)
interp_rc, interp_pr, pr_auc, pos_prop = get_pr_coord(uc_metrics)

# %%
import matplotlib.font_manager as fm

fpath = '/dors/capra_lab/users/abraha1/conda/envs/py36_r_ml/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/ttf/Arial.ttf'
prop = fm.FontProperties(fname=fpath, size=11)
sprop = fm.FontProperties(fname=fpath, size=9)
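# A minimal usage sketch for these FontProperties objects (the script's actual
# figure code follows elsewhere). Fonts loaded from a file path are not picked
# up via rcParams, so they must be passed per text element:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(interp_fpr, interp_tpr)
ax.set_xlabel('False positive rate', fontproperties=prop)
ax.set_ylabel('True positive rate', fontproperties=prop)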
vg_input_file = os.path.join(
    VG_DIR, 'input_data_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_no_twins_count-2020-06-04.tsv')
csec_model_file = os.path.join(
    CSEC_DIR, 'best_xgb_model_csection_up_to_28_weeks_since_preg_start_icd9_cpt_no_twins_count-2020-06-04.pickle')
vg_model_file = os.path.join(
    VG_DIR, 'best_xgb_model_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_no_twins_count-2020-06-04.pickle')

# load models and input files
csec_X_train, csec_y_train, csec_X_test, csec_y_test, csec_xgb_model, csec_input_data = unpack_input_data(
    csec_input_file, csec_model_file)
vg_X_train, vg_y_train, vg_X_test, vg_y_test, vg_xgb_model, vg_input_data = unpack_input_data(
    vg_input_file, vg_model_file)

# split test-set GRIDs by label (0 = no PTB, 1 = PTB) within each delivery type
csec_no_ptb_test_grids = csec_X_test.reset_index().loc[csec_y_test == 0, 'GRID'].values
csec_ptb_test_grids = csec_X_test.reset_index().loc[csec_y_test == 1, 'GRID'].values
vg_no_ptb_test_grids = vg_X_test.reset_index().loc[vg_y_test == 0, 'GRID'].values
vg_ptb_test_grids = vg_X_test.reset_index().loc[vg_y_test == 1, 'GRID'].values

# consider working in probability space...
# explainer = shap.TreeExplainer(csec_xgb_model.get_booster(), data=shap.sample(csec_X_train, 100), model_output='probability')
# shap_values = explainer.shap_values(csec_X_train)
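# A minimal sketch of the default (log-odds) alternative to the probability-space
# idea commented out above, using the standard shap TreeExplainer API. Unlike the
# probability-space variant, raw-margin attributions need no background sample;
# whether this matches the script's eventual choice is an assumption.
import shap
csec_explainer = shap.TreeExplainer(csec_xgb_model.get_booster())
csec_shap_values = csec_explainer.shap_values(csec_X_test)  # (n_samples, n_features) log-odds attributions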
    RF_DIR, 'input_data_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.tsv')
riskfx_input_file = os.path.join(
    CLIN_RISK_DIR, 'input_data_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.tsv')
ehr_model_file = os.path.join(
    RF_DIR, 'best_xgb_model_up_to_28_weeks_since_preg_start_icd9_cpt_count-2019-06-19.pickle')
riskfx_model_file = os.path.join(
    CLIN_RISK_DIR, 'best_xgb_model_up_to_28_weeks_since_preg_start_risk_fx-2020-03-24.pickle')

# load models and input files
_, _, ehr_X_test, ehr_y_test, ehr_xgb_model, ehr_input_data = unpack_input_data(
    ehr_input_file, ehr_model_file)
_, _, riskfac_X_test, riskfac_y_test, riskfac_xgb_model, riskfac_input_data = unpack_input_data(
    riskfx_input_file, riskfx_model_file)

ehr_metrics_results, ehr_metrics_df, _ = validate_best_model(
    ehr_xgb_model, ehr_X_test, ehr_y_test)
riskfac_metrics_results, riskfac_metrics_df, _ = validate_best_model(
    riskfac_xgb_model, riskfac_X_test, riskfac_y_test)

###
###    plot
###
# %%
# fig parameters
sns.set(style='whitegrid', font_scale=1.5, rc={'figure.figsize': (6, 6)})
sns.set_style({
vg_input_file = os.path.join(
    VG_DIR, 'input_data_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.tsv')
cs_model_file = os.path.join(
    CSEC_DIR, 'best_xgb_model_csection_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.pickle')
vg_model_file = os.path.join(
    VG_DIR, 'best_xgb_model_vaginal_delivery_up_to_28_weeks_since_preg_start_icd9_cpt_count-2020-04-12.pickle')

# load models and input files
_, _, cs_X_test, cs_y_test, cs_xgb_model, cs_input_data = unpack_input_data(
    cs_input_file, cs_model_file)
_, _, vg_X_test, vg_y_test, vg_xgb_model, vg_input_data = unpack_input_data(
    vg_input_file, vg_model_file)

cs_metrics_results, cs_metrics_df, _ = validate_best_model(
    cs_xgb_model, cs_X_test, cs_y_test)
vg_metrics_results, vg_metrics_df, _ = validate_best_model(
    vg_xgb_model, vg_X_test, vg_y_test)

# %%
###
###    plot
###

# plot - PR
mult = 1
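# A minimal sketch of the PR overlay this cell builds toward (assumptions: the
# metrics_results dicts expose 'recall' and 'precision' arrays, and 'mult'
# scales the figure; the script's actual plot styling is elsewhere):
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6 * mult, 6 * mult))
ax.plot(cs_metrics_results['recall'], cs_metrics_results['precision'], label='c-section')
ax.plot(vg_metrics_results['recall'], vg_metrics_results['precision'], label='vaginal delivery')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()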
    long_top_shap_df = pd.melt(top_shap_df, id_vars="GRID", var_name='feat', value_name='feat_shap')
    top_feat_shap_df = pd.merge(long_top_feat_count_df, long_top_shap_df, on=['GRID', 'feat'], how='inner')

    return top_feat_shap_df


# %%
# -----------
# MAIN
# -----------

# ----------- load and melt data -----------
# load feature matrix, labels, and xgboost model
X_train, y_train, X_test, y_test, xgb_model, input_df = unpack_input_data(INPUT_DF_FILE, XGB_MODEL_FILE)
train_df, train_df_w_labels = extract_train_df(input_df)
test_df, test_df_w_labels = extract_test_df(input_df)

# load pickled shap values
train_shap = pickle.load(open(SHAP_TRAIN_PICKLE, 'rb'))
test_shap = pickle.load(open(SHAP_TEST_PICKLE, 'rb'))

# take top 15 shap features ([:, :-1] drops the trailing bias column)
train_top_feats_descrip = filter_shap(train_shap[:, :-1], train_df, top_n=15)
long_shap_feat_df = melt_feat_and_shap(train_shap, train_df_w_labels, train_top_feats_descrip)

# long_shap_feat_df.to_csv(os.path.join(OUTPUT_DIR, 'long_shap_feat_df.tsv'), sep="\t", index=False)
# train_top_feats_descrip.to_csv(os.path.join(OUTPUT_DIR, 'top15_feat_w_descript.tsv'), sep="\t", index=False)
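# A plausible sketch of filter_shap (it is not defined in this file; named
# filter_shap_sketch to avoid shadowing the real helper): rank features by mean
# |SHAP| across samples and keep the top_n. Alignment of SHAP columns with
# feat_df columns, and the returned column names, are assumptions.
def filter_shap_sketch(shap_values, feat_df, top_n=15):
    mean_abs = np.abs(shap_values).mean(axis=0)      # mean |SHAP| per feature
    order = np.argsort(mean_abs)[::-1][:top_n]       # indices of the top_n features
    return pd.DataFrame({'feat': feat_df.columns[order],
                         'mean_abs_shap': mean_abs[order]})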