y_test_ids_df, on=id_cols) test_labs_df = pd.merge(collapsed_labs_df, y_test_ids_df, on=id_cols) test_mews_df = pd.merge(mews_df, y_test_ids_df, on=id_cols) # merge them test_collapsed_features_df = pd.merge(test_vitals_df, test_labs_df, on=id_cols, how='inner') test_features_df = pd.merge(test_collapsed_features_df, demographics_df, on=id_cols) if p == 0: test_features_dict = merge_data_dicts([ collapsed_vitals_data_dict, collapsed_labs_data_dict, demographics_data_dict ]) test_outcomes_df = pd.merge(test_features_df[id_cols], outcomes_df, on=id_cols, how='inner') # # get performance metrics feature_cols = parse_feature_cols(test_features_dict['schema']) mews_score_col = parse_feature_cols(mews_data_dict['schema']) x_test = test_features_df[feature_cols].values y_test = test_outcomes_df[outcome_col].values mews_test = test_mews_df[mews_score_col].values # bootstrap test set inds without replacement
chosen_stay_ids_df = chosen_stay_ids_df.drop_duplicates(subset=id_cols).reset_index(drop=True) # for each patient get their vitals, labs, demographics labs_df, labs_data_dict, vitals_df, vitals_data_dict, \ demographics_df, demographics_data_dict, outcomes_df, outcomes_data_dict = get_preprocessed_data(args.preproc_data_dir) vitals = parse_feature_cols(vitals_data_dict) labs = parse_feature_cols(labs_data_dict) chosen_stay_labs_df = pd.merge(labs_df, chosen_stay_ids_df, on=id_cols, how='inner') chosen_stay_vitals_df = pd.merge(vitals_df, chosen_stay_ids_df, on=id_cols, how='inner') chosen_stay_highfreq_df = pd.merge(chosen_stay_labs_df, chosen_stay_vitals_df, on = id_cols + ['hours_since_admission', 'timestamp'], how='outer') highfreq_features_dict = merge_data_dicts([labs_data_dict, vitals_data_dict]) highfreq_features_dict['fields'] = highfreq_features_dict['schema']['fields'] # choose a subject chosen_short_stay_subj_list = ['14343967', '18115638', '18826316', '17245153', '17557700', '11212084', '11163358', '17684794', '12751842', '11528888', '1379931', '17745211', '12862019', '14201044', '14917356', '17682462', '1339889', '17995864', '15787542', '13007083', '18239690', '11692208', '19352552', '19438165', '14858518'] chosen_long_stay_subj_list = ['12702290', '19160387', '19806342', '19222313', '17863017'] chosen_stay_subj_list = chosen_short_stay_subj_list + chosen_long_stay_subj_list # chosen_stay_subj_list = chosen_long_stay_subj_list[0:2] for idx in chosen_stay_subj_list:
else: test_collapsed_features_df = pd.merge(test_vitals_df, test_labs_df, on=id_cols, how='inner') data_dicts_list = [ collapsed_vitals_data_dict, collapsed_labs_data_dict, demographics_data_dict ] test_features_df = pd.merge(test_collapsed_features_df, demographics_df, on=id_cols) if p == 0: test_features_dict = merge_data_dicts(data_dicts_list) test_outcomes_df = pd.merge(test_features_df[id_cols], outcomes_df, on=id_cols, how='inner') # # get performance metrics feature_cols = parse_feature_cols(test_features_dict['schema']) mews_score_col = parse_feature_cols(mews_data_dict['schema']) x_test = test_features_df[feature_cols].values y_test = test_outcomes_df[outcome_col].values # load the scaler scaler = pickle.load( open(os.path.join(clf_models_dir, 'scaler.pkl'), 'rb'))