def get_all_features_data(labs_df, labs_data_dict, vitals_df, vitals_data_dict,
                          demographics_df, demographics_data_dict,
                          medications_df, medications_data_dict,
                          include_medications=True):
    '''Merge labs, vitals, (optionally) medications and demographics features.

    The time-varying tables (vitals, labs and, when requested, medications)
    are outer-joined on the id columns plus the time column, both parsed from
    `vitals_data_dict`. The result is then inner-joined with the static
    demographics table on the id columns.

    Args:
        labs_df, vitals_df, medications_df: time-varying feature tables that
            share the id and time columns described by `vitals_data_dict`.
        demographics_df: static feature table keyed by the id columns.
        labs_data_dict, vitals_data_dict, medications_data_dict,
        demographics_data_dict: data dicts describing the matching tables.
        include_medications: when False, `medications_df` and
            `medications_data_dict` are ignored.

    Returns:
        Tuple `(features_df, features_data_dict)`: the merged table and the
        merged data dict, whose 'fields' entry mirrors its
        ['schema']['fields'].
    '''
    time_col = parse_time_col(vitals_data_dict)
    id_cols = parse_id_cols(vitals_data_dict)
    merge_keys = id_cols + [time_col]

    # merge the labs and vitals
    highfreq_df = pd.merge(vitals_df, labs_df, on=merge_keys, how='outer')
    data_dicts_to_merge = [labs_data_dict, vitals_data_dict]

    if include_medications:
        highfreq_df = pd.merge(highfreq_df, medications_df,
                               on=merge_keys, how='outer')
        medication_features = parse_feature_cols(medications_data_dict)
        if medication_features:
            # Forward fill medications because the patient is/is not on a
            # medication at the new time points created by the outer join.
            # GroupBy.ffill keeps the original row index, so the assignment
            # below lines up row by row. This relies on the outer merge
            # returning rows ordered by the join keys (documented pandas
            # behaviour for how='outer').
            filled = highfreq_df.groupby(id_cols)[medication_features].ffill()
            # Rows before the first medication record of a stay stay NaN
            # after the forward fill; treat them as "not on medication".
            # Only medication columns are filled, never the id columns.
            highfreq_df[medication_features] = (
                filled[medication_features].fillna(0))
        data_dicts_to_merge.append(medications_data_dict)

    highfreq_data_dict = merge_data_dicts(data_dicts_to_merge)
    highfreq_data_dict['fields'] = highfreq_data_dict['schema']['fields']

    # keep only the columns described by the merged data dict
    cols_to_keep = (parse_id_cols(highfreq_data_dict)
                    + [parse_time_col(highfreq_data_dict)]
                    + parse_feature_cols(highfreq_data_dict))
    highfreq_df = highfreq_df[cols_to_keep].copy()

    # merge the high-frequency features with the static features
    features_df = pd.merge(highfreq_df, demographics_df,
                           on=id_cols, how='inner')
    features_data_dict = merge_data_dicts(
        [highfreq_data_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict
def get_all_features_data(labs_df, labs_data_dict, vitals_df, vitals_data_dict,
                          demographics_df, demographics_data_dict):
    '''Combine labs, vitals and demographics into one table plus its data dict.

    Labs and vitals are outer-joined on the id columns and the time column
    (both parsed from `vitals_data_dict`); the result is inner-joined with the
    demographics table on the id columns.

    Returns:
        Tuple `(features_df, features_data_dict)`.
    '''
    timestamp_key = parse_time_col(vitals_data_dict)
    stay_keys = parse_id_cols(vitals_data_dict)

    # time-varying part: vitals joined with labs
    dynamic_df = vitals_df.merge(labs_df,
                                 on=stay_keys + [timestamp_key],
                                 how='outer')
    dynamic_dict = merge_data_dicts([labs_data_dict, vitals_data_dict])
    dynamic_dict['fields'] = dynamic_dict['schema']['fields']

    # restrict to the columns the merged data dict describes
    wanted_cols = parse_id_cols(dynamic_dict)
    wanted_cols = wanted_cols + [parse_time_col(dynamic_dict)]
    wanted_cols = wanted_cols + parse_feature_cols(dynamic_dict)
    dynamic_df = dynamic_df[wanted_cols].copy()

    # attach the static demographics to every time step of a stay
    features_df = dynamic_df.merge(demographics_df, on=stay_keys, how='inner')
    features_data_dict = merge_data_dicts(
        [dynamic_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict
def compute_mews(ts_df, args, mews_df):
    '''Compute one MEWS score per stay, taken at the last time step of the stay.

    Missing vitals are forward filled within each stay; values that are still
    missing are replaced by the column median of the forward-filled table.
    Each vital is then mapped to a score through its rows of `mews_df`, and
    the per-vital scores are summed.

    Args:
        ts_df: time-series table holding the id columns named by
            `args.data_dict` and the four vital columns listed in
            `feature_cols`. A new stay starts wherever any id column changes
            between consecutive rows, so rows of one stay must be contiguous.
        args: object whose `data_dict` attribute is understood by
            `parse_id_cols`.
        mews_df: lookup table with columns 'vital', 'range_max' and 'score'.
            NOTE(review): the rows of each vital are assumed to be sorted by
            ascending 'range_max' (np.searchsorted requires it) -- confirm
            against whatever produces this table.

    Returns:
        DataFrame with one row per stay: the id columns followed by
        'mews_score'. Empty (columns only) when `ts_df` has no rows.

    Raises:
        KeyError: if some value of a vital is above every 'range_max' listed
            for that vital, is still NaN after imputation, or the vital has
            no rows in `mews_df`.
    '''
    id_cols = parse_id_cols(args.data_dict)
    id_cols = remove_col_names_from_list_if_not_in_df(id_cols, ts_df)
    feature_cols = ['systolic_blood_pressure', 'heart_rate',
                    'respiratory_rate', 'body_temperature']

    n_rows = ts_df.shape[0]
    if n_rows == 0:
        # No stays to score; avoid indexing the last row of an empty stay.
        return pd.DataFrame(columns=list(id_cols) + ['mews_score'])

    # Obtain fenceposts based on where any key differs.
    # Keys are converted to a numerical datatype so np.diff can detect changes.
    keys_df = ts_df[id_cols].copy()
    for col in id_cols:
        if not pd.api.types.is_numeric_dtype(keys_df[col].dtype):
            keys_df[col] = keys_df[col].astype('category').cat.codes
    key_changes = np.diff(keys_df.values, axis=0).any(axis=1)
    fp = np.hstack([0, 1 + np.flatnonzero(key_changes), n_rows])

    # Forward fill within each stay. GroupBy.ffill keeps the original row
    # order and index, so positions still line up with the fenceposts.
    if id_cols:
        imputed_df = ts_df.groupby(id_cols)[feature_cols].ffill()[feature_cols]
    else:
        imputed_df = ts_df[feature_cols].ffill()
    imputed_df = imputed_df.astype(float)
    # Remaining gaps get the population median of the forward-filled values.
    imputed_df = imputed_df.fillna(imputed_df.median())

    # Score every row once per vital. Row-level scores are kept so that a
    # different per-stay aggregate can be swapped in below if needed.
    row_scores = np.zeros(n_rows)
    for feature in feature_cols:
        feature_table = mews_df[mews_df['vital'] == feature]
        upper_bounds = feature_table['range_max'].to_numpy().astype(float)
        bin_scores = feature_table['score'].to_numpy().astype(float)
        # Index of the first bin whose upper bound is >= the observed value.
        bin_idx = np.searchsorted(upper_bounds, imputed_df[feature].to_numpy())
        if (bin_idx >= len(bin_scores)).any():
            raise KeyError(
                "values of '%s' fall outside every range_max listed in "
                "mews_df (or are NaN after imputation)" % feature)
        row_scores += bin_scores[bin_idx]

    # The MEWS score of a stay is the score at its last observed time step.
    mews_scores = row_scores[fp[1:] - 1]

    mews_scores_df = pd.DataFrame(data=mews_scores, columns=['mews_score'])
    # Insert in reverse so the id columns end up in their original order.
    for col_name in id_cols[::-1]:
        mews_scores_df.insert(
            0, col_name, ts_df[col_name].values[fp[:-1]].copy())
    return mews_scores_df
# locate the train / valid / test feature files and their data dict
split_dir = args.train_test_split_dir
x_train_csv = os.path.join(split_dir, 'x_train.csv.gz')
x_valid_csv = os.path.join(split_dir, 'x_valid.csv.gz')
x_test_csv = os.path.join(split_dir, 'x_test.csv.gz')
x_dict_json = os.path.join(split_dir, 'x_dict.json')

# load the data dict and the three feature tables
x_data_dict = load_data_dict_json(x_dict_json)
x_train_df = pd.read_csv(x_train_csv)
x_valid_df = pd.read_csv(x_valid_csv)
x_test_df = pd.read_csv(x_test_csv)

# column roles according to the data dict
id_cols = parse_id_cols(x_data_dict)
feature_cols = parse_feature_cols(x_data_dict)
time_col = parse_time_col(x_data_dict)

# split the features by whether their name contains 'medication'
non_medication_feature_cols = []
medication_feature_cols = []
for col_name in feature_cols:
    if 'medication' in col_name:
        medication_feature_cols.append(col_name)
    else:
        non_medication_feature_cols.append(col_name)

# add mask features: 1.0 where the value is observed, 0.0 where it is missing
print('Adding missing values mask as features...')
for feature_col in non_medication_feature_cols:
    x_train_df['mask_' + feature_col] = x_train_df[feature_col].notna() * 1.0
# Expose the schema's field list at the top level of the vitals data dict.
# Dicts without a ['schema']['fields'] entry are left untouched.
try:
    vitals_data_dict['fields'] = vitals_data_dict['schema']['fields']
except KeyError:
    pass

# Load the labs data dict and apply the same top-level 'fields' shortcut.
with open(labs_data_dict_json, 'r') as f:
    labs_data_dict = json.load(f)
try:
    labs_data_dict['fields'] = labs_data_dict['schema']['fields']
except KeyError:
    pass

# Id and time columns are taken from the vitals data dict; feature names
# come from each table's own data dict.
id_cols = parse_id_cols(vitals_data_dict)
vitals = parse_feature_cols(vitals_data_dict)
labs = parse_feature_cols(labs_data_dict)
time_col = parse_time_col(vitals_data_dict)

# compute missingness per stay
# count() gives the number of non-null entries per column within each group
# of id columns, so a count of 0 means that vital was never observed.
vital_counts_per_stay_df = df_vitals.groupby(id_cols).count()
print('#######################################')
print('MISSINGNESS OF VITALS OVER FULL STAYS : ')
vitals_missing_rate_entire_stay_dict = dict()
for vital in vitals:
    # fraction of groups in which this vital has no observation at all
    vitals_missing_rate_entire_stay_dict[vital] = (
        (vital_counts_per_stay_df[vital] == 0).sum()) / vital_counts_per_stay_df.shape[0]
vitals_missing_rate_entire_stay_series = pd.Series(
    vitals_missing_rate_entire_stay_dict)
print(vitals_missing_rate_entire_stay_series)

# same per-group non-null counts for the labs table
lab_counts_per_stay_df = df_labs.groupby(id_cols).count()