def compute_patient_batches(configs):
    ''' Splits the included patient IDs into consecutive batches and pickles
        the resulting look-up tables.

        Reads from `configs`:
          included_pid_path: file holding the list of patient IDs
          batch_size_patients_per_file: number of patient IDs per batch
          output_path: destination of the pickle file

        The pickled object is a dict with two entries: "batch_to_lst" maps a
        batch index to the list of patient IDs in that batch, and
        "pid_to_batch" maps a patient ID to the index of its batch.
    '''
    patient_ids = [int(raw_pid) for raw_pid in
                   mlhc_io.read_list_from_file(configs["included_pid_path"])]
    print("Number of PID stays in the database: {}".format(len(patient_ids)))

    chunk_len = configs["batch_size_patients_per_file"]
    batch_to_list_dict = {}
    pid_to_batch_dict = {}

    # Walk over the ID list in strides of `chunk_len`; the last batch may be
    # shorter than the others.
    batch_starts = range(0, len(patient_ids), chunk_len)
    for batch_no, start in enumerate(batch_starts):
        members = patient_ids[start:start + chunk_len]
        batch_to_list_dict[batch_no] = members
        pid_to_batch_dict.update(dict.fromkeys(members, batch_no))

    payload = {"batch_to_lst": batch_to_list_dict,
               "pid_to_batch": pid_to_batch_dict}
    with open(configs["output_path"], 'wb') as out_fp:
        pickle.dump(payload, out_fp)
def filter_patients(configs):
    ''' Selects the patient stays whose heart-rate recording is long enough,
        not too long, and free of large gaps, then writes the sorted list of
        selected IDs to disk.

        Reads from `configs`:
          all_pid_stay_path: file holding the candidate patient stay IDs
          vital_per_table_path: HDF table queried per stay
          min_length_days / max_length_days: allowed span of the HR recording
          max_hr_disconnect_mins: largest tolerated gap between HR samples
          output_path: destination of the list of selected IDs
    '''
    pid_list = mlhc_io.read_list_from_file(configs["all_pid_stay_path"])
    n_candidates = len(pid_list)
    print("Number of PID stays in the database: {}".format(n_candidates))

    included_patients = []

    for position, pid in enumerate(pid_list, start=1):

        # Progress report every 100 stays
        if position % 100 == 0:
            print("PID: {}/{}".format(position, n_candidates))
            print("Included patient stays: {}/{}".format(
                len(included_patients), position))

        stay_query = "patientunitstayid={}".format(pid)
        df_stay = pd.read_hdf(configs["vital_per_table_path"], mode='r',
                              where=stay_query)
        df_stay = df_stay.sort_values(by="observationoffset",
                                      kind="mergesort")

        # Keep only the rows where both the offset and the HR are present
        hr_frame = df_stay[["observationoffset", "heartrate"]].dropna()
        hr_offsets = hr_frame["observationoffset"]
        offset_gaps = hr_offsets.diff()

        # Offsets are divided by 60 to obtain the span in hours
        segment_hours = (hr_offsets.max() - hr_offsets.min()) / 60.0

        # Exclude stays without any HR sample, and stays whose HR recording
        # is shorter or longer than the configured limits.
        if np.isnan(segment_hours):
            continue
        if segment_hours < 24 * configs["min_length_days"]:
            continue
        if segment_hours > configs["max_length_days"] * 24:
            continue

        # Exclude stays where the gap between consecutive HR samples exceeds
        # the configured limit. The gap is truncated to an integer first.
        largest_gap_mins = int(offset_gaps.max())
        if largest_gap_mins > configs["max_hr_disconnect_mins"]:
            continue

        included_patients.append(pid)

    mlhc_io.write_list_to_file(configs["output_path"],
                               sorted(included_patients))
def save_variable_quantiles(configs):
    ''' Saves the quantiles of all variables in the LAB/VITAL SIGN tables

        For every lab variable and for every listed periodic / aperiodic
        vital-sign variable, the 0.01, 0.02, ..., 0.99 quantiles over the
        included patient stays are computed and stored as a list. The lists
        are keyed by "lab_<name>", "periodic_<name>" and "aperiodic_<name>"
        and written as one JSON object.

        The periodic table is read in blocks of one million rows and filtered
        block by block, so the unfiltered table is never held in memory at
        once.

        Reads from `configs`:
          included_pid_path: file holding the included patient stay IDs
          list_per_variables / list_aper_variables: files listing the
            periodic / aperiodic variables to process
          lab_table_path, vital_per_path, vital_aper_path: input HDF tables
          quantile_path: destination of the JSON file

        NOTE(review): this source contains a second definition named
        save_variable_quantiles; if both live in the same module the later
        one replaces this one at import time -- confirm which is intended.
    '''
    # Convert the IDs to int before matching them against the
    # `patientunitstayid` column, as the other consumers of
    # `included_pid_path` do.
    all_pids = list(map(int, mlhc_io.read_list_from_file(
        configs["included_pid_path"])))
    vital_per_variables = mlhc_io.read_list_from_file(
        configs["list_per_variables"])
    vital_aper_variables = mlhc_io.read_list_from_file(
        configs["list_aper_variables"])

    quantile_grid = np.arange(0.01, 1.00, 0.01)

    def _grid_quantiles(series):
        ''' Quantiles of `series` on the fixed grid, as a list of floats'''
        # One vectorised call instead of one call per grid point
        return series.quantile(quantile_grid).tolist()

    var_quantiles = {}

    print("Lab table...")
    df_lab = pd.read_hdf(configs["lab_table_path"], mode='r')
    df_lab = df_lab[df_lab.patientunitstayid.isin(all_pids)]
    print("Loaded lab table with {} rows".format(df_lab.shape[0]))

    for lab_var in df_lab.labname.unique():
        print("Lab variable: {}".format(lab_var))
        lab_key = "lab_" + lab_var
        df_var = df_lab[df_lab.labname == lab_var]
        var_quantiles[lab_key] = _grid_quantiles(df_var.labresult)
        print("List length: {}".format(len(var_quantiles[lab_key])))
        gc.collect()

    print("Vital periodic table...")
    blk_sz = 1000000
    df_cts = []
    num_records = 0
    ct = 0
    found_eof = False

    while not found_eof:
        df_ct = pd.read_hdf(configs["vital_per_path"], mode='r',
                            start=ct * blk_sz, stop=(ct + 1) * blk_sz)

        # A block shorter than requested (possibly empty) is the last one
        if len(df_ct) != blk_sz:
            found_eof = True

        df_ct = df_ct[df_ct.patientunitstayid.isin(all_pids)]
        df_cts.append(df_ct)
        num_records += len(df_ct)
        ct += 1
        print("%d Vital periodic records loaded" % num_records)

    df_vital_per = pd.concat(df_cts)
    print("Loaded vital periodic table with {} rows".format(
        df_vital_per.shape[0]))

    for per_var in vital_per_variables:
        print("Periodic variable: {}".format(per_var))
        per_key = "periodic_" + per_var
        var_quantiles[per_key] = _grid_quantiles(df_vital_per[per_var])
        print("List length: {}".format(len(var_quantiles[per_key])))
        gc.collect()

    print("Vital aperiodic table...")
    df_vital_aper = pd.read_hdf(configs["vital_aper_path"], mode='r')
    df_vital_aper = df_vital_aper[
        df_vital_aper.patientunitstayid.isin(all_pids)]
    print("Loaded vital aperiodic table with {} rows".format(
        df_vital_aper.shape[0]))

    for aper_var in vital_aper_variables:
        print("Aperiodic variable: {}".format(aper_var))
        aper_key = "aperiodic_" + aper_var
        var_quantiles[aper_key] = _grid_quantiles(df_vital_aper[aper_var])
        print("List length: {}".format(len(var_quantiles[aper_key])))
        gc.collect()

    # The context manager closes the file even if serialisation fails
    with open(configs["quantile_path"], mode='w') as quantile_fp:
        json.dump(var_quantiles, quantile_fp)
def filter_variables(configs):
    ''' Selects the periodic, aperiodic and lab variables that are observed
        in a sufficiently large fraction of the included patient stays, and
        writes the three lists of selected variable names to disk.

        A variable counts as observed for a stay if the stay has at least one
        non-missing value of it. A variable is selected if
        (#stays with the variable) / (#stays) >= configs["required_var_freq"].
        Lab variable names are compared with surrounding whitespace removed.

        After the selection, a histogram and a box plot are drawn for every
        selected variable and discarded again.
        NOTE(review): the figures are neither saved nor shown, so this pass
        has no persistent output -- confirm whether a savefig call is missing.

        Reads from `configs`:
          hdf_dir: directory holding vitalPeriodic.h5, vitalAperiodic.h5 and
            lab.h5
          included_pid_path: file holding the included patient stay IDs
          required_var_freq: minimal fraction of stays a variable must occur in
          output_selected_per_vars / output_selected_aper_vars /
            output_selected_lab_vars: destinations of the three name lists
          debug_mode: if true, the counting pass stops when the 1000th stay
            is reached, the denominator is fixed to 1000, and only the first
            variable of each group is plotted
    '''
    vital_variables = ["temperature", "sao2", "heartrate", "respiration",
                       "cvp", "etco2", "systemicsystolic",
                       "systemicdiastolic", "systemicmean", "pasystolic",
                       "padiastolic", "pamean", "st1", "st2", "st3", "icp"]
    vital_aper_variables = ["noninvasivesystolic", "noninvasivediastolic",
                            "noninvasivemean", "paop", "cardiacoutput",
                            "cardiacinput", "svr", "svri", "pvr", "pvri"]

    periodic_path = os.path.join(configs["hdf_dir"], "vitalPeriodic.h5")
    aperiodic_path = os.path.join(configs["hdf_dir"], "vitalAperiodic.h5")
    lab_path = os.path.join(configs["hdf_dir"], "lab.h5")

    # Resolve the output locations up front, so that a missing config entry
    # is reported before the expensive per-patient pass starts.
    per_output_path = configs["output_selected_per_vars"]
    aper_output_path = configs["output_selected_aper_vars"]
    lab_output_path = configs["output_selected_lab_vars"]

    all_pids = list(map(int, mlhc_io.read_list_from_file(
        configs["included_pid_path"])))

    # Denominator of the observation frequency
    base_size = 1000 if configs["debug_mode"] else len(all_pids)

    var_obs_count_dict = {}
    aper_var_obs_count_dict = {}
    lab_var_obs_count_dict = {}

    def _count_observed(df_pat, variables, counts):
        ''' Increments the count of each variable with >=1 non-missing value'''
        for var in variables:
            if df_pat[var].count() > 0:
                counts[var] = counts.get(var, 0) + 1

    for pidx, pat in enumerate(all_pids):
        if (pidx + 1) % 1000 == 0:
            print("Patient {}/{}".format(pidx + 1, len(all_pids)))
            if configs["debug_mode"]:
                break

        pat_query = "patientunitstayid={}".format(pat)
        df_periodic = pd.read_hdf(periodic_path, mode='r', where=pat_query)
        df_aperiodic = pd.read_hdf(aperiodic_path, mode='r', where=pat_query)
        df_lab = pd.read_hdf(lab_path, mode='r', where=pat_query)[
            ["labname", "labresult"]].dropna()

        # A set, so that names differing only in surrounding whitespace
        # count once per stay.
        for var in {name.strip() for name in df_lab.labname.unique()}:
            lab_var_obs_count_dict[var] = \
                lab_var_obs_count_dict.get(var, 0) + 1

        _count_observed(df_periodic, vital_variables, var_obs_count_dict)
        _count_observed(df_aperiodic, vital_aper_variables,
                        aper_var_obs_count_dict)

    def _select_frequent(counts):
        ''' Sorted names of the variables reaching the required frequency'''
        return [var for var in sorted(counts)
                if counts[var] / base_size >= configs["required_var_freq"]]

    per_selected_vars = _select_frequent(var_obs_count_dict)
    aper_selected_vars = _select_frequent(aper_var_obs_count_dict)
    lab_selected_vars = _select_frequent(lab_var_obs_count_dict)

    mlhc_io.write_list_to_file(per_output_path, per_selected_vars)
    mlhc_io.write_list_to_file(aper_output_path, aper_selected_vars)
    mlhc_io.write_list_to_file(lab_output_path, lab_selected_vars)

    def _plot_distribution(values):
        ''' Draws the histogram / box plot of one variable and discards it'''
        # np.percentile cannot handle an empty selection
        if values.shape[0] == 0:
            return

        value_arr = np.array(values)
        lower_cutoff = np.percentile(value_arr, 0.1)
        upper_cutoff = np.percentile(value_arr, 99.9)

        fig, axarr = plt.subplots(2)
        values.plot.hist(bins=100, ax=axarr[0], log=True,
                         range=(lower_cutoff, upper_cutoff))
        values.plot.box(ax=axarr[1], sym="", vert=False)

        # Close the figure instead of only clearing it; otherwise one open
        # figure per variable accumulates in pyplot.
        plt.close(fig)
        gc.collect()

    def _analyze_vital_table(table_path, selected_vars):
        ''' Plots every selected variable of one vital-sign table'''
        for var in selected_vars:
            print("Analyzing variable: {}".format(var))
            df_var = pd.read_hdf(
                table_path, mode='r',
                columns=[var, "patientunitstayid"]).dropna()
            in_cohort = df_var["patientunitstayid"].isin(all_pids)
            _plot_distribution(df_var[in_cohort][var])

            if configs["debug_mode"]:
                break

    _analyze_vital_table(periodic_path, per_selected_vars)
    _analyze_vital_table(aperiodic_path, aper_selected_vars)

    df_all_vars = pd.read_hdf(
        lab_path, mode='r',
        columns=["labname", "labresult", "patientunitstayid"]).dropna()
    df_all_vars = df_all_vars[
        df_all_vars["patientunitstayid"].isin(all_pids)]

    # The selected names were stripped while counting, so strip the table's
    # names the same way before comparing.
    lab_names = df_all_vars["labname"].map(lambda name: name.strip())

    for var in lab_selected_vars:
        print("Analyzing variable: {}".format(var))
        _plot_distribution(df_all_vars[lab_names == var]["labresult"])

        if configs["debug_mode"]:
            break
def save_variable_quantiles(configs):
    ''' Saves the quantiles of all variables in the LAB/VITAL SIGN tables

        For every lab variable and for every listed periodic / aperiodic
        vital-sign variable, the 0.01, 0.02, ..., 0.99 quantiles over the
        included patient stays are computed and stored as a list. The lists
        are keyed by "lab_<name>", "periodic_<name>" and "aperiodic_<name>"
        and written as one JSON object.

        Each of the three tables is loaded with a single read and filtered
        afterwards.

        Reads from `configs`:
          included_pid_path: file holding the included patient stay IDs
          list_per_variables / list_aper_variables: files listing the
            periodic / aperiodic variables to process
          lab_table_path, vital_per_path, vital_aper_path: input HDF tables
          quantile_path: destination of the JSON file

        NOTE(review): this source contains another definition named
        save_variable_quantiles that reads the periodic table in blocks; if
        both live in the same module, confirm which one is intended to be
        active.
    '''
    # Convert the IDs to int before matching them against the
    # `patientunitstayid` column, as the other consumers of
    # `included_pid_path` do.
    all_pids = list(map(int, mlhc_io.read_list_from_file(
        configs["included_pid_path"])))
    vital_per_variables = mlhc_io.read_list_from_file(
        configs["list_per_variables"])
    vital_aper_variables = mlhc_io.read_list_from_file(
        configs["list_aper_variables"])

    quantile_grid = np.arange(0.01, 1.00, 0.01)
    var_quantiles = {}

    def _store_quantiles(key, series):
        ''' Stores the grid quantiles of `series` under `key` and logs them'''
        # One vectorised call instead of one call per grid point
        var_quantiles[key] = series.quantile(quantile_grid).tolist()
        print("List length: {}".format(len(var_quantiles[key])))
        gc.collect()

    def _load_cohort_table(table_path):
        ''' Loads an HDF table and keeps the rows of the included stays'''
        df_table = pd.read_hdf(table_path, mode='r')
        return df_table[df_table.patientunitstayid.isin(all_pids)]

    print("Lab table...")
    df_lab = _load_cohort_table(configs["lab_table_path"])
    print("Loaded lab table with {} rows".format(df_lab.shape[0]))

    for lab_var in df_lab.labname.unique():
        print("Lab variable: {}".format(lab_var))
        df_var = df_lab[df_lab.labname == lab_var]
        _store_quantiles("lab_" + lab_var, df_var.labresult)

    print("Vital periodic table...")
    df_vital_per = _load_cohort_table(configs["vital_per_path"])
    print("Loaded vital periodic table with {} rows".format(
        df_vital_per.shape[0]))

    for per_var in vital_per_variables:
        print("Periodic variable: {}".format(per_var))
        _store_quantiles("periodic_" + per_var, df_vital_per[per_var])

    print("Vital aperiodic table...")
    df_vital_aper = _load_cohort_table(configs["vital_aper_path"])
    print("Loaded vital aperiodic table with {} rows".format(
        df_vital_aper.shape[0]))

    for aper_var in vital_aper_variables:
        print("Aperiodic variable: {}".format(aper_var))
        _store_quantiles("aperiodic_" + aper_var, df_vital_aper[aper_var])

    # The context manager closes the file even if serialisation fails
    with open(configs["quantile_path"], mode='w') as quantile_fp:
        json.dump(var_quantiles, quantile_fp)