def compute_patient_batches(configs):
    """Split the included patient IDs into fixed-size batches and pickle the maps.

    The ID list is read from ``configs["included_pid_path"]`` and every entry
    is converted to ``int``. The list is then cut, in file order, into
    consecutive chunks of ``configs["batch_size_patients_per_file"]`` IDs (the
    final chunk may be shorter).

    A dict with two entries is pickled to ``configs["output_path"]``:

    * ``"batch_to_lst"``: batch index -> list of patient IDs in that batch
    * ``"pid_to_batch"``: patient ID -> index of the batch containing it
    """
    patient_ids = [int(raw_pid) for raw_pid in
                   mlhc_io.read_list_from_file(configs["included_pid_path"])]
    print("Number of PID stays in the database: {}".format(len(patient_ids)))

    members_by_batch = {}
    batch_by_pid = {}
    chunk_starts = range(0, len(patient_ids),
                         configs["batch_size_patients_per_file"])

    for batch_no, start in enumerate(chunk_starts):
        stop = start + configs["batch_size_patients_per_file"]
        chunk = patient_ids[start:stop]
        members_by_batch[batch_no] = chunk
        # Reverse lookup: every ID of this chunk points back at its batch.
        batch_by_pid.update(dict.fromkeys(chunk, batch_no))

    payload = {"batch_to_lst": members_by_batch, "pid_to_batch": batch_by_pid}

    with open(configs["output_path"], 'wb') as out_file:
        pickle.dump(payload, out_file)
Exemple #2
0
def filter_patients(configs):
    """Keep the patient stays whose heart-rate record is long and gap-free enough.

    For every ID listed in ``configs["all_pid_stay_path"]`` the rows of that
    stay are loaded from the HDF table at ``configs["vital_per_table_path"]``.
    Only rows with both ``observationoffset`` and ``heartrate`` present are
    considered; offsets are treated as minutes. A stay is dropped when

    * it has no such rows (the time span is NaN),
    * the span between first and last heart-rate row is shorter than
      ``configs["min_length_days"]`` or longer than
      ``configs["max_length_days"]`` days, or
    * the largest gap between consecutive heart-rate rows exceeds
      ``configs["max_hr_disconnect_mins"]``.

    The surviving IDs are sorted and written to ``configs["output_path"]``.
    """
    candidate_pids = mlhc_io.read_list_from_file(configs["all_pid_stay_path"])
    n_candidates = len(candidate_pids)
    print("Number of PID stays in the database: {}".format(n_candidates))
    kept_pids = []

    for position, pid in enumerate(candidate_pids, start=1):
        # Progress report every 100 stays.
        if position % 100 == 0:
            print("PID: {}/{}".format(position, n_candidates))
            print("Included patient stays: {}/{}".format(len(kept_pids),
                                                         position))

        vitals = pd.read_hdf(configs["vital_per_table_path"], mode='r',
                             where="patientunitstayid={}".format(pid))
        # Stable sort by time so consecutive differences are real gaps.
        vitals.sort_values(by="observationoffset", inplace=True,
                           kind="mergesort")

        hr_rows = vitals[["observationoffset", "heartrate"]].dropna()
        hr_offsets = hr_rows["observationoffset"]
        span_hours = (hr_offsets.max() - hr_offsets.min()) / 60.0

        # No usable heart-rate rows at all.
        if np.isnan(span_hours):
            continue
        # Stay length must lie within the configured [min, max] day bounds.
        if span_hours < 24 * configs["min_length_days"]:
            continue
        if span_hours > configs["max_length_days"] * 24:
            continue

        # NOTE(review): a stay with a single heart-rate row that passes the
        # length check (only possible if min_length_days is 0) yields NaN
        # here and int() raises -- confirm this cannot occur in practice.
        longest_gap_mins = int(hr_offsets.diff().max())

        # Heart-rate sensor disconnected for longer than allowed.
        if longest_gap_mins > configs["max_hr_disconnect_mins"]:
            continue

        kept_pids.append(pid)

    mlhc_io.write_list_to_file(configs["output_path"], sorted(kept_pids))
Exemple #3
0
def save_variable_quantiles(configs):
    """Save the quantiles of all variables in the LAB/VITAL SIGN tables.

    For every variable, the quantiles at levels 0.01, 0.02, ... (as produced
    by ``np.arange(0.01, 1.00, 0.01)``) are computed over the rows belonging
    to the patients listed in ``configs["included_pid_path"]``. The result is
    a dict written as JSON to ``configs["quantile_path"]`` with keys

    * ``"lab_<labname>"`` for every distinct ``labname`` in the lab table
      (``configs["lab_table_path"]``, column ``labresult``),
    * ``"periodic_<var>"`` for every name in ``configs["list_per_variables"]``
      (table ``configs["vital_per_path"]``),
    * ``"aperiodic_<var>"`` for every name in
      ``configs["list_aper_variables"]`` (table ``configs["vital_aper_path"]``),

    each mapping to the list of quantile values in ascending level order.

    The periodic table is read in blocks of one million rows and filtered
    block by block, so the unfiltered table is never held in memory at once.
    """
    quantile_levels = np.arange(0.01, 1.00, 0.01)

    def quantile_list(series):
        # One quantile value per level, in ascending level order.
        return [series.quantile(level) for level in quantile_levels]

    # NOTE(review): the IDs are used exactly as returned by
    # read_list_from_file (no int conversion, unlike
    # compute_patient_batches); isin() only matches if their type agrees
    # with the patientunitstayid column -- confirm.
    all_pids = mlhc_io.read_list_from_file(configs["included_pid_path"])
    vital_per_variables = mlhc_io.read_list_from_file(
        configs["list_per_variables"])
    vital_aper_variables = mlhc_io.read_list_from_file(
        configs["list_aper_variables"])
    var_quantiles = {}

    print("Lab table...")
    df_lab = pd.read_hdf(configs["lab_table_path"], mode='r')
    df_lab = df_lab[df_lab.patientunitstayid.isin(all_pids)]
    print("Loaded lab table with {} rows".format(df_lab.shape[0]))

    for lab_var in df_lab.labname.unique():
        print("Lab variable: {}".format(lab_var))
        key = "lab_" + lab_var
        var_quantiles[key] = quantile_list(
            df_lab[df_lab.labname == lab_var].labresult)
        print("List length: {}".format(len(var_quantiles[key])))

    # Drop the reference first so the collection can actually free the table.
    del df_lab
    gc.collect()

    print("Vital periodic table...")
    blk_sz = 1000000
    kept_blocks = []
    num_records = 0
    block_idx = 0
    while True:
        df_block = pd.read_hdf(configs["vital_per_path"],
                               mode='r',
                               start=block_idx * blk_sz,
                               stop=(block_idx + 1) * blk_sz)
        # A short (or empty) block means the end of the table was reached.
        reached_end = len(df_block) != blk_sz
        df_block = df_block[df_block.patientunitstayid.isin(all_pids)]
        kept_blocks.append(df_block)
        num_records += len(df_block)
        block_idx += 1
        print("%d Vital periodic records loaded" % num_records)
        if reached_end:
            break
    df_vital_per = pd.concat(kept_blocks)
    del kept_blocks, df_block
    print("Loaded vital periodic table with {} rows".format(
        df_vital_per.shape[0]))

    for per_var in vital_per_variables:
        print("Periodic variable: {}".format(per_var))
        key = "periodic_" + per_var
        var_quantiles[key] = quantile_list(df_vital_per[per_var])
        print("List length: {}".format(len(var_quantiles[key])))

    del df_vital_per
    gc.collect()

    print("Vital aperiodic table...")
    df_vital_aper = pd.read_hdf(configs["vital_aper_path"], mode='r')
    df_vital_aper = df_vital_aper[
        df_vital_aper.patientunitstayid.isin(all_pids)]
    print("Loaded vital aperiodic table with {} rows".format(
        df_vital_aper.shape[0]))

    for aper_var in vital_aper_variables:
        print("Aperiodic variable: {}".format(aper_var))
        key = "aperiodic_" + aper_var
        var_quantiles[key] = quantile_list(df_vital_aper[aper_var])
        print("List length: {}".format(len(var_quantiles[key])))

    del df_vital_aper
    gc.collect()

    # Context manager guarantees the file is closed even if dumping fails.
    with open(configs["quantile_path"], mode='w') as quantile_fp:
        json.dump(var_quantiles, quantile_fp)
Exemple #4
0
def _count_present(counts, names):
    """Increment ``counts[name]`` by one for every name in ``names``."""
    for name in names:
        counts[name] = counts.get(name, 0) + 1


def _frequent_vars(counts, n_patients, min_freq):
    """Return, sorted, the names whose count / n_patients is >= min_freq."""
    return [name for name in sorted(counts)
            if counts[name] / n_patients >= min_freq]


def _inspect_distribution(series):
    """Draw a log histogram and a box plot of ``series`` on a scratch figure.

    The histogram range is clipped to the 0.1 / 99.9 percentiles. The figure
    is neither saved nor shown here; it is always closed so that repeated
    calls do not accumulate open matplotlib figures.
    """
    values = np.array(series)
    lower_cutoff = np.percentile(values, 0.1)
    upper_cutoff = np.percentile(values, 99.9)
    fig, axarr = plt.subplots(2)
    try:
        series.plot.hist(bins=100, ax=axarr[0], log=True,
                         range=(lower_cutoff, upper_cutoff))
        series.plot.box(ax=axarr[1], sym="", vert=False)
    finally:
        plt.close(fig)
    gc.collect()


def filter_variables(configs):
    """Select the variables that are observed in enough patients.

    For every patient ID in ``configs["included_pid_path"]`` the patient's
    rows are loaded from ``vitalPeriodic.h5``, ``vitalAperiodic.h5`` and
    ``lab.h5`` (all below ``configs["hdf_dir"]``). A variable counts as
    observed for a patient when it has at least one non-missing value; lab
    variables are identified by their whitespace-stripped ``labname`` and
    counted at most once per patient.

    A variable is selected when the fraction of scanned patients in which it
    was observed is at least ``configs["required_var_freq"]``. The sorted
    selections are written to ``configs["output_selected_per_vars"]``,
    ``configs["output_selected_aper_vars"]`` and
    ``configs["output_selected_lab_vars"]``. Afterwards the distribution of
    every selected variable is plotted on a throw-away figure.

    When ``configs["debug_mode"]`` is true, the patient scan stops at the
    first multiple-of-1000 progress report and each plotting loop stops after
    its first variable. The frequency denominator is the number of patients
    actually scanned, so the fractions stay meaningful in debug mode too.
    """
    vital_variables = ["temperature", "sao2", "heartrate", "respiration",
                       "cvp", "etco2", "systemicsystolic",
                       "systemicdiastolic", "systemicmean", "pasystolic",
                       "padiastolic", "pamean", "st1", "st2", "st3", "icp"]

    vital_aper_variables = ["noninvasivesystolic", "noninvasivediastolic",
                            "noninvasivemean", "paop", "cardiacoutput",
                            "cardiacinput", "svr", "svri", "pvr", "pvri"]

    periodic_path = os.path.join(configs["hdf_dir"], "vitalPeriodic.h5")
    aperiodic_path = os.path.join(configs["hdf_dir"], "vitalAperiodic.h5")
    lab_path = os.path.join(configs["hdf_dir"], "lab.h5")

    # Resolve the output paths up front so a missing key fails before the
    # long patient scan rather than after it.
    per_out_path = configs["output_selected_per_vars"]
    aper_out_path = configs["output_selected_aper_vars"]
    lab_out_path = configs["output_selected_lab_vars"]
    debug_mode = configs["debug_mode"]

    all_pids = list(map(int, mlhc_io.read_list_from_file(
        configs["included_pid_path"])))

    per_counts = {}
    aper_counts = {}
    lab_counts = {}
    n_scanned = 0

    for pidx, pat in enumerate(all_pids):
        if (pidx+1) % 1000 == 0:
            print("Patient {}/{}".format(pidx+1, len(all_pids)))
            if debug_mode:
                break

        where = "patientunitstayid={}".format(pat)
        df_periodic = pd.read_hdf(periodic_path, mode='r', where=where)
        df_aperiodic = pd.read_hdf(aperiodic_path, mode='r', where=where)
        df_lab = pd.read_hdf(lab_path, mode='r', where=where)[
            ["labname", "labresult"]].dropna()

        # A set, so names that only differ in surrounding whitespace count
        # once per patient.
        _count_present(lab_counts,
                       {name.strip() for name in df_lab.labname.unique()})
        _count_present(per_counts,
                       [var for var in vital_variables
                        if df_periodic[var].dropna().shape[0] > 0])
        _count_present(aper_counts,
                       [var for var in vital_aper_variables
                        if df_aperiodic[var].dropna().shape[0] > 0])
        n_scanned += 1

    required_freq = configs["required_var_freq"]
    per_selected_vars = _frequent_vars(per_counts, n_scanned, required_freq)
    aper_selected_vars = _frequent_vars(aper_counts, n_scanned, required_freq)
    lab_selected_vars = _frequent_vars(lab_counts, n_scanned, required_freq)

    mlhc_io.write_list_to_file(per_out_path, per_selected_vars)
    mlhc_io.write_list_to_file(aper_out_path, aper_selected_vars)
    mlhc_io.write_list_to_file(lab_out_path, lab_selected_vars)

    vital_tables = ((periodic_path, per_selected_vars),
                    (aperiodic_path, aper_selected_vars))
    for table_path, selected_vars in vital_tables:
        for var in selected_vars:
            print("Analyzing variable: {}".format(var))
            df_var = pd.read_hdf(table_path, mode='r',
                                 columns=[var, "patientunitstayid"]).dropna()
            _inspect_distribution(
                df_var[df_var["patientunitstayid"].isin(all_pids)][var])
            if debug_mode:
                break

    df_all_vars = pd.read_hdf(
        lab_path, mode='r',
        columns=["labname", "labresult", "patientunitstayid"]).dropna()
    df_all_vars = df_all_vars[df_all_vars["patientunitstayid"].isin(all_pids)]

    for var in lab_selected_vars:
        print("Analyzing variable: {}".format(var))
        # NOTE(review): selected names are whitespace-stripped while the
        # labname column is compared as stored -- confirm the stored names
        # carry no surrounding whitespace.
        _inspect_distribution(
            df_all_vars[df_all_vars["labname"] == var]["labresult"])
        if debug_mode:
            break
def save_variable_quantiles(configs):
    """Save the quantiles of all variables in the LAB/VITAL SIGN tables.

    Each of the three HDF tables (``configs["lab_table_path"]``,
    ``configs["vital_per_path"]``, ``configs["vital_aper_path"]``) is loaded
    in full and restricted to the patients listed in
    ``configs["included_pid_path"]``. For every variable the quantiles at the
    levels given by ``np.arange(0.01, 1.00, 0.01)`` are collected into a list,
    stored under

    * ``"lab_<labname>"`` for each distinct ``labname`` (column ``labresult``),
    * ``"periodic_<var>"`` for each name in ``configs["list_per_variables"]``,
    * ``"aperiodic_<var>"`` for each name in
      ``configs["list_aper_variables"]``,

    and the resulting dict is written as JSON to ``configs["quantile_path"]``.
    """
    levels = np.arange(0.01, 1.00, 0.01)

    def collect_quantiles(prefix, name, column):
        # Store the per-level quantiles of `column` under "<prefix><name>".
        key = prefix + name
        var_quantiles[key] = [column.quantile(level) for level in levels]
        print("List length: {}".format(len(var_quantiles[key])))

    def load_included(path):
        # Load a whole table and keep only the rows of included patients.
        table = pd.read_hdf(path, mode='r')
        return table[table.patientunitstayid.isin(all_pids)]

    # NOTE(review): the IDs are used exactly as returned by
    # read_list_from_file (no int conversion, unlike
    # compute_patient_batches); isin() only matches if their type agrees
    # with the patientunitstayid column -- confirm.
    all_pids = mlhc_io.read_list_from_file(configs["included_pid_path"])
    vital_per_variables = mlhc_io.read_list_from_file(
        configs["list_per_variables"])
    vital_aper_variables = mlhc_io.read_list_from_file(
        configs["list_aper_variables"])
    var_quantiles = {}

    print("Lab table...")
    df_lab = load_included(configs["lab_table_path"])
    print("Loaded lab table with {} rows".format(df_lab.shape[0]))

    for lab_var in df_lab.labname.unique():
        print("Lab variable: {}".format(lab_var))
        collect_quantiles("lab_", lab_var,
                          df_lab[df_lab.labname == lab_var].labresult)

    # Drop the reference first so the collection can actually free the table.
    del df_lab
    gc.collect()

    print("Vital periodic table...")
    df_vital_per = load_included(configs["vital_per_path"])
    print("Loaded vital periodic table with {} rows".format(
        df_vital_per.shape[0]))

    for per_var in vital_per_variables:
        print("Periodic variable: {}".format(per_var))
        collect_quantiles("periodic_", per_var, df_vital_per[per_var])

    del df_vital_per
    gc.collect()

    print("Vital aperiodic table...")
    df_vital_aper = load_included(configs["vital_aper_path"])
    print("Loaded vital aperiodic table with {} rows".format(
        df_vital_aper.shape[0]))

    for aper_var in vital_aper_variables:
        print("Aperiodic variable: {}".format(aper_var))
        collect_quantiles("aperiodic_", aper_var, df_vital_aper[aper_var])

    del df_vital_aper
    gc.collect()

    # Context manager guarantees the file is closed even if dumping fails.
    with open(configs["quantile_path"], mode='w') as quantile_fp:
        json.dump(var_quantiles, quantile_fp)