Example #1
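# Duplicate-removal step: for one chunk of patients, read the per-table HDF5
# data (from 'datetime_fixed' for pharmarec, otherwise 'oor_removed') and
# write the de-duplicated records to 'duplicates_removed/<tbl_name>'.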
import argparse
import os

import numpy as np
import pandas as pd

import preproc_utils

# The snippet begins in the middle of the argument parsing; the parser setup
# and the --tbl_name / --version arguments are reconstructed from the
# attributes read below.
parser = argparse.ArgumentParser()
parser.add_argument('--tbl_name')
parser.add_argument('--version')
parser.add_argument('--index_chunk', type=int, default=None)
parser.add_argument('--output_to_disk', action='store_true')
args = parser.parse_args()
tbl_name = args.tbl_name
version = args.version
index_chunk = args.index_chunk
output_to_disk = args.output_to_disk

data_path = os.path.join(preproc_utils.datapath, '1a_hdf5_clean', version)
input_path = os.path.join(
    data_path, 'datetime_fixed' if tbl_name == 'pharmarec' else 'oor_removed',
    tbl_name)
output_path = os.path.join(data_path, 'duplicates_removed', tbl_name)
if not os.path.exists(output_path) and output_to_disk:
    os.makedirs(output_path)
pid_chunkfile_index = preproc_utils.get_chunking_info(version=version)
pid_list = np.array(pid_chunkfile_index.index[
    pid_chunkfile_index.ChunkfileIndex == index_chunk])
output_path = os.path.join(
    output_path, '%s_%d_%d--%d.h5' %
    (tbl_name, index_chunk, np.min(pid_list), np.max(pid_list)))
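# The output chunk file is named <tbl_name>_<index_chunk>_<min PID>--<max PID>.h5.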

voi = preproc_utils.voi_id_name_mapping(tbl_name, True, version=version)
if tbl_name == 'pharmarec':
    for n, pid in enumerate(pid_list):
        filename = [
            f for f in os.listdir(input_path)
            if '%s_%d_' % (tbl_name, index_chunk) in f
        ][0]
        df = pd.read_hdf(os.path.join(input_path, filename),
                         where='PatientID=%d' % pid)
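The snippet is cut off inside the per-patient loop. For orientation, a minimal sketch of how such a chunk-wise script would be invoked; the script name and version string are placeholders, not taken from the example:

python remove_duplicates.py --tbl_name pharmarec --version <version> --index_chunk 0 --output_to_disk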
Example #2
import gc
import os

import numpy as np
import pandas as pd

import preproc_utils

# tbl_name, version, index_chunk and output_to_disk are assumed to be
# module-level globals defined elsewhere in this script (e.g. parsed from the
# command line as in Example #1), as are the helper functions
# remove_records_with_invalid_status, change_arterial_to_venous,
# cumul_val_to_rate, correct_weight_height,
# increase_categorical_counter_to_merge and remove_out_of_range.


def main():
    data_path = os.path.join(preproc_utils.datapath, '1a_hdf5_clean', version)
    input_path = os.path.join(data_path, 'datetime_fixed', tbl_name)
    output_path = os.path.join(data_path, 'oor_removed', tbl_name)
    if not os.path.exists(output_path) and output_to_disk:
        os.makedirs(output_path)

    pid_chunkfile_index = preproc_utils.get_chunking_info(version=version)
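    # pid_list holds all PatientIDs stored in chunk file `index_chunk`; the
    # output file name below encodes the PID range covered by the chunk.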
    pid_list = np.array(pid_chunkfile_index.index[
        pid_chunkfile_index.ChunkfileIndex == index_chunk])
    output_path = os.path.join(
        output_path, '%s_%d_%d--%d.h5' %
        (tbl_name, index_chunk, np.min(pid_list), np.max(pid_list)))

    # Load the variable-of-interest reference table (ID/name mapping with
    # value bounds and units)
    voi = preproc_utils.voi_id_name_mapping(tbl_name,
                                            replace_name=True,
                                            version=version)
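    # Split the variables into those for which a recorded 0 is plausible
    # (for 'labres': negative lower bound; otherwise also a normal value of 0
    # or ordinal/categorical/yes-no units) and those for which a 0 is
    # considered spurious.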
    if tbl_name == 'labres':
        voi['VariableName'] = voi.VariableName.apply(
            lambda x: x.replace("'", ""))
        voi.loc[24000737, 'VariableName'] = 'v-SO2'
        vid_with_reasonable_0 = voi.index[voi.LowerBound < 0]
    else:
        vid_with_reasonable_0 = voi.index[np.logical_or(
            np.logical_or(voi.LowerBound < 0, voi.NormalValue == 0),
            voi.MetaVariableUnit.apply(
                lambda x: ('ordinal' in x.lower() or 'categorical' in x.lower()
                           or 'yes/no' in x.lower())
                if type(x) == str else False))]
    vid_with_nonsense_0 = list(set(voi.index) - set(vid_with_reasonable_0))

    df_idx_start = 0
    num_pid = len(pid_list)
    df_height = []
    cnt_pid_urine = 0
    for i, pid in enumerate(pid_list):
        filename = [
            f for f in os.listdir(input_path)
            if '%s_%d_' % (tbl_name, index_chunk) in f
        ][0]
        df = pd.read_hdf(os.path.join(input_path, filename),
                         where='PatientID=%d' % pid,
                         mode='r')

        if len(df) == 0:
            print('Patient', pid, 'has no data in %s' % tbl_name)
            continue

        # Harmonize column names across the source tables (PharmaID/GiveDose
        # come from pharmarec; DateTime and SampleTime are the timestamp
        # columns of the other tables)
        df.rename(columns={
            'PharmaID': 'VariableID',
            'GiveDose': 'Value',
            'DateTime': 'Datetime',
            'SampleTime': 'Datetime'
        }, inplace=True)

        # select only variables of interest
        vid_intersect = set(df.VariableID) & set(voi.index)

        # add height (10000450) to the variables of interest if the table is
        # 'observrec', and a few extra derived variables for 'dervals'
        if tbl_name == 'observrec':
            vid_intersect |= {10000450}
        elif tbl_name == 'dervals':
            vid_intersect |= {
                830005420, 30015110, 30015010, 30015075, 30015080
            }

        df.drop(df.index[~df.VariableID.isin(vid_intersect)], inplace=True)
        gc.collect()

        if len(df) == 0:
            print('Patient', pid, 'has no data of interest in %s' % tbl_name)
            continue

        # Only remove value 0 for variables for which 0 has no clinical
        # meaning (the actual drop is currently commented out below)
        if tbl_name not in ['dervals', 'pharmarec']:
            index_nonsense_0 = df.index[np.logical_and(
                df.Value == 0, df.VariableID.isin(vid_with_nonsense_0))]
            # if len(index_nonsense_0) > 0:
            #     print('Patient', pid, 'has non-sense 0 records.')
            #     print(df.loc[index_nonsense_0])
            #     df.drop(index_nonsense_0, inplace=True)

        # remove records with status containing 2 (invalidated)
        df = remove_records_with_invalid_status(df, tbl_name)

        df.sort_values(by=['Datetime', 'VariableID', 'EnterTime'],
                       inplace=True)

        # remove identical records
        df.drop_duplicates(['Datetime', 'VariableID', 'Value', 'Status'],
                           inplace=True)

        if tbl_name == 'labres':
            monvals_svo2_path = os.path.join(
                data_path, 'datetime_fixed', 'monvals_svo2',
                'monvals_svo2_%d_%d--%d.h5' %
                (index_chunk, np.min(pid_list), np.max(pid_list)))
            df = change_arterial_to_venous(df, voi, tbl_name, index_chunk,
                                           monvals_svo2_path)
            df.drop_duplicates(['Datetime', 'VariableID', 'Value'],
                               inplace=True)
            # Fix troponin unit scaling: variable 24000538 is rescaled by
            # 1000, as are 24000806 values recorded on or before 2016-05-01
            if 24000538 in df.VariableID.unique():
                df.loc[df.index[df.VariableID == 24000538], 'Value'] = df[
                    df.VariableID == 24000538].Value.values * 1000
            if 24000806 in df.VariableID.unique():
                if (df[df.VariableID == 24000806].Datetime.min() <=
                        np.datetime64('2016-05-01')):
                    idx_to_convert = df.index[np.logical_and(
                        df.VariableID == 24000806,
                        df.Datetime <= np.datetime64('2016-05-01'))]
                    df.loc[idx_to_convert, 'Value'] = \
                        df.loc[idx_to_convert, 'Value'] * 1000

        elif tbl_name == 'dervals':
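            # Cumulative dervals variables (names containing 'cumul' or '/c')
            # are converted from running totals into rates.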
            cumulative_variable_ids = set(voi.index[voi.VariableName.apply(
                lambda x: 'cumul' in x or '/c' in x)])
            cumulative_variable_ids &= set(df.VariableID.tolist())
            if len(cumulative_variable_ids) == 0:
                pass
                # print('Patient', pid, 'does not have cumulative dervals variables.')
            else:
                cumulative_variable_ids = np.sort(
                    list(cumulative_variable_ids))
                df = cumul_val_to_rate(df, cumulative_variable_ids)

        elif tbl_name == 'observrec':
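            # Same cumulative-to-rate conversion for observrec; additionally
            # weight/height values are corrected, and height records
            # (VariableID 10000450) are split off into df_height and removed
            # from the main frame.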
            cumulative_variable_ids = set(voi.index[voi.VariableName.apply(
                lambda x: 'cumul' in x or '/c' in x)])
            cumulative_variable_ids &= set(df.VariableID.tolist())
            if len(cumulative_variable_ids) == 0:
                pass
                # print('Patient', pid, 'does not have cumulative observrec variables.')
            else:
                cnt_pid_urine += 1
                cumulative_variable_ids = np.sort(
                    list(cumulative_variable_ids))
                df_old = df.copy()
                df = cumul_val_to_rate(df, cumulative_variable_ids)
                assert (df.Value.max() != float('Inf'))
                df = correct_weight_height(df)
                df_height.append(df[df.VariableID == 10000450].copy())
                df.drop(df.index[df.VariableID == 10000450], inplace=True)
                df = increase_categorical_counter_to_merge(df)

        df = remove_out_of_range(df, voi, tbl_name, index_chunk)
        df.drop(df.index[~df.VariableID.isin(vid_intersect)], inplace=True)

        df.set_index(np.arange(df_idx_start, df_idx_start + len(df)),
                     drop=True,
                     inplace=True)

        df_idx_start += len(df)
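        # Append this patient's rows to the chunk's HDF5 output; PatientID and
        # VariableID are stored as data columns so they can be queried with a
        # `where=` clause later.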
        if output_to_disk:
            df.to_hdf(output_path,
                      'data',
                      append=True,
                      format='table',
                      data_columns=['PatientID', 'VariableID'],
                      complevel=5,
                      complib='blosc:lz4')

        if (i + 1) % 50 == 0:
            print('%d / %d' % (i + 1, num_pid))

        gc.collect()
    print('Patients with cumulative observrec (urine) variables:', cnt_pid_urine)

    if tbl_name == 'observrec' and output_to_disk:
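        # Write the height records collected above to a separate per-chunk
        # height table next to the main output.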
        height_info_path = os.path.join(data_path, 'oor_removed', 'height')
        if not os.path.exists(height_info_path):
            os.mkdir(height_info_path)

        df_height = pd.concat(df_height, axis=0)
        df_height.reset_index(inplace=True, drop=True)

        df_height.to_hdf(os.path.join(
            height_info_path, 'height_%d_%d--%d.h5' %
            (index_chunk, np.min(pid_list), np.max(pid_list))),
                         'data',
                         data_columns=['PatientID'],
                         complevel=5,
                         complib='blosc:lz4')
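Because each chunk is written with format='table' and PatientID as a data column, single patients can be read back with a where clause; a minimal sketch, using a hypothetical file name and PatientID:

import pandas as pd

# Hypothetical chunk file and PatientID, following the naming scheme above.
df_pid = pd.read_hdf('labres_0_1--250.h5', 'data', where='PatientID=42')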