# Remaining command-line options; `parser` is created earlier in the script.
parser.add_argument('--index_chunk', type=int, default=None)
parser.add_argument('--output_to_disk', action='store_true')
args = parser.parse_args()
tbl_name = args.tbl_name
version = args.version
index_chunk = args.index_chunk
output_to_disk = args.output_to_disk

# 'pharmarec' is read from the 'datetime_fixed' stage, every other table from
# the 'oor_removed' stage; results go to the 'duplicates_removed' stage.
data_path = os.path.join(preproc_utils.datapath, '1a_hdf5_clean', version)
input_path = os.path.join(
    data_path,
    'datetime_fixed' if tbl_name == 'pharmarec' else 'oor_removed', tbl_name)
output_path = os.path.join(data_path, 'duplicates_removed', tbl_name)
# The output directory is only created when results are actually written.
if not os.path.exists(output_path) and output_to_disk:
    os.makedirs(output_path)

# Patient IDs whose ChunkfileIndex equals the requested chunk.
pid_chunkfile_index = preproc_utils.get_chunking_info(version=version)
pid_list = np.array(pid_chunkfile_index.index[
    pid_chunkfile_index.ChunkfileIndex == index_chunk])
# From here on `output_path` is the chunk's output FILE, named
# <table>_<chunk>_<min pid>--<max pid>.h5.
# NOTE(review): np.min/np.max raise on an empty pid_list, and '%d' fails when
# --index_chunk is left at its default of None -- confirm callers always pass it.
output_path = os.path.join(
    output_path, '%s_%d_%d--%d.h5' %
    (tbl_name, index_chunk, np.min(pid_list), np.max(pid_list)))

# Mapping of the variables of interest for this table.
# NOTE(review): the positional True presumably corresponds to `replace_name`
# -- confirm against preproc_utils.voi_id_name_mapping.
voi = preproc_utils.voi_id_name_mapping(tbl_name, True, version=version)

if tbl_name == 'pharmarec':
    for n, pid in enumerate(pid_list):
        # First file in the input directory whose name contains
        # '<table>_<chunk>_'; the directory is re-listed for every patient.
        filename = [
            f for f in os.listdir(input_path)
            if '%s_%d_' % (tbl_name, index_chunk) in f
        ][0]
        # Load only the rows of the current patient.
        # NOTE(review): the visible part of the loop ends here; the rest of
        # the per-patient processing is not shown.
        df = pd.read_hdf(os.path.join(input_path, filename),
                         where='PatientID=%d' % pid)
# Variable ID that the per-patient loop treats as "height" in 'observrec'.
_HEIGHT_VARIABLE_ID = 10000450

# Extra variable IDs that are always kept for the 'dervals' table.
_DERVALS_EXTRA_VARIABLE_IDS = frozenset(
    {830005420, 30015110, 30015010, 30015075, 30015080})


def _find_chunk_file(input_path, tbl_name, index_chunk):
    """Return the name of the first file in *input_path* for the given chunk.

    A file matches when its name contains ``'<tbl_name>_<index_chunk>_'``.

    Raises:
        FileNotFoundError: if no file in *input_path* matches.
    """
    pattern = '%s_%d_' % (tbl_name, index_chunk)
    candidates = [f for f in os.listdir(input_path) if pattern in f]
    if not candidates:
        raise FileNotFoundError(
            'No file containing %r found in %s' % (pattern, input_path))
    return candidates[0]


def _present_cumulative_ids(df, voi):
    """Return the sorted cumulative variable IDs that occur in *df*.

    A variable counts as cumulative when its ``VariableName`` in *voi*
    contains ``'cumul'`` or ``'/c'``.
    """
    is_cumulative = voi.VariableName.apply(
        lambda name: 'cumul' in name or '/c' in name)
    present = set(voi.index[is_cumulative]) & set(df.VariableID.tolist())
    return np.sort(list(present))


def _rescale_labres_values(df):
    """Multiply selected 'labres' values by 1000, modifying *df* in place.

    The original code labels this step "fixed troponin conversion":
    all values of variable 24000538 are rescaled, and values of variable
    24000806 are rescaled only when recorded on or before 2016-05-01.
    """
    always = df.VariableID == 24000538
    if always.any():
        # `.values` keeps the assignment positional, so it does not depend on
        # the index labels being unique.
        df.loc[always, 'Value'] = df.loc[always, 'Value'].values * 1000

    until_cutoff = np.logical_and(
        df.VariableID == 24000806,
        df.Datetime <= np.datetime64('2016-05-01'))
    if until_cutoff.any():
        df.loc[until_cutoff, 'Value'] = (
            df.loc[until_cutoff, 'Value'].values * 1000)


def main():
    """Clean one chunk of one table and optionally write the result to HDF5.

    Reads the module-level names ``tbl_name``, ``version``, ``index_chunk``
    and ``output_to_disk``.

    For every patient of the chunk, the 'datetime_fixed' records are loaded,
    restricted to the variables of interest, stripped of invalidated and
    identical records, post-processed per table ('labres', 'dervals',
    'observrec') and passed through ``remove_out_of_range``.  When
    ``output_to_disk`` is set, the result is appended to the chunk file under
    'oor_removed/<tbl_name>'; for 'observrec' the height records are written
    to a separate file under 'oor_removed/height'.

    Raises:
        ValueError: if no patient belongs to ``index_chunk``.
        FileNotFoundError: if the input directory has no file for the chunk.
    """
    data_path = os.path.join(preproc_utils.datapath, '1a_hdf5_clean', version)
    input_path = os.path.join(data_path, 'datetime_fixed', tbl_name)
    output_dir = os.path.join(data_path, 'oor_removed', tbl_name)
    if output_to_disk:
        # exist_ok avoids a failure when several chunk jobs create the
        # directory at the same time.
        os.makedirs(output_dir, exist_ok=True)

    pid_chunkfile_index = preproc_utils.get_chunking_info(version=version)
    pid_list = np.array(pid_chunkfile_index.index[
        pid_chunkfile_index.ChunkfileIndex == index_chunk])
    if len(pid_list) == 0:
        # np.min/np.max would raise a far less helpful ValueError below.
        raise ValueError(
            'No patients are assigned to chunk %s' % index_chunk)
    pid_min, pid_max = np.min(pid_list), np.max(pid_list)
    output_path = os.path.join(
        output_dir,
        '%s_%d_%d--%d.h5' % (tbl_name, index_chunk, pid_min, pid_max))

    # Table of the variables of interest, indexed by variable ID.
    voi = preproc_utils.voi_id_name_mapping(tbl_name,
                                            replace_name=True,
                                            version=version)
    if tbl_name == 'labres':
        voi['VariableName'] = voi.VariableName.apply(
            lambda x: x.replace("'", ""))
        voi.loc[24000737, 'VariableName'] = 'v-SO2'

    # NOTE: removal of zero values without clinical meaning is disabled (the
    # drop was commented out in the original code), so the selection is not
    # computed any more.  The disabled rule treated 0 as meaningful when
    # LowerBound < 0 and, for tables other than 'labres', also when
    # NormalValue == 0 or MetaVariableUnit mentions 'ordinal', 'categorical'
    # or 'yes/no'; 'dervals' and 'pharmarec' were exempt altogether.

    # The chunk file is the same for every patient, so look it up once.
    chunk_file = os.path.join(
        input_path, _find_chunk_file(input_path, tbl_name, index_chunk))

    df_idx_start = 0
    num_pid = len(pid_list)
    df_height = []
    cnt_pid_urine = 0
    for i, pid in enumerate(pid_list):
        df = pd.read_hdf(chunk_file, where='PatientID=%d' % pid, mode='r')
        if len(df) == 0:
            print('Patient', pid, 'have no data in %s' % tbl_name)
            continue

        # Harmonise the column names across tables.
        df.rename(columns={
            'PharmaID': 'VariableID',
            'GiveDose': 'Value',
            'DateTime': 'Datetime',
            'SampleTime': 'Datetime'
        },
                  inplace=True)

        # Select only variables of interest, plus the table-specific extras.
        vid_intersect = set(df.VariableID) & set(voi.index)
        if tbl_name == 'observrec':
            vid_intersect |= {_HEIGHT_VARIABLE_ID}
        elif tbl_name == 'dervals':
            vid_intersect |= _DERVALS_EXTRA_VARIABLE_IDS

        df.drop(df.index[~df.VariableID.isin(vid_intersect)], inplace=True)
        gc.collect()
        if len(df) == 0:
            print('Patient', pid, 'have no data of interest in %s' % tbl_name)
            continue

        # Remove records with status containing 2 (invalidated).
        df = remove_records_with_invalid_status(df, tbl_name)
        df.sort_values(by=['Datetime', 'VariableID', 'EnterTime'],
                       inplace=True)

        # Remove identical records.
        df.drop_duplicates(subset=['Datetime', 'VariableID', 'Value',
                                   'Status'],
                           inplace=True)

        if tbl_name == 'labres':
            monvals_svo2_path = os.path.join(
                data_path, 'datetime_fixed', 'monvals_svo2',
                'monvals_svo2_%d_%d--%d.h5' % (index_chunk, pid_min, pid_max))
            df = change_arterial_to_venous(df, voi, tbl_name, index_chunk,
                                           monvals_svo2_path)
            df.drop_duplicates(subset=['Datetime', 'VariableID', 'Value'],
                               inplace=True)
            _rescale_labres_values(df)

        elif tbl_name == 'dervals':
            cumulative_variable_ids = _present_cumulative_ids(df, voi)
            if len(cumulative_variable_ids) > 0:
                df = cumul_val_to_rate(df, cumulative_variable_ids)

        elif tbl_name == 'observrec':
            cumulative_variable_ids = _present_cumulative_ids(df, voi)
            if len(cumulative_variable_ids) > 0:
                cnt_pid_urine += 1
                df = cumul_val_to_rate(df, cumulative_variable_ids)
                assert df.Value.max() != float('Inf'), (
                    'cumul_val_to_rate produced an infinite value '
                    'for patient %s' % pid)
            # NOTE(review): height handling and the categorical-counter step
            # run for every observrec patient, not only for those with
            # cumulative variables -- confirm this nesting is the intended one.
            df = correct_weight_height(df)
            # Height is stored separately and removed from the main table.
            is_height = df.VariableID == _HEIGHT_VARIABLE_ID
            df_height.append(df[is_height].copy())
            df.drop(df.index[is_height], inplace=True)
            df = increase_categorical_counter_to_merge(df)

        df = remove_out_of_range(df, voi, tbl_name, index_chunk)
        df.drop(df.index[~df.VariableID.isin(vid_intersect)], inplace=True)

        # Give the rows a running index that continues across patients, so
        # the appended table has one consecutive index.
        df.set_index(np.arange(df_idx_start, df_idx_start + len(df)),
                     drop=True,
                     inplace=True)
        df_idx_start += len(df)

        if output_to_disk:
            df.to_hdf(output_path,
                      key='data',
                      append=True,
                      format='table',
                      data_columns=['PatientID', 'VariableID'],
                      complevel=5,
                      complib='blosc:lz4')

        if (i + 1) % 50 == 0:
            print('%d / %d' % (i + 1, num_pid))
        gc.collect()

    # Number of observrec patients that had cumulative variables converted.
    print(cnt_pid_urine)

    if tbl_name == 'observrec' and output_to_disk:
        if not df_height:
            # pd.concat raises ValueError on an empty list; this happens when
            # every patient of the chunk was skipped above.
            print('No height records collected for chunk %d; '
                  'height file not written' % index_chunk)
        else:
            height_info_path = os.path.join(data_path, 'oor_removed',
                                            'height')
            os.makedirs(height_info_path, exist_ok=True)
            height_table = pd.concat(df_height, axis=0)
            height_table.reset_index(inplace=True, drop=True)
            height_table.to_hdf(os.path.join(
                height_info_path,
                'height_%d_%d--%d.h5' % (index_chunk, pid_min, pid_max)),
                                key='data',
                                data_columns=['PatientID'],
                                complevel=5,
                                complib='blosc:lz4')