def main(args):
    train_path = os.path.join(args.subjects_path, 'train')
    test_path = os.path.join(args.subjects_path, 'test')

    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                train_path)
    elif not os.path.exists(args.plots_path):
        os.makedirs(args.plots_path)

    subject_directories_train = get_subject_dirs(train_path)
    subject_directories_test = get_subject_dirs(test_path)
    subject_directories = subject_directories_train + subject_directories_test

    with open(args.config) as f:
        config = json.load(f)
    variables = config['variables']

    # Store all data in a single dataframe
    complete_data_df = pd.DataFrame(columns=variables)

    # Per subject, store which variables have at least one value in the
    # time series (False means the variable was never recorded)
    subject_has_values_df = pd.DataFrame(columns=variables)

    for i, sd in enumerate(tqdm(subject_directories)):
        ts = pd.read_csv(os.path.join(sd, 'timeseries.csv'))
        ts = ts[variables]

        has_values_series = ts.notnull().any()
        subject_has_values_df = subject_has_values_df.append(
            has_values_series, ignore_index=True)
        complete_data_df = complete_data_df.append(ts)

    # Visualize the percentage of missing values per variable for all data
    ax = missingno.bar(complete_data_df,
                       color=(31 / 256, 119 / 256, 180 / 256))
    ax.figure.savefig(os.path.join(args.plots_path,
                                   'missing_data_bar_plot.pdf'),
                      format="pdf", bbox_inches='tight', pad_inches=0)

    # For each variable, visualize the percentage of subjects that have no
    # recorded measurement at all
    subject_has_values_df = subject_has_values_df.replace(False, np.nan)
    ax = missingno.bar(subject_has_values_df,
                       color=(31 / 256, 119 / 256, 180 / 256))
    ax.figure.savefig(os.path.join(args.plots_path,
                                   'no_variable_recording_per_subject.pdf'),
                      format="pdf", bbox_inches='tight', pad_inches=0)
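# A minimal sketch of the get_subject_dirs helper used throughout these
# scripts; the real implementation lives elsewhere in the repository. It
# assumes that subject directories are named by their numeric SUBJECT_ID.
import os


def get_subject_dirs(path):
    """Return the full paths of all numeric subject directories in `path`."""
    return [
        os.path.join(path, d) for d in os.listdir(path)
        if d.isdigit() and os.path.isdir(os.path.join(path, d))
    ]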
def main(args):
    subjects_path = args.subjects_path

    # Split the data set into training and test data
    train_dirs, test_dirs = split_data_set(subjects_path, args.train_perc)
    print(f'There are {len(train_dirs)} train directories '
          f'and {len(test_dirs)} test directories.')

    # Create train and test directories
    move_to_directory(subjects_path, train_dirs, 'train')
    move_to_directory(subjects_path, test_dirs, 'test')

    print('...split the training set into training and validation...')
    train_dirs, val_dirs = split_data_set(
        os.path.join(subjects_path, 'train'), args.val_perc,
        bin_size=9)  # larger bin size because less data
    test_dirs = get_subject_dirs(os.path.join(subjects_path, 'test'))
    print(f'There are {len(train_dirs)} train directories '
          f'and {len(val_dirs)} validation directories.')

    train_sub_path = os.path.join(subjects_path, 'training_subjects.txt')
    val_sub_path = os.path.join(subjects_path, 'validation_subjects.txt')
    test_sub_path = os.path.join(subjects_path, 'test_subjects.txt')

    print('...write the training, validation and test subjects to files...')
    with open(train_sub_path, 'w') as f:
        f.write('\n'.join(train_dirs))
    with open(val_sub_path, 'w') as f:
        f.write('\n'.join(val_dirs))
    with open(test_sub_path, 'w') as f:
        f.write('\n'.join(test_dirs))
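# A hedged sketch of what move_to_directory could look like: it moves a list
# of subject directories into a named sub-folder (e.g. 'train' or 'test') of
# subjects_path. The real helper may differ in details such as error handling.
import os
import shutil


def move_to_directory(subjects_path, subject_dirs, dir_name):
    """Move each directory in subject_dirs into subjects_path/dir_name."""
    target_path = os.path.join(subjects_path, dir_name)
    if not os.path.exists(target_path):
        os.makedirs(target_path)

    for sd in subject_dirs:
        shutil.move(sd, os.path.join(target_path, os.path.basename(sd)))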
def main(args):
    train_dirs = get_subject_dirs(args.train_path)

    config_f = args.config
    with open(config_f, 'r') as f:
        config = json.load(f)
    variables = config['variables']

    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool()

    # Create a listener so that it is safe to write to the config file
    watcher = pool.apply_async(listener, (config, q,))

    # Create worker processes
    jobs = []
    for variable in variables:
        job = pool.apply_async(get_normalization_stats_for_var,
                               (variable, train_dirs, config, q))
        jobs.append(job)

    # Collect results from the pool result queue
    for job in jobs:
        job.get()

    # Kill the listener once all jobs are done
    q.put('kill')
    pool.close()
    pool.join()
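# A minimal sketch of the listener process used above: it consumes
# (variable, statistics) messages from the queue and is the only process
# that writes to the config file, which avoids concurrent writes. The
# message format, the statistics layout and the config_path default are
# assumptions; the real listener lives elsewhere in the repository.
import json


def listener(config, q, config_path='config.json'):
    """Collect normalization statistics from the queue and persist them."""
    config.setdefault('normalization_statistics', {})

    while True:
        message = q.get()
        if message == 'kill':
            break

        # e.g. stats = {'MEAN': ..., 'STDEV': ...} -- assumed layout
        variable, stats = message
        config['normalization_statistics'][variable] = stats

        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)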
def main(args):
    with open(args.config) as f:
        config = json.load(f)
    normalization_statistics = config['normalization_statistics']
    variables = config['variables']

    train_dirs = get_subject_dirs(args.train_path)
    test_dirs = get_subject_dirs(args.test_path)
    all_dirs = train_dirs + test_dirs

    with mp.Pool() as pool:
        for _ in tqdm(pool.istarmap(normalize,
                                    zip(all_dirs,
                                        repeat(normalization_statistics),
                                        repeat(variables))),
                      total=len(all_dirs)):
            pass
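# Note: pool.istarmap is not a standard multiprocessing.Pool method; the
# repository presumably patches it in (a common recipe combining imap's
# lazy iteration with starmap's argument unpacking) so that tqdm can show
# per-task progress.
#
# A hedged sketch of the normalize worker: z-score normalization of the
# configured variables in each subject's timeseries.csv, using the
# statistics gathered in the previous step. The statistics layout
# ({'MEAN': ..., 'STDEV': ...}) is an assumption.
import os

import pandas as pd


def normalize(subject_dir, normalization_statistics, variables):
    """Normalize the variables of one subject's time series in place."""
    ts_path = os.path.join(subject_dir, 'timeseries.csv')
    ts = pd.read_csv(ts_path)

    for var in variables:
        mean = normalization_statistics[var]['MEAN']
        stdev = normalization_statistics[var]['STDEV']
        if stdev:
            ts[var] = (ts[var] - mean) / stdev

    ts.to_csv(ts_path, index=False)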
def split_data_set(data_dirs_path, split_perc=20, bin_size=4):
    """Split the data set into two parts (x and y)

    Args:
        data_dirs_path (str): Path to the data directories
        split_perc (int): Percentage of the data to be reserved for the
                          x-split
        bin_size (int): Minimum number of distinct target values per bin

    Returns:
        x_dirs (list): List of x-split directories
        y_dirs (list): List of y-split directories
    """
    data_dirs = get_subject_dirs(data_dirs_path)

    # Get two arrays: one of targets and one of the corresponding subjects
    targets = np.zeros(len(data_dirs))
    subjects = np.zeros(len(data_dirs))
    for i, sd in enumerate(data_dirs):
        df_ts = pd.read_csv(os.path.join(sd, 'timeseries.csv'))
        targets[i] = df_ts.LOS_HOURS.iloc[0]
        subject_id = [int(s) for s in sd.split('/') if s.isdigit()][-1]
        subjects[i] = subject_id

    # Define the bins for splitting
    sorted_targets = np.sort(targets)
    bins = [0]
    set_check = set()
    for t in sorted_targets:
        set_check.add(t)
        if len(set_check) > bin_size:
            bins.append(t)
            set_check = set()
    bins.append(max(targets) + 1)

    # Bin the targets
    targets_binned = np.digitize(targets, bins)

    # Split the subjects list into a list of x-subjects and a
    # list of y-subjects, in a stratified manner
    subjects_y, subjects_x, _, _ = train_test_split(
        subjects, targets, test_size=split_perc / 100, random_state=42,
        stratify=targets_binned, shuffle=True)

    x_dirs = [f'{data_dirs_path}/{int(i)}' for i in subjects_x]
    y_dirs = [f'{data_dirs_path}/{int(i)}' for i in subjects_y]

    return x_dirs, y_dirs
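# A toy illustration of the binning step in split_data_set: a new bin edge
# is placed once a bin has collected more than bin_size distinct target
# values, after which np.digitize assigns every target to a bin so the split
# can be stratified. The targets below are made up for illustration only.
import numpy as np


def _binning_example(bin_size=4):
    """Show how LOS targets are bucketed before stratified splitting."""
    toy_targets = np.array([1, 2, 2, 3, 5, 8, 13, 21, 34, 55])

    bins = [0]
    set_check = set()
    for t in np.sort(toy_targets):
        set_check.add(t)
        if len(set_check) > bin_size:
            bins.append(t)
            set_check = set()
    bins.append(toy_targets.max() + 1)

    # bins == [0, 8, 56] -> digitize gives [1 1 1 1 1 2 2 2 2 2]
    return np.digitize(toy_targets, bins)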
def main(args):
    subjects_path = args.subjects_path

    with open(args.config) as f:
        config = json.load(f)
    variables = config['variables']

    subject_dirs = get_subject_dirs(subjects_path)
    tot_subjects = len(subject_dirs)

    with mp.Pool() as pool:
        for _ in tqdm(pool.istarmap(create_timeseries,
                                    zip(repeat(variables), subject_dirs)),
                      total=tot_subjects):
            pass
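# A hedged sketch of the create_timeseries worker: it pivots one subject's
# cleaned events into an hourly grid of the configured variables and writes
# timeseries.csv. The hourly resolution, the 'last value per hour' rule and
# the column names are assumptions; the real implementation (including the
# LOS and target columns used later) lives elsewhere in the repository.
import os

import pandas as pd


def create_timeseries(variables, subject_dir):
    """Pivot one subject's events into an hourly variable matrix."""
    df_events = pd.read_csv(os.path.join(subject_dir, 'events.csv'),
                            parse_dates=['CHARTTIME'])

    # Hours since the first charted event, rounded down
    first_chart = df_events.CHARTTIME.min()
    df_events['HOUR'] = ((df_events.CHARTTIME - first_chart)
                         .dt.total_seconds() // 3600).astype(int)

    # One row per hour, one column per variable; keep the last value per hour
    ts = (df_events.pivot_table(index='HOUR', columns='VARIABLE',
                                values='VALUE', aggfunc='last')
          .reindex(columns=variables))

    ts.to_csv(os.path.join(subject_dir, 'timeseries.csv'))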
def main(args):
    impute_method = args.impute_method

    with open(args.config) as f:
        config = json.load(f)
    imputation_values = config['imputation_values']

    print(f'Starting {impute_method} imputing with normal values. '
          f'Binary imputation mask: {args.mask}')

    subject_dirs = get_subject_dirs(args.subjects_path)

    with mp.Pool() as pool:
        for _ in tqdm(pool.istarmap(impute,
                                    zip(subject_dirs,
                                        repeat(imputation_values),
                                        repeat(impute_method),
                                        repeat(args.mask))),
                      total=len(subject_dirs)):
            pass
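# A hedged sketch of the impute worker: forward-fill imputation with
# clinically "normal" fallback values from the config, plus an optional
# binary mask marking which entries were actually observed. The method name
# 'ffill' and the '<VAR>_MASK' column naming are assumptions.
import os

import pandas as pd


def impute(subject_dir, imputation_values, impute_method='ffill', mask=False):
    """Impute missing values in one subject's timeseries.csv."""
    ts_path = os.path.join(subject_dir, 'timeseries.csv')
    ts = pd.read_csv(ts_path)

    variables = list(imputation_values.keys())

    if mask:
        # 1 where a value was measured, 0 where it will be imputed
        for var in variables:
            ts[f'{var}_MASK'] = ts[var].notnull().astype(int)

    if impute_method == 'ffill':
        ts[variables] = ts[variables].ffill()

    # Whatever is still missing gets the configured normal value
    ts[variables] = ts[variables].fillna(value=imputation_values)

    ts.to_csv(ts_path, index=False)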
def main(args):
    train_path = os.path.join(args.subjects_path, 'train')
    test_path = os.path.join(args.subjects_path, 'test')

    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                train_path)
    elif not os.path.exists(args.plots_path):
        os.makedirs(args.plots_path)

    subject_directories_train = get_subject_dirs(train_path)
    subject_directories_test = get_subject_dirs(test_path)
    subject_directories = subject_directories_train + subject_directories_test

    los_hours, los_remaining_hours, los_targets_coarse, \
        los_remaining_targets_coarse, los_targets_fine, \
        los_remaining_targets_fine = [], [], [], [], [], []

    for i, sd in enumerate(tqdm(subject_directories)):
        # Read the timeseries dataframe
        ts = pd.read_csv(os.path.join(sd, 'timeseries.csv'))

        # Find the total length of the stay in hours
        los_hours.append(ts.LOS_HOURS.iloc[0])

        # Compute the coarse target bucket for the complete stay
        los_targets_coarse.append(ts.TARGET_COARSE.iloc[0])

        # Compute the fine target bucket for the complete stay
        los_targets_fine.append(ts.TARGET_FINE.iloc[0])

        # Find all the intermediate remaining lengths of stay in hours
        los_remaining_hours += ts.LOS_HOURS.to_list()

        # Obtain the coarse target bucket for each intermediate time-step
        los_remaining_targets_coarse += ts.TARGET_COARSE.to_list()

        # Obtain the fine target bucket for each intermediate time-step
        los_remaining_targets_fine += ts.TARGET_FINE.to_list()

    # Only keep values below the 95th percentile of los_hours and
    # los_remaining_hours
    los_perc = np.percentile(los_hours, 95)
    los_remaining_perc = np.percentile(los_remaining_hours, 95)
    los_hours = list(filter(lambda x: x < los_perc, los_hours))
    los_remaining_hours = list(
        filter(lambda x: x < los_remaining_perc, los_remaining_hours))

    # X-ticks for the coarse buckets plot
    xticks_coarse = ['(0, 2)', '(2, 8)', '8+']

    # X-ticks for the fine buckets plot
    xticks_fine = [
        '(0, 1)', '(1, 2)', '(2, 3)', '(3, 4)', '(4, 5)', '(5, 6)', '(6, 7)',
        '(7, 8)', '(8, 14)', '14+'
    ]

    # Create the coarse buckets histogram
    create_histogram(
        input_data=[los_targets_coarse, los_remaining_targets_coarse],
        xlabel='Buckets',
        ylabel='Frequency',
        rwidth=0.5,
        legend=['LOS', 'Remaining LOS'],
        xticks=xticks_coarse,
        save_plot=os.path.join(
            args.plots_path,
            'normalized_frequency_of_the_target_buckets_coarse.pdf'))

    # Create the fine buckets histogram
    create_histogram(
        input_data=[los_targets_fine, los_remaining_targets_fine],
        xlabel='Buckets',
        ylabel='Frequency',
        rwidth=0.5,
        legend=['LOS', 'Remaining LOS'],
        xticks=xticks_fine,
        save_plot=os.path.join(
            args.plots_path,
            'normalized_frequency_of_the_target_buckets_fine.pdf'))

    # Create the LOS hours histogram
    create_histogram(
        input_data=[los_hours, los_remaining_hours],
        xlabel='Hours',
        ylabel='Frequency',
        rwidth=1,
        legend=['LOS', 'Remaining LOS'],
        save_plot=os.path.join(
            args.plots_path,
            'normalized_frequency_of_the_LOS_in_hours.pdf'))
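# A minimal sketch of the create_histogram helper, matching the keyword
# arguments used above: a density-normalized matplotlib histogram for one or
# more series, with an optional set of categorical x-tick labels. The bin
# handling and styling are assumptions; the real helper is defined elsewhere.
import matplotlib.pyplot as plt
import numpy as np


def create_histogram(input_data, xlabel='', ylabel='', rwidth=1.0,
                     legend=None, xticks=None, save_plot=None):
    """Plot a normalized histogram of one or more data series."""
    fig, ax = plt.subplots()

    if xticks is not None:
        # Categorical (bucketed) targets: one bin per bucket
        bins = np.arange(len(xticks) + 1) - 0.5
    else:
        bins = 'auto'

    ax.hist(input_data, bins=bins, density=True, rwidth=rwidth, label=legend)

    if xticks is not None:
        ax.set_xticks(range(len(xticks)))
        ax.set_xticklabels(xticks, rotation=45)

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if legend:
        ax.legend()

    if save_plot:
        fig.savefig(save_plot, format='pdf', bbox_inches='tight',
                    pad_inches=0)
    plt.close(fig)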
def main(args):
    subjects_path, verbose = args.subjects_path, args.verbose

    with open(args.config) as f:
        config = json.load(f)
    vars_to_itemid = config['vars_to_itemid']
    valid_ranges = config['valid_variable_ranges']

    subject_dirs = get_subject_dirs(subjects_path)

    if verbose:
        print("Filtering and cleaning selected variables...")

    tot_subjects = len(subject_dirs)
    removed_subjects, tot_events, tot_events_kept = 0, 0, 0

    # Create item_id to var dictionary based on vars_to_itemid
    item_id_to_vars = {}
    for var, item_ids in vars_to_itemid.items():
        for item_id in item_ids:
            item_id_to_vars[item_id] = var

    # Create a list of the ITEMIDs to keep
    itemids_to_keep = list(item_id_to_vars.keys())

    # Create a pandas dataframe based on item_id_to_vars
    df_item_id = pd.DataFrame(item_id_to_vars.items(),
                              columns=['ITEMID', 'VARIABLE'])

    # Initialize the variable counts dictionary
    variable_counts = {}
    for var in vars_to_itemid.keys():
        variable_counts[var] = {
            'VALUES': [],
            'SUBJECTS': 0,
            'INVALID_VALUES': 0
        }

    for i, sd in enumerate(tqdm(subject_dirs)):
        # Read the events dataframe
        df_events = pd.read_csv(os.path.join(sd, 'events.csv'))
        tot_events += len(df_events)

        # Filter the dataframe on the variables that we want to keep
        df_events = pd.merge(df_events, df_item_id, how='inner', on='ITEMID')
        df_events = df_events[df_events.VALUE.notnull()]

        # Clean variables
        df_events = clean_variables(df_events, cleaning_functions)

        # Clean CHARTTIME -- we know from the format that the length should
        # always be 19
        df_events = df_events[df_events.CHARTTIME.str.len() == 19]

        # Remove invalid values
        df_events, variable_counts = remove_invalid_values(
            df_events, valid_ranges, variable_counts)

        # Sort on CHARTTIME
        df_events = df_events.sort_values(by='CHARTTIME')
        tot_events_kept += len(df_events)

        # Write df_events to CSV
        if not df_events.empty:
            df_events.to_csv(os.path.join(sd, 'events.csv'), index=False)
        else:
            remove_subject_dir(os.path.join(sd))
            removed_subjects += 1

    # Write the per-variable statistics to the output file
    with open(args.output_path, 'w') as wf:
        csv_header = [
            'VARIABLE', 'COUNT', 'SUBJECTS', 'INVALID_VALUES', 'MIN', 'MAX',
            'MEAN', 'MEDIAN'
        ]
        wf.write(','.join(csv_header) + '\n')
        csv_writer = csv.DictWriter(wf, fieldnames=csv_header,
                                    quoting=csv.QUOTE_MINIMAL)

        for key, val in variable_counts.items():
            results = {
                'VARIABLE': key,
                'COUNT': len(val['VALUES']),
                'SUBJECTS': val['SUBJECTS'],
                'INVALID_VALUES': val['INVALID_VALUES'],
                'MIN': np.min(val['VALUES']),
                'MAX': np.max(val['VALUES']),
                'MEAN': np.mean(val['VALUES']),
                'MEDIAN': np.median(val['VALUES'])
            }
            csv_writer.writerow(results)

    if verbose:
        print(f'Of the initial {tot_subjects} subjects, '
              f'{tot_subjects-removed_subjects} remain that have valid '
              f'variables of interest associated with them.\nOf the '
              f'initial {tot_events} events, {tot_events_kept} remain '
              f'which are variables of interest.')

        total_invalid_values = 0
        for key, val in variable_counts.items():
            total_invalid_values += val['INVALID_VALUES']
        print(f'The total number of invalid values is: '
              f'{total_invalid_values}')
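# A hedged sketch of remove_invalid_values as it is used above: it drops
# events whose numeric VALUE falls outside the configured valid range for
# its VARIABLE and updates the running per-variable statistics. The
# {'LOW': ..., 'HIGH': ...} layout of valid_ranges is an assumption.
import pandas as pd


def remove_invalid_values(df_events, valid_ranges, variable_counts):
    """Filter out-of-range values and track per-variable statistics."""
    keep_indices = []

    for idx, row in df_events.iterrows():
        var = row.VARIABLE
        value = pd.to_numeric(row.VALUE, errors='coerce')
        low, high = valid_ranges[var]['LOW'], valid_ranges[var]['HIGH']

        if pd.notnull(value) and low <= value <= high:
            keep_indices.append(idx)
            variable_counts[var]['VALUES'].append(value)
        else:
            variable_counts[var]['INVALID_VALUES'] += 1

    df_events = df_events.loc[keep_indices]

    # Count each subject once per variable it has at least one valid value for
    for var in df_events.VARIABLE.unique():
        variable_counts[var]['SUBJECTS'] += 1

    return df_events, variable_counts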
def main(args):
    mimic_iii_path, output_path = args.input_path, args.output_path

    v_print = print if args.verbose else lambda *a, **k: None

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    reader = MimicNICUReaders(mimic_iii_path, args.verbose)

    df = reader.read_admissions_table()
    df_icu = reader.read_icustays_table()
    df_pat = reader.read_patients_table()
    df_lab = reader.read_labevents_table()

    df = df_icu.merge(df, how='inner', on=['SUBJECT_ID', 'HADM_ID'])
    v_print(f'Filtered NICU admissions -- with admission '
            f'information: {df.shape[0]}')

    df = df.merge(df_pat, how='inner', on='SUBJECT_ID')
    v_print(f'Filtered NICU admissions -- with patient information: '
            f'{df.shape[0]}')

    df = filter_on_newborns(df)
    v_print(f'Filtered NICU admissions -- newborn only {df.shape[0]}')

    df = df[df.SUBJECT_ID.isin(df_lab.SUBJECT_ID)]
    v_print(f'Filtered NICU admissions -- with associated '
            f'lab events: {df.shape[0]}')

    df_notes = reader.read_noteevents_table()

    # Filter df_notes on subjects and admissions in df
    df_notes = df_notes[df_notes.SUBJECT_ID.isin(df.SUBJECT_ID)]
    df_notes = df_notes[df_notes.HADM_ID.isin(df.HADM_ID)]

    # Filter on subjects that have notes associated with them
    df = df[df.SUBJECT_ID.isin(df_notes.SUBJECT_ID)]
    v_print(f'Filtered NICU admissions -- with associated '
            f'notes: {df.shape[0]}')

    v_print('...extract GA from notes and remove admissions with a capacity '
            'related transfer...')
    df_ga, cap_trans_set = process_notes(df_notes, reg_exps)

    df = df.merge(df_ga, how='inner', on='SUBJECT_ID')
    v_print(f'Filtered NICU admissions -- with GA: {df.shape[0]}')

    # Filter out admissions with capacity-related transfers
    df = df[~df.SUBJECT_ID.isin(cap_trans_set)]
    v_print(f'Filtered NICU admissions -- without capacity '
            f'related transfers: {df.shape[0]}')

    v_print(f'{df.HOSPITAL_EXPIRE_FLAG.sum()}/{df.shape[0]} newborns in df '
            'died during their NICU admission.')

    v_print('...split admissions by subject...')
    tot_nb_subjects = len(df.SUBJECT_ID.unique())

    # Write admission information to a directory per subject
    subjects_to_keep = set()
    for i, (ix, row) in enumerate(tqdm(df.iterrows(), total=df.shape[0])):
        subject_f = os.path.join(output_path, str(row.SUBJECT_ID))
        subjects_to_keep.add(row.SUBJECT_ID)

        if not os.path.exists(subject_f):
            os.makedirs(subject_f)

        df.loc[df.SUBJECT_ID == row.SUBJECT_ID].to_csv(
            os.path.join(subject_f, 'stay.csv'), index=False)

    # Read and split MIMIC-III event tables per subject,
    # using multiprocessing to read the tables simultaneously
    table_names = ['chartevents', 'labevents', 'noteevents']
    with mp.Pool() as p:
        p.starmap(
            read_and_split_table_by_subject,
            zip(repeat(mimic_iii_path), table_names, repeat(output_path),
                repeat(subjects_to_keep), repeat(args.verbose),
                range(len(table_names))))

    # Validate the events and notes
    subject_directories = get_subject_dirs(output_path)
    validate_events_and_notes(subject_directories)
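# A hedged sketch of filter_on_newborns: MIMIC-III marks newborn hospital
# admissions with ADMISSION_TYPE == 'NEWBORN', so the most direct filter is
# on that column. The real helper may apply additional criteria (e.g. on
# age at admission), so treat this as an illustration only.
def filter_on_newborns(df):
    """Keep only admissions recorded as newborn admissions."""
    return df[df.ADMISSION_TYPE == 'NEWBORN']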