def build_cohort(params: HyperParams, df_features: DataFrame, datafile='data/fulldata.npy'):
    df_cohort = load_labels(params)
    # df_cohort = load_dataframe('df_cohort')

    # Join the cohort on the features
    df_full_data = df_cohort.set_index(['hadm_id']).join(
        df_features.set_index(['hadm_id']), how='inner')
    df_full_data = set_target_feature_name(df_full_data)
    print(f"cohort dataset: {df_full_data.shape}")
    write_dataframe(df_full_data, 'df_full_data')
    # df_full_data = load_dataframe('df_full_data')

    df_temp = df_full_data.copy()
    if 'hadm_id' in df_temp.columns:
        df_temp = df_temp.drop(columns='hadm_id')
    np_fulldata = df_temp.to_numpy()

    # Save to a file
    np.save(datafile, np_fulldata)
    print(f"cohort data saved to {datafile}")
    return df_full_data

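# Usage sketch for build_cohort (illustrative only; assumes the features
# dataframe comes from featues_datasets_all_patients.run, as in
# build_normal_dataframe further down):
#
#   params = HyperParams()
#   df_features = featues_datasets_all_patients.run(params, binning_numerics=True)
#   df_cohort = build_cohort(params, df_features, datafile='data/fulldata.npy')
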
def load_bacteria_labels(params):
    df_cohort = cohort.query_esbl_bacteria_label(params.observation_window_hours)
    df_cohort = df_cohort[['hadm_id', 'resistant_label']]
    print(f"df_labels: {df_cohort.shape}")
    write_dataframe(df_cohort, 'df_cohort')
    return df_cohort

def get_lab_flags(df_lab_events, binning_numerics):
    df_lab_flags = pivot_flags_to_columns(df_lab_events, binning_numerics)
    print(f"df_lab_flags: {df_lab_flags.shape}")
    write_dataframe(df_lab_flags, 'df_lab_flags')
    # df_lab_flags = load_dataframe('df_lab_flags')
    # lab_flags_feature_names = df_lab_flags.columns.tolist()
    return df_lab_flags

def load_labels(params):
    df_cohort = cohort.query_esbl_pts(params.observation_window_hours)
    df_cohort = cohort.remove_dups(df_cohort)
    df_cohort = df_cohort[['hadm_id', 'RESISTANT_YN']]
    print(f"df_labels: {df_cohort.shape}")
    write_dataframe(df_cohort, 'df_cohort')
    return df_cohort

def join_static_and_lab_data(df_lab, df_static_data):
    df_lab = df_lab.set_index(['hadm_id'])
    df_static_data = df_static_data.set_index(['hadm_id'])
    # Join on index hadm_id; the inner join keeps only admissions present
    # in both the lab and static frames
    df_dataset_unprocessed = df_lab.join(df_static_data, how='inner')
    print(f"join_static_and_lab_data: {df_dataset_unprocessed.shape}")
    write_dataframe(df_dataset_unprocessed, 'join_static_and_lab_data')
    # df_dataset_unprocessed = load_dataframe('join_static_and_lab_data')
    return df_dataset_unprocessed

def load_static_features(view_name_all_pts_within_observation_window):
    df_static_data = create_dataset.static_data(
        hadm_ids_table=view_name_all_pts_within_observation_window)
    df_static_data = df_static_data.drop(columns=['admittime'])
    static_feature_names = df_static_data.columns.tolist()
    # process_static_data is assumed to clean df_static_data in place
    # (its return value is not used here)
    process_static_data(df_static_data)
    write_dataframe(df_static_data, 'df_static_data')
    # df_static_data = load_dataframe('df_static_data')
    # static_feature_names = df_static_data.columns.tolist()
    return df_static_data

def build_autoencoded_data_matrix(numpy_output_file='autoencoded_fulldata.npy',
                                  params=HyperParams()):
    # 1. Build the all-features dataset, for all 54k admissions
    df_final_dataset_binned = featues_datasets_all_patients.run(
        params,
        binning_numerics=True,
        create_patients_list_view=True,
        create_lab_events=True)
    print(f"Created full features dataset: {df_final_dataset_binned.shape}")
    io.write_dataframe(df_final_dataset_binned, 'df_final_dataset_binned')
    df_final_dataset_binned = io.load_dataframe('df_final_dataset_binned')

    # Write AutoEncoder training data to a numpy file
    ae_training_datafile_name = 'autoencoder_training_data.npy'
    np_training_datafile = config.DATA_DIR + '/' + ae_training_datafile_name
    print(f"Writing AutoEncoder training data to {np_training_datafile}")
    featues_datasets_all_patients.save_auto_encoder_training_data(
        df_final_dataset_binned, target_datafile=np_training_datafile)

    # 2. Train the AutoEncoder
    dataset = TheDataSet(datafile=np_training_datafile)
    print(f"dataset length = {len(dataset)} num features = {dataset.num_features()}")

    from embeddings.autoencoder import Autoencoder
    from embeddings.train import train, plot_loss
    model = Autoencoder(num_features=dataset.num_features())
    print(model)
    max_epochs = params.encoder_training_epochs
    outputs, losses = train(model, dataset=dataset, num_epochs=max_epochs,
                            batch_size=512, learning_rate=1e-3,
                            denoising=True, denoise_p=0.1)
    io.write_serialized_model(model, 'autoencoder')
    print(f"Trained AutoEncoder. Training Data Loss Reached: {losses[-1]}")
    plot_loss(losses)
    model = io.load_serialized_model('autoencoder')

    # 3. Build a labeled cohort
    np_cohort_data_file = config.DATA_DIR + '/' + 'raw_cohort_data.npy'
    df_cohort = build_cohort_dataset.build_cohort(
        params, df_final_dataset_binned, np_cohort_data_file)
    print(f"Created cohort dataset: {df_cohort.shape}")

    # 4. Encode the cohort using the trained AutoEncoder
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)  # keep the model on the same device as the inputs
    cohort_dataset = TheDataSet(datafile=np_cohort_data_file)
    data_loader = torch.utils.data.DataLoader(cohort_dataset, batch_size=1, shuffle=False)
    rows = []
    for X, y in data_loader:
        X = X.to(device)
        y = y.to(device)
        row = model.encoder(X.float())
        # Re-attach the label as the last column of the encoded row
        row = torch.cat([row.reshape(1, -1), y.reshape(1, -1).float()], dim=1)
        rows.append(row)
    encoded_data = torch.cat(rows, dim=0)
    np_labeled_data = encoded_data.detach().to('cpu').numpy()

    numpy_output_file = config.DATA_DIR + '/' + numpy_output_file
    print(f"Writing cohort matrix to {numpy_output_file}")
    np.save(numpy_output_file, np_labeled_data)
    print(f"Created cohort matrix: {np_labeled_data.shape}")
    return np_labeled_data

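# For orientation: a minimal sketch of the interface TheDataSet is assumed to
# expose (the real class is imported from elsewhere in the project; this
# sketch is illustrative, not the actual implementation). It wraps a saved
# .npy matrix, with the label in the last column and features in the rest,
# which matches how build_cohort_dataset.build_cohort writes the file and how
# the encoding loop above unpacks (X, y) pairs.
class _TheDataSetSketch(torch.utils.data.Dataset):
    def __init__(self, datafile):
        self.data = np.load(datafile)

    def __len__(self):
        return len(self.data)

    def num_features(self):
        # All columns except the trailing label column
        return self.data.shape[1] - 1

    def __getitem__(self, idx):
        row = self.data[idx]
        return row[:-1], row[-1]
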
def load_lab_events(view_name_hadm_ids):
    df_lab_events = create_dataset.lab_events(view_name_hadm_ids)
    df_lab_events = df_lab_events.dropna(subset=['value'])
    # Report the distribution of abnormal flags: 'abnormal' and 'delta' both
    # mark an abnormal result; a missing flag means a normal result.
    # (The original expression computed these counts without printing or
    # assigning them, so it had no effect.)
    print(df_lab_events['flag'].fillna('False').map(
        {'abnormal': True, 'delta': True, 'False': False}).value_counts())
    print('lab events before selection: ', df_lab_events.shape)
    df_lab_events = keep_last_labtest_instance(df_lab_events)
    print('lab events after selection: ', df_lab_events.shape)
    write_dataframe(df_lab_events, 'df_lab_events')
    # df_lab_events = load_dataframe('df_lab_events')
    return df_lab_events

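def _flag_mapping_demo():
    # Illustrative only (toy data, not project data): how the flag
    # normalization in load_lab_events treats each value.
    flags = pd.Series(['abnormal', None, 'delta'])
    return flags.fillna('False').map(
        {'abnormal': True, 'delta': True, 'False': False})
    # -> True, False, True
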
def one_hot_encode_categorical(df_dataset_unprocessed):
    categorical_cols = df_dataset_unprocessed.select_dtypes('object').columns.tolist()
    df_dataset_processed = pd.get_dummies(df_dataset_unprocessed,
                                          columns=categorical_cols,
                                          dummy_na=True, drop_first=True)
    # fillna returns a new frame, so the result must be assigned back
    # (the original call discarded it)
    df_dataset_processed = df_dataset_processed.fillna(0)
    print(f"df_dataset_processed: {df_dataset_processed.shape}")
    write_dataframe(df_dataset_processed, 'df_dataset_processed')
    # df_dataset_processed = load_dataframe('df_dataset_processed')
    return df_dataset_processed

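def _one_hot_demo():
    # Illustrative only (toy data, not project data): with dummy_na=True and
    # drop_first=True, a 'sex' column of ['M', 'F', None] becomes two
    # indicator columns, 'sex_M' and 'sex_nan'; the redundant 'sex_F' level
    # is dropped to avoid collinearity.
    toy = pd.DataFrame({'sex': ['M', 'F', None]})
    return pd.get_dummies(toy, columns=['sex'], dummy_na=True, drop_first=True)
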
def build_normal_dataframe(numpy_output_file='fulldata.npy',
                           dataframe_output_file='df_cohort'):
    params = HyperParams()
    # 1. Build the all-features dataset, for all 54k admissions
    df_final_dataset = featues_datasets_all_patients.run(
        params,
        binning_numerics=False,
        create_patients_list_view=True,
        create_lab_events=True)
    print(f"Created full features dataset: {df_final_dataset.shape}")

    # 2. Build a labeled cohort
    np_datafile = config.DATA_DIR + '/' + numpy_output_file
    df_cohort = build_cohort_dataset.build_cohort(params, df_final_dataset, np_datafile)
    print(f"Created cohort dataset: {df_cohort.shape}")
    io.write_dataframe(df_cohort, dataframe_output_file)
    return df_cohort

def get_lab_results(df_lab_events):
    df_lab_results = pivot_labtests_to_columns(df_lab_events)
    fix_lab_results_categories(df_lab_results)
    # Drop a hand-picked set of lab item IDs
    df_lab_results = df_lab_results.drop(columns=['50827', '50856', '51100', '51482', '50981'])
    print(f"shape before dropping sparse columns: {df_lab_results.shape}")
    df_lab_results = drop_sparse_columns(
        df_lab_results,
        columns=df_lab_results.drop(columns=['hadm_id']).columns.tolist(),
        max_sparsity_to_keep=0.95)
    print(f"shape after dropping sparse columns: {df_lab_results.shape}")
    numeric, categorical, weird = detect_data_types(df_lab_results.drop(columns=['hadm_id']))
    set_numeric_columns(df_lab_results, numeric)
    print(f"df_lab_results: {df_lab_results.shape}")
    write_dataframe(df_lab_results, 'df_lab_results')
    # df_lab_results = load_dataframe('df_lab_results')
    # lab_results_feature_names = df_lab_results.columns.tolist()
    return df_lab_results

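# drop_sparse_columns is a project helper defined elsewhere; the sketch below
# is an assumption about its semantics, kept here for reference only: drop
# every listed column whose fraction of missing values exceeds the threshold.
def _drop_sparse_columns_sketch(df, columns, max_sparsity_to_keep):
    sparsity = df[columns].isna().mean()
    too_sparse = sparsity[sparsity > max_sparsity_to_keep].index.tolist()
    return df.drop(columns=too_sparse)
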
def build_cohort_bact(params: HyperParams, df_features: DataFrame):
    df_cohort = load_bacteria_labels(params)
    # df_cohort = load_dataframe('df_cohort')

    # Join the cohort on the features
    df_full_data = df_cohort.set_index(['hadm_id']).join(
        df_features.set_index(['hadm_id']), how='inner')
    df_full_data = set_target_feature_name(df_full_data, 'resistant_label', 'y')
    print(f"cohort dataset: {df_full_data.shape}")
    write_dataframe(df_full_data, 'df_full_data')
    # df_full_data = load_dataframe('df_full_data')
    np_fulldata = df_full_data.to_numpy()

    # Save to a file
    datafile = 'data/fulldata.npy'
    np.save(datafile, np_fulldata)
    print(f"cohort data saved to {datafile}")
    return df_full_data

def run(params: HyperParams, binning_numerics=False,
        create_patients_list_view=True, create_lab_events=True):
    """
    Build feature datasets for ALL admissions that were still hospitalized
    by the end of the observation window.
    Returns a dataframe, which is also persisted as "df_final_dataset".
    """
    # Create the list of patients within the max observation window
    if create_patients_list_view:
        df_all_pts_within_observation_window, view_name_all_pts_within_observation_window = \
            cohort.query_all_pts_within_observation_window(params.observation_window_hours)
        write_dataframe(df_all_pts_within_observation_window,
                        'df_all_pts_within_observation_window')
    else:
        view_name_all_pts_within_observation_window = \
            f'default.all_pts_{params.observation_window_hours}_hours'
        df_all_pts_within_observation_window = \
            load_dataframe('df_all_pts_within_observation_window')

    # Generate features for all patients (under the observation window)

    # Static features:
    df_static_data = load_static_features(view_name_all_pts_within_observation_window)
    # Antibiotics prescriptions:
    onehotrx_df = load_antibiotics(view_name_all_pts_within_observation_window)
    # Previous admissions:
    admits_df = load_previous_admissions(view_name_all_pts_within_observation_window,
                                         params, binning_numerics)
    # Open wounds diagnoses:
    wounds_df = load_open_wounds(view_name_all_pts_within_observation_window)
    # Intubation procedures:
    df_intubation = load_intubation_procedures(view_name_all_pts_within_observation_window)
    # Note events:
    notes = load_notes(view_name_all_pts_within_observation_window)
    df_antibiotics_history = load_antibiotics_history(notes)

    # Lab events
    if create_lab_events:
        df_lab_events = load_lab_events(view_name_all_pts_within_observation_window)
    else:
        df_lab_events = load_dataframe('df_lab_events')

    # Lab results and flags
    df_lab_results = get_lab_results(df_lab_events)
    df_lab_flags = get_lab_flags(df_lab_events, binning_numerics)

    # Join lab results with lab flags
    df_lab = df_lab_results.merge(df_lab_flags, how='left', on=['hadm_id'])
    # Sort columns by lab test name, keeping hadm_id as the first column
    df_lab = df_lab.set_index('hadm_id')
    df_lab = df_lab.reindex(sorted(df_lab.columns), axis=1).reset_index()

    df_dataset_unprocessed = join_static_and_lab_data(df_lab, df_static_data)

    if binning_numerics:
        # Numeric values: bin
        df_dataset_unprocessed = clean_and_bin_numeric_values(df_dataset_unprocessed, params)
    else:
        # Numeric values: clean and standardize
        df_dataset_unprocessed = clean_and_standardize_numeric_values(df_dataset_unprocessed)

    # Join on antibiotics, previous admissions, wounds, intubation
    # and antibiotics history
    df_dataset_processed = df_dataset_unprocessed
    df_dataset_processed = pd.merge(df_dataset_processed, onehotrx_df, on='hadm_id', how='left')
    df_dataset_processed = pd.merge(df_dataset_processed, admits_df, on='hadm_id', how='left')
    df_dataset_processed = pd.merge(df_dataset_processed, wounds_df, on='hadm_id', how='left')
    df_dataset_processed = pd.merge(df_dataset_processed, df_intubation, on='hadm_id', how='left')
    df_dataset_processed = pd.merge(df_dataset_processed, df_antibiotics_history, on='hadm_id', how='left')

    # Categorical values: one-hot encode
    df_dataset_processed = one_hot_encode_categorical(df_dataset_processed)
    df_dataset_processed.fillna(0, inplace=True)

    df_final_dataset = df_dataset_processed
    print(f"df_final_dataset: {df_final_dataset.shape}")
    write_dataframe(df_final_dataset, 'df_final_dataset')
    print("dataset data saved as 'df_final_dataset'")
    # df_final_dataset = load_dataframe('df_final_dataset')
    save_auto_encoder_training_data(df_final_dataset)
    return df_final_dataset

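# End-to-end usage sketch (illustrative only; mirrors build_normal_dataframe
# and build_autoencoded_data_matrix above):
#
#   params = HyperParams()
#   df_features = run(params, binning_numerics=False,
#                     create_patients_list_view=True, create_lab_events=True)
#   # Then join labels and export the labeled numpy matrix:
#   # build_cohort_dataset.build_cohort(params, df_features, 'data/fulldata.npy')
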