def prepare_data(df_static, df_dynamic, static_feature, args):
    """Assemble the static feature matrix, its labels and the positive rate.

    Both input frames are imputed, per-patient labels are loaded from the
    pickle cache (or assigned and written to the cache when either pickle
    is missing), patients are filtered by ICD code, and the corresponding
    rows of ``static_feature`` are returned with their labels.

    Args:
        df_static: Static dataframe handed to ``DataImputation``.
        df_dynamic: Dynamic dataframe handed to ``DataImputation``.
        static_feature: Feature dataframe containing the columns dropped
            below; its first column is excluded from ``X``.
        args: Object exposing ``hypoxemia_thresh``, ``hypoxemia_window``
            and ``prediction_window``.

    Returns:
        Tuple ``(X, y, pos_rate)``: the selected feature rows, the
        ``'label'`` column of the static labels for the same selection, and
        the fraction of filtered patients that are positive.
    """
    # label assignment (according to imputed SpO2)
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)

    # NOTE(review): the cache file names do not encode the thresholds or
    # windows taken from ``args``, so labels cached under a different
    # configuration would be reused as-is -- confirm this is intended.
    path_sta_label = 'data/label/static_label.pkl'
    path_dyn_label = 'data/label/dynamic_label.pkl'
    label_assign = LabelAssignment(
        hypoxemia_thresh=args.hypoxemia_thresh,
        hypoxemia_window=args.hypoxemia_window,
        prediction_window=args.prediction_window)

    if os.path.exists(path_sta_label) and os.path.exists(path_dyn_label):
        static_label = pd.read_pickle(path_sta_label)
    else:
        print('Assigning labels...')
        static_label, dynamic_label = label_assign.assign_label(
            df_static, df_dynamic)
        static_label.to_pickle(path_sta_label)
        dynamic_label.to_pickle(path_dyn_label)
        print('Done.')

    # Derived after the branch so it exists on both paths; it used to be
    # set only on the cache-hit path, which made a cache miss fail with a
    # NameError at the first print below.
    positive_pids = label_assign.get_positive_pids(static_label)

    # get subgroup pids
    subgroup_pids = PatientFilter(
        df_static=df_static,
        mode='exclude',
        include_icd=None,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'],
    ).filter_by_icd()

    num_pos = len(set(subgroup_pids) & set(positive_pids))
    print('Positive Patient:', num_pos, '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))
    pos_rate = num_pos / len(subgroup_pids)
    print('Positive rate:', pos_rate)

    # select features with pid in subgroup as data matrix
    selected_idx = subgroup_pids
    static_feature = static_feature.drop(columns=[
        'AnesthesiaDuration', 'Airway_1', 'Airway_1_Time', 'Airway_2',
        'Airway_2_Time', 'EBL', 'Urine_Output'
    ])
    # NOTE(review): features are picked by position (.iloc) while labels are
    # picked by index label (.loc); the two agree only if row position,
    # index label and pid coincide -- confirm against the data.
    X = static_feature.iloc[selected_idx, 1:]
    y = static_label.loc[selected_idx, 'label']
    return X, y, pos_rate
def prepare_data(df_static, df_dynamic):
    """Prepare min-max scaled time series and patient labels for train/test.

    Reads ``args.hypoxemia_window`` from the module-level ``args`` object.
    Labels are loaded from the pickles named after that window; they are
    not recomputed here (regenerate them with
    ``label_assign.assign_label(df_static, df_dynamic)`` and ``to_pickle``
    to the same paths).

    Args:
        df_static: Static dataframe handed to ``DataImputation``.
        df_dynamic: Dynamic dataframe handed to ``DataImputation``; columns
            from position 3 onward are min-max scaled.

    Returns:
        Tuple ``(timeSeriesTr, labelsTr, timeSeriesTe, labelsTe, pos_rate,
        dynamic_label)``.
    """
    # label assignment (according to imputed SpO2)
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)

    window_tag = str(args.hypoxemia_window)
    path_sta_label = 'data/label/static_label_lstm_' + window_tag + '.pkl'
    path_dyn_label = 'data/label/dynamic_label_lstm_' + window_tag + '.pkl'
    label_assign = LabelAssignment(
        hypoxemia_thresh=90,
        hypoxemia_window=args.hypoxemia_window,
        prediction_window=5)

    static_label = pd.read_pickle(path_sta_label)
    dynamic_label = pd.read_pickle(path_dyn_label)
    positive_pids = label_assign.get_positive_pids(static_label)
    print('Done.')

    # normalization of data (the scaler is fitted on all rows, before the
    # train/test split)
    scaled = preprocessing.MinMaxScaler().fit_transform(
        df_dynamic.iloc[:, 3:].values)
    df_dynamic.iloc[:, 3:] = scaled

    # get subgroup pids
    high_risk_icd = [
        'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
        'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
        'P27.9', 'J44', 'V46.1', 'Z99.1'
    ]  # High-risk group
    patient_filter = PatientFilter(
        df_static=df_static,
        mode='exclude',
        include_icd=high_risk_icd,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'])
    subgroup_pids = patient_filter.filter_by_icd()

    # split subgroup pids into training and test pid set (stratified)
    subgroup_rows = static_label.loc[subgroup_pids]
    subgroup_labels = subgroup_rows['label'].values
    pid_train, pid_test, _, _ = train_test_split(
        subgroup_rows['pid'].values,
        subgroup_labels,
        test_size=0.2,
        random_state=0,
        stratify=subgroup_labels)
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:',
          len(set(subgroup_pids) & set(positive_pids)), '/',
          len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    # select feature rows with pid in subgroup as data matrix
    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    pid_frame = dynamic_label[['pid']]
    is_in_train = pid_frame.isin(pid_train)['pid'].values
    is_in_test = pid_frame.isin(pid_test)['pid'].values
    selected_idx_train = list(np.where(is_in_train)[0])
    selected_idx_test = list(np.where(is_in_test)[0])

    label_columns = ['pid', 'label']
    timeSeriesTr = df_dynamic.iloc[selected_idx_train, 0:21]
    # NOTE(review): pid values are used as row positions here (.iloc) while
    # the split above selected rows by index label (.loc) -- confirm that
    # position, index label and pid coincide in static_label.
    labelsTr = static_label.iloc[pid_train][label_columns]
    timeSeriesTe = df_dynamic.iloc[selected_idx_test, 0:21]
    labelsTe = static_label.iloc[pid_test][label_columns]

    num_pos = (np.sum(labelsTr['label'].values)
               + np.sum(labelsTe['label'].values))
    num_all = len(labelsTr) + len(labelsTe)
    pos_rate = num_pos / num_all
    return (timeSeriesTr, labelsTr, timeSeriesTe, labelsTe, pos_rate,
            dynamic_label)
# Report anesthesia duration as "50% (25%, 75%)" taken from des_sta
# (presumably a describe()-style summary table -- defined outside this view).
duration_quartiles = [
    des_sta.loc['AnesthesiaDuration', pct] for pct in ('50%', '25%', '75%')
]
print("Anesthesia Duration: {:.0f} ({:.0f}, {:.0f})".format(
    *duration_quartiles))

# Load the raw table and the processed static/dynamic frames.
raw = pd.read_csv('../data/raw_data/static_updated.csv')
static = pd.read_csv('../data/data_frame/static_dataframe.csv')
dynamic = pd.read_csv('../data/data_frame/dynamic_dataframe.csv')

# BMI = (WEIGHT / 1000) / (HEIGHT / 100) / (HEIGHT / 100)
height_scaled = static['HEIGHT'] / 100
static['BMI'] = (static['WEIGHT'] / 1000) / height_scaled / height_scaled

print('Assigning labels...')
imputer = DataImputation()
df_static = imputer.impute_static_dataframe(static)
df_dynamic = imputer.impute_dynamic_dataframe(dynamic)
label_assign = LabelAssignment(
    hypoxemia_thresh=90,
    hypoxemia_window=10,
    prediction_window=5)
# Labels come from the pickle cache; they are not recomputed here.
static_label = pd.read_pickle('../data/label/static_label.pkl')
dynamic_label = pd.read_pickle('../data/label/dynamic_label.pkl')
positive_pids = label_assign.get_positive_pids(static_label)
print('Done.')

# get subgroup pids
patient_filter = PatientFilter(
    df_static=df_static,
    mode='exclude',
    include_icd=None,
    exclude_icd9=['745', '746', '747'],
    exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'])
subgroup_pids = patient_filter.filter_by_icd()

# Reload the processed frames from disk and recompute BMI on the fresh
# static frame.
static = pd.read_csv('../data/data_frame/static_dataframe.csv')
height_scaled = static['HEIGHT'] / 100
static['BMI'] = (static['WEIGHT'] / 1000) / height_scaled / height_scaled
dynamic = pd.read_csv('../data/data_frame/dynamic_dataframe.csv')
def prepare_data(df_static, df_dynamic, dynamic_feature, args):
    """Split per-timestep dynamic features into train and test sets.

    Labels are loaded from the cached pickles (they are not recomputed;
    regenerate them with ``label_assign.assign_label(df_static, df_dynamic)``
    and ``to_pickle`` to the same paths). Patients are filtered by ICD code,
    split into training and test pids with stratification on the static
    label, and the rows of ``dynamic_feature`` whose ``if_to_drop`` flag is 0
    are selected for each side.

    Args:
        df_static: Static dataframe handed to ``DataImputation``.
        df_dynamic: Dynamic dataframe handed to ``DataImputation``.
        dynamic_feature: Feature dataframe; rows are selected by position,
            so it is assumed to be row-aligned with the cached
            ``dynamic_label`` -- TODO confirm. Its first two columns are
            excluded from ``X``.
        args: Object exposing ``hypoxemia_thresh``, ``hypoxemia_window``,
            ``prediction_window``, ``filter_mode`` and ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, pos_rate)``; the training
        pair is shuffled with a fixed seed of 0.
    """
    # label assignment (according to imputed SpO2)
    print('Assigning labels...')
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    # NOTE(review): the imputed dynamic frame is not read again below; the
    # call is kept in case the imputer has side effects -- confirm.
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)

    path_sta_label = 'data/result/static_label.pkl'
    path_dyn_label = 'data/result/dynamic_label.pkl'
    label_assign = LabelAssignment(
        hypoxemia_thresh=args.hypoxemia_thresh,
        hypoxemia_window=args.hypoxemia_window,
        prediction_window=args.prediction_window)
    static_label = pd.read_pickle(path_sta_label)
    dynamic_label = pd.read_pickle(path_dyn_label)
    positive_pids = label_assign.get_positive_pids(static_label)
    print('Done.')

    # get subgroup pids
    subgroup_pids = PatientFilter(
        df_static=df_static,
        mode=args.filter_mode,
        include_icd=[
            'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
            'P27.1', '490', '491', '492', '493', '494', '495', '496',
            'P27.8', 'P27.9', 'J44', 'V46.1', 'Z99.1'
        ],  # High-risk group
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'],
    ).filter_by_icd()

    # split subgroup pids into training and test pid set
    subgroup_rows = static_label.loc[subgroup_pids]
    subgroup_labels = subgroup_rows['label'].values
    pid_train, pid_test, _, _ = train_test_split(
        subgroup_rows['pid'].values,
        subgroup_labels,
        test_size=0.1,
        random_state=args.random_state,
        stratify=subgroup_labels)
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:',
          len(set(subgroup_pids) & set(positive_pids)), '/',
          len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))
    del df_static, df_dynamic

    # select feature rows with pid in subgroup as data matrix
    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    to_keep = (dynamic_label['if_to_drop'] == 0).values
    is_in_train = dynamic_label[['pid']].isin(pid_train)['pid'].values
    is_in_test = dynamic_label[['pid']].isin(pid_test)['pid'].values
    selected_idx_train = list(np.where(to_keep & is_in_train)[0])
    selected_idx_test = list(np.where(to_keep & is_in_test)[0])

    # adjust features used
    dynamic_feature = dynamic_feature.drop(
        columns=['AnesthesiaDuration', 'EBL', 'Urine_Output'])

    # split into training and test set
    X_train = dynamic_feature.iloc[selected_idx_train, 2:]
    X_test = dynamic_feature.iloc[selected_idx_test, 2:]
    # The selected indices are row positions (they come from np.where on
    # value arrays), so labels are taken with .iloc. The previous .loc
    # lookup treated them as index labels, which is only equivalent when
    # dynamic_label carries a default 0..n-1 index.
    label_column = dynamic_label['label']
    y_train = label_column.iloc[selected_idx_train]
    y_test = label_column.iloc[selected_idx_test]

    # shuffle X and y
    X_train, y_train = shuffle(X_train, y_train, random_state=0)

    # positive number
    num_pos = np.sum(y_train) + np.sum(y_test)
    num_all = len(selected_idx_train) + len(selected_idx_test)
    pos_rate = num_pos / num_all
    print('Positive samples:', num_pos, '/', num_all)
    print('Ratio:', '%0.2f' % (num_pos / num_all * 100), '%')
    return X_train, X_test, y_train, y_test, pos_rate
import matplotlib.pyplot as plt import argparse import pickle import sys df_static = pd.read_csv(config.get('processed', 'df_static_file')) df_dynamic = pd.read_csv(config.get('processed', 'df_dynamic_file')) INDs = [] for hypoxemia_window in np.linspace(1, 10, 10).astype(int): print(hypoxemia_window) imputer = DataImputation() df_static = imputer.impute_static_dataframe(df_static) df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic) label_assign = LabelAssignment(hypoxemia_thresh=90, hypoxemia_window=hypoxemia_window, prediction_window=5) static_label, dynamic_label = label_assign.assign_label( df_static, df_dynamic) labels = dynamic_label['label'].values INDs.append(list(np.where(labels == 1))) # initialize arguments parser = argparse.ArgumentParser(description='hypoxemia prediction') parser.add_argument('--hypoxemia_thresh', type=int, default=90) parser.add_argument('--hypoxemia_window', type=int, default=1) parser.add_argument('--prediction_window', type=int, default=5) parser.add_argument('--filter_mode', type=str, default='exclude') parser.add_argument('--dynamic_feature_file', type=str,
def prepare_data(df_static, df_dynamic, dynamic_feature, args):
    """Split per-timestep dynamic features into train/test arrays.

    Labels are assigned here with ``LabelAssignment.assign_multi_label`` on
    the imputed frames. Patients are filtered by ICD code, split into
    training and test pids with stratification on the static label, and the
    rows of ``dynamic_feature`` whose ``if_to_drop`` flag is 0 are selected
    for each side.

    Args:
        df_static: Static dataframe handed to ``DataImputation``.
        df_dynamic: Dynamic dataframe handed to ``DataImputation``.
        dynamic_feature: Feature dataframe; rows are selected by position,
            so it is assumed to be row-aligned with ``dynamic_label`` --
            TODO confirm. Its first two columns are excluded from ``X``.
        args: Object exposing ``hypoxemia_thresh``, ``hypoxemia_window``,
            ``prediction_window`` and ``filter_mode``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of arrays; the training
        pair is shuffled without a fixed seed, so its order differs between
        runs.
    """
    # label assignment (according to imputed SpO2)
    print('Assigning labels...')
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)
    label_assign = LabelAssignment(
        hypoxemia_thresh=args.hypoxemia_thresh,
        hypoxemia_window=args.hypoxemia_window,
        prediction_window=args.prediction_window)
    static_label, dynamic_label = label_assign.assign_multi_label(
        df_static, df_dynamic)
    positive_pids = label_assign.get_positive_pids(static_label)
    print('Done.')

    # get subgroup pids
    subgroup_pids = PatientFilter(
        df_static=df_static,
        mode=args.filter_mode,
        include_icd=[
            'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
            'P27.1', '490', '491', '492', '493', '494', '495', '496',
            'P27.8', 'P27.9', 'J44', 'V46.1', 'Z99.1'
        ],  # High-risk group
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'],
    ).filter_by_icd()

    # split subgroup pids into training and test pid set
    subgroup_rows = static_label.loc[subgroup_pids]
    subgroup_labels = subgroup_rows['label'].values
    pid_train, pid_test, _, _ = train_test_split(
        subgroup_rows['pid'].values,
        subgroup_labels,
        test_size=0.1,
        random_state=0,
        stratify=subgroup_labels)
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:',
          len(set(subgroup_pids) & set(positive_pids)), '/',
          len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))
    del df_static, df_dynamic

    # select feature rows with pid in subgroup as data matrix
    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    to_keep = (dynamic_label['if_to_drop'] == 0).values
    is_in_train = dynamic_label[['pid']].isin(pid_train)['pid'].values
    is_in_test = dynamic_label[['pid']].isin(pid_test)['pid'].values
    selected_idx_train = list(np.where(to_keep & is_in_train)[0])
    selected_idx_test = list(np.where(to_keep & is_in_test)[0])

    # split into training and test set
    X_train = dynamic_feature.iloc[selected_idx_train, 2:].values
    X_test = dynamic_feature.iloc[selected_idx_test, 2:].values
    # The selected indices are row positions (they come from np.where on
    # value arrays), so labels are taken with .iloc. The previous .loc
    # lookup treated them as index labels, which is only equivalent when
    # dynamic_label carries a default 0..n-1 index.
    label_column = dynamic_label['label']
    y_train = label_column.iloc[selected_idx_train].values
    y_test = label_column.iloc[selected_idx_test].values

    # shuffle X and y (deliberately left unseeded, as in the original call)
    X_train, y_train = shuffle(X_train, y_train)

    # positive number
    num_pos = np.sum(y_train) + np.sum(y_test)
    num_all = len(selected_idx_train) + len(selected_idx_test)
    print('Positive samples:', num_pos, '/', num_all)
    print('Ratio:', '%0.2f' % (num_pos / num_all * 100), '%')
    return X_train, X_test, y_train, y_test
# Local import: this fragment did not reference pandas by name before, and
# pd.concat is needed below. Harmless if the script already imported it.
import pandas as pd

# Collect up to 10 rows per airway event type, in airway_list order.
# The frames are gathered in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copied the accumulated frame on every
# iteration.
airway_frames = [df_raw[df_raw['Airway_Event1'] == airway_list[0]][0:10]]
for airway in airway_list:
    if airway == airway_list[0]:
        # already taken as the first frame
        continue
    airway_frames.append(df_raw[df_raw['Airway_Event1'] == airway][0:10])
df = pd.concat(airway_frames)
index = df.index
# Put 'Airway_Event1' first; the column also remains at its original
# position because the full column list is appended after it.
df = df.reindex(columns=['Airway_Event1'] + df.columns.to_list())
df.to_csv('../data/results/airway_samples.csv')

# label assignment (according to imputed SpO2)
print('Assigning labels...')
imputer = DataImputation()
df_static = imputer.impute_static_dataframe(df_static)
df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)
label_assign = LabelAssignment(
    hypoxemia_thresh=90,
    hypoxemia_window=10,
    prediction_window=5)
static_label, dynamic_label = label_assign.assign_label(df_static, df_dynamic)
positive_pids = label_assign.get_positive_pids(static_label)
print('Done.')

# get subgroup pids
subgroup_pids = PatientFilter(
    df_static=df_static,
    mode='exclude',
    include_icd=None,
    exclude_icd9=['745', '746', '747'],
    exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'],
).filter_by_icd()

print('Positive Patient:',
      len(set(subgroup_pids) & set(positive_pids)), '/', len(subgroup_pids))
print('Before trimming:', len(positive_pids), '/', len(df_static))
print('Trimmed cases:', len(df_static) - len(subgroup_pids))