def prepare_data(df_static, df_dynamic, static_feature, args):
    """Build the static feature matrix and patient-level labels.

    Both data frames are imputed, patient labels are loaded from the pickle
    cache (or assigned and cached when the cache is missing), patients are
    filtered by ICD code, and the matching rows of ``static_feature`` are
    returned together with their labels.

    Args:
        df_static: Static data frame; passed to
            ``DataImputation.impute_static_dataframe`` and ``PatientFilter``.
        df_dynamic: Dynamic data frame; passed to
            ``DataImputation.impute_dynamic_dataframe``.
        static_feature: Feature data frame containing (at least) the columns
            dropped below. Its first remaining column is excluded from ``X``.
        args: Object exposing ``hypoxemia_thresh``, ``hypoxemia_window`` and
            ``prediction_window``.

    Returns:
        Tuple ``(X, y, pos_rate)``: the selected feature rows, the ``'label'``
        column for the same patients, and the fraction of the subgroup that
        is positive.
    """
    # label assignment (according to imputed SpO2)
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)

    path_sta_label = 'data/label/static_label.pkl'
    path_dyn_label = 'data/label/dynamic_label.pkl'
    label_assign = LabelAssignment(hypoxemia_thresh=args.hypoxemia_thresh,
                                   hypoxemia_window=args.hypoxemia_window,
                                   prediction_window=args.prediction_window)

    # NOTE(review): the cache file names do not encode the threshold/window
    # settings, so labels cached under different ``args`` would be reused
    # as-is -- confirm this is intended.
    if os.path.exists(path_sta_label) and os.path.exists(path_dyn_label):
        static_label = pd.read_pickle(path_sta_label)
    else:
        print('Assigning labels...')
        static_label, dynamic_label = label_assign.assign_label(
            df_static, df_dynamic)
        # The cache directory may not exist yet on a first run.
        os.makedirs(os.path.dirname(path_sta_label), exist_ok=True)
        static_label.to_pickle(path_sta_label)
        dynamic_label.to_pickle(path_dyn_label)
        print('Done.')

    # Computed after BOTH branches: previously this was only assigned on a
    # cache hit, so a first run without cached labels raised NameError below.
    positive_pids = label_assign.get_positive_pids(static_label)

    # get subgroup pids
    subgroup_pids = PatientFilter(
        df_static=df_static,
        mode='exclude',
        include_icd=None,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25',
                       'Q26']).filter_by_icd()

    num_positive_in_subgroup = len(set(subgroup_pids) & set(positive_pids))
    print('Positive Patient:', num_positive_in_subgroup, '/',
          len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    pos_rate = num_positive_in_subgroup / len(subgroup_pids)
    print('Positive rate:', pos_rate)

    # select features with pid in subgroup as data matrix
    # (no train/test split happens here; the caller receives the full set)
    selected_idx = subgroup_pids
    static_feature = static_feature.drop(columns=[
        'AnesthesiaDuration', 'Airway_1', 'Airway_1_Time', 'Airway_2',
        'Airway_2_Time', 'EBL', 'Urine_Output'
    ])

    # NOTE(review): features are selected by position (iloc) and labels by
    # index label (loc); this presumably relies on pids coinciding with row
    # positions in both frames -- verify against the data.
    X = static_feature.iloc[selected_idx, 1:]
    y = static_label.loc[selected_idx, 'label']

    return X, y, pos_rate
def prepare_data(df_static, df_dynamic):
    """Load cached labels and split the scaled time series by patient.

    Reads the module-level ``args`` for ``hypoxemia_window``. Both frames are
    imputed, the dynamic columns from position 3 onward are min-max scaled in
    place, patients are filtered by ICD code, and the resulting pids are
    split 80/20 (stratified on the patient label, ``random_state=0``).

    The label pickles must already exist; to regenerate them, call
    ``label_assign.assign_label(df_static, df_dynamic)`` and pickle both
    returned frames to the two paths built below.

    Returns:
        Tuple ``(timeSeriesTr, labelsTr, timeSeriesTe, labelsTe, pos_rate,
        dynamic_label)``.
    """
    # Label assignment (according to imputed SpO2).
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)

    window_tag = str(args.hypoxemia_window)
    path_sta_label = 'data/label/static_label_lstm_' + window_tag + '.pkl'
    path_dyn_label = 'data/label/dynamic_label_lstm_' + window_tag + '.pkl'
    label_assign = LabelAssignment(hypoxemia_thresh=90,
                                   hypoxemia_window=args.hypoxemia_window,
                                   prediction_window=5)

    static_label = pd.read_pickle(path_sta_label)
    dynamic_label = pd.read_pickle(path_dyn_label)
    positive_pids = label_assign.get_positive_pids(static_label)
    print('Done.')

    # Normalization of data: the scaler is fit on every row, before the
    # train/test split.
    scaler = preprocessing.MinMaxScaler()
    df_dynamic.iloc[:, 3:] = scaler.fit_transform(
        df_dynamic.iloc[:, 3:].values)

    # Get subgroup pids.
    high_risk_icd = [
        'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
        'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
        'P27.9', 'J44', 'V46.1', 'Z99.1'
    ]  # High-risk group
    patient_filter = PatientFilter(
        df_static=df_static,
        mode='exclude',
        include_icd=high_risk_icd,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'])
    subgroup_pids = patient_filter.filter_by_icd()

    # Split subgroup pids into training and test pid sets.
    subgroup_rows = static_label.loc[subgroup_pids]
    subgroup_labels = subgroup_rows['label'].values
    pid_train, pid_test, _, _ = train_test_split(subgroup_rows['pid'].values,
                                                 subgroup_labels,
                                                 test_size=0.2,
                                                 random_state=0,
                                                 stratify=subgroup_labels)
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)),
          '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    # Select feature rows with pid in subgroup as data matrix.
    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    train_mask = dynamic_label['pid'].isin(pid_train).values
    test_mask = dynamic_label['pid'].isin(pid_test).values
    rows_train = list(np.flatnonzero(train_mask))
    rows_test = list(np.flatnonzero(test_mask))

    # NOTE(review): labels are picked by position (iloc) using pid values,
    # whereas the split above indexes the same frame by label (loc);
    # presumably pids coincide with row positions -- verify.
    label_columns = ['pid', 'label']
    series_train = df_dynamic.iloc[rows_train, 0:21]
    labels_train = static_label.iloc[pid_train][label_columns]
    series_test = df_dynamic.iloc[rows_test, 0:21]
    labels_test = static_label.iloc[pid_test][label_columns]

    num_pos = (np.sum(labels_train['label'].values) +
               np.sum(labels_test['label'].values))
    num_all = len(labels_train) + len(labels_test)
    pos_rate = num_pos / num_all

    return (series_train, labels_train, series_test, labels_test, pos_rate,
            dynamic_label)
des_sta.loc['Urine_Output', '25%'], des_sta.loc['Urine_Output', '75%'])) # Anesthesia Duration print("Anesthesia Duration: {:.0f} ({:.0f}, {:.0f})".format(des_sta.loc['AnesthesiaDuration', '50%'], des_sta.loc['AnesthesiaDuration', '25%'], des_sta.loc['AnesthesiaDuration', '75%'])) raw = pd.read_csv('../data/raw_data/static_updated.csv') static = pd.read_csv('../data/data_frame/static_dataframe.csv') dynamic = pd.read_csv('../data/data_frame/dynamic_dataframe.csv') static['BMI'] = (static['WEIGHT'] / 1000) / (static['HEIGHT'] / 100) / (static['HEIGHT'] / 100) print('Assigning labels...') imputer = DataImputation() df_static = imputer.impute_static_dataframe(static) df_dynamic = imputer.impute_dynamic_dataframe(dynamic) label_assign = LabelAssignment(hypoxemia_thresh=90, hypoxemia_window=10, prediction_window=5) static_label = pd.read_pickle('../data/label/static_label.pkl') dynamic_label = pd.read_pickle('../data/label/dynamic_label.pkl') positive_pids = label_assign.get_positive_pids(static_label) print('Done.') # get subgroup pids subgroup_pids = PatientFilter(df_static=df_static, mode='exclude', include_icd=None, exclude_icd9=['745', '746', '747'],
def prepare_data(df_static, df_dynamic, dynamic_feature, args):
    """Split the per-timestep dynamic features into train and test sets.

    Labels are read from the pickle cache under ``data/result``; the cache
    must already exist. To regenerate it, call
    ``label_assign.assign_label(df_static, df_dynamic)`` and pickle both
    returned frames to the two paths built below.

    Patients are filtered by ICD code using ``args.filter_mode`` and split
    90/10 by pid, stratified on the patient label with
    ``args.random_state``. Rows whose ``if_to_drop`` flag is non-zero are
    excluded from both sets.

    Args:
        df_static: Static data frame (imputed here, used for filtering).
        df_dynamic: Dynamic data frame (imputed here).
        dynamic_feature: Feature data frame; three columns are dropped and
            the first two remaining columns are excluded from ``X``.
        args: Object exposing ``hypoxemia_thresh``, ``hypoxemia_window``,
            ``prediction_window``, ``filter_mode`` and ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, pos_rate)``.
    """
    # Label assignment (according to imputed SpO2).
    print('Assigning labels...')
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)

    path_sta_label = 'data/result/static_label.pkl'
    path_dyn_label = 'data/result/dynamic_label.pkl'
    label_assign = LabelAssignment(hypoxemia_thresh=args.hypoxemia_thresh,
                                   hypoxemia_window=args.hypoxemia_window,
                                   prediction_window=args.prediction_window)

    static_label = pd.read_pickle(path_sta_label)
    dynamic_label = pd.read_pickle(path_dyn_label)
    positive_pids = label_assign.get_positive_pids(static_label)
    print('Done.')

    # Get subgroup pids.
    high_risk_icd = [
        'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
        'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
        'P27.9', 'J44', 'V46.1', 'Z99.1'
    ]  # High-risk group
    patient_filter = PatientFilter(
        df_static=df_static,
        mode=args.filter_mode,
        include_icd=high_risk_icd,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'])
    subgroup_pids = patient_filter.filter_by_icd()

    # Split subgroup pids into training and test pid sets.
    subgroup_rows = static_label.loc[subgroup_pids]
    subgroup_labels = subgroup_rows['label'].values
    pid_train, pid_test, _, _ = train_test_split(
        subgroup_rows['pid'].values,
        subgroup_labels,
        test_size=0.1,
        random_state=args.random_state,
        stratify=subgroup_labels)
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)),
          '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    # The raw frames are not used past this point.
    del df_static, df_dynamic

    # Select feature rows with pid in subgroup as data matrix.
    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    keep_mask = (dynamic_label['if_to_drop'] == 0).values
    train_mask = dynamic_label['pid'].isin(pid_train).values
    test_mask = dynamic_label['pid'].isin(pid_test).values
    selected_idx_train = list(np.flatnonzero(keep_mask & train_mask))
    selected_idx_test = list(np.flatnonzero(keep_mask & test_mask))

    # Adjust features used.
    dynamic_feature = dynamic_feature.drop(
        columns=['AnesthesiaDuration', 'EBL', 'Urine_Output'])

    # Split into training and test set.
    # NOTE(review): features are selected by position (iloc) while labels
    # are selected by index label (loc) with the same positional indices;
    # presumably ``dynamic_label`` has a default integer index -- verify.
    X_train = dynamic_feature.iloc[selected_idx_train, 2:]
    X_test = dynamic_feature.iloc[selected_idx_test, 2:]
    y_train = dynamic_label.loc[selected_idx_train, 'label']
    y_test = dynamic_label.loc[selected_idx_test, 'label']

    # Shuffle X and y (training set only).
    X_train, y_train = shuffle(X_train, y_train, random_state=0)

    # Positive number.
    num_pos = np.sum(y_train) + np.sum(y_test)
    num_all = len(selected_idx_train) + len(selected_idx_test)
    pos_rate = num_pos / num_all
    print('Positive samples:', num_pos, '/', num_all)
    print('Ratio:', '%0.2f' % (pos_rate * 100), '%')

    return X_train, X_test, y_train, y_test, pos_rate
def prepare_data(df_static, df_dynamic, dynamic_feature, args):
    """Assign multi-class labels and split dynamic features into train/test.

    Labels are computed on every call with
    ``LabelAssignment.assign_multi_label`` (no pickle cache is used).
    Patients are filtered by ICD code using ``args.filter_mode`` and split
    90/10 by pid, stratified on the patient label with ``random_state=0``.
    Rows whose ``if_to_drop`` flag is non-zero are excluded from both sets.

    Args:
        df_static: Static data frame (imputed here, used for filtering).
        df_dynamic: Dynamic data frame (imputed here).
        dynamic_feature: Feature data frame; its first two columns are
            excluded from ``X``.
        args: Object exposing ``hypoxemia_thresh``, ``hypoxemia_window``,
            ``prediction_window`` and ``filter_mode``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of arrays taken from
        ``.values``.
    """
    # Label assignment (according to imputed SpO2).
    print('Assigning labels...')
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)

    label_assign = LabelAssignment(hypoxemia_thresh=args.hypoxemia_thresh,
                                   hypoxemia_window=args.hypoxemia_window,
                                   prediction_window=args.prediction_window)
    static_label, dynamic_label = label_assign.assign_multi_label(
        df_static, df_dynamic)
    positive_pids = label_assign.get_positive_pids(static_label)
    print('Done.')

    # Get subgroup pids.
    high_risk_icd = [
        'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
        'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
        'P27.9', 'J44', 'V46.1', 'Z99.1'
    ]  # High-risk group
    patient_filter = PatientFilter(
        df_static=df_static,
        mode=args.filter_mode,
        include_icd=high_risk_icd,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'])
    subgroup_pids = patient_filter.filter_by_icd()

    # Split subgroup pids into training and test pid sets.
    subgroup_rows = static_label.loc[subgroup_pids]
    subgroup_labels = subgroup_rows['label'].values
    pid_train, pid_test, _, _ = train_test_split(
        subgroup_rows['pid'].values,
        subgroup_labels,
        test_size=0.1,
        random_state=0,
        stratify=subgroup_labels)
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)),
          '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    # The raw frames are not used past this point.
    del df_static, df_dynamic

    # Select feature rows with pid in subgroup as data matrix.
    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    keep_mask = (dynamic_label['if_to_drop'] == 0).values
    train_mask = dynamic_label['pid'].isin(pid_train).values
    test_mask = dynamic_label['pid'].isin(pid_test).values
    selected_idx_train = list(np.flatnonzero(keep_mask & train_mask))
    selected_idx_test = list(np.flatnonzero(keep_mask & test_mask))

    # Split into training and test set.
    X_train = dynamic_feature.iloc[selected_idx_train, 2:].values
    X_test = dynamic_feature.iloc[selected_idx_test, 2:].values
    y_train = dynamic_label.loc[selected_idx_train, 'label'].values
    y_test = dynamic_label.loc[selected_idx_test, 'label'].values

    # Shuffle X and y; random_state is not passed here.
    X_train, y_train = shuffle(X_train, y_train)

    # Positive number.
    # NOTE(review): this sums the label values; if multi-class labels can
    # exceed 1, the sum is not a count of positive samples -- confirm.
    num_pos = np.sum(y_train) + np.sum(y_test)
    num_all = len(selected_idx_train) + len(selected_idx_test)
    print('Positive samples:', num_pos, '/', num_all)
    print('Ratio:', '%0.2f' % (num_pos / num_all * 100), '%')

    return X_train, X_test, y_train, y_test
# Remaining command-line options; ``parser`` is created earlier in the file.
parser.add_argument('--if_impute', type=str,
                    default='True')  # 'True' OR 'False'
parser.add_argument('--static_txt', type=str,
                    default='rbow')  # 'bow' OR 'rbow'
parser.add_argument('--dynamic_txt', type=str,
                    default='notxt')  # 'notxt' OR 'rbow'
args = parser.parse_args()
print(args)

# The flag arrives as a string, so compare against the literal 'True'.
impute_dynamic = args.if_impute == 'True'

# path
df_static_file = config.get('processed', 'df_static_file')
df_dynamic_file = config.get('processed', 'df_dynamic_file')

# save name
if impute_dynamic:
    token_impute = 'imp'
else:
    token_impute = 'nonimp'
static_feature_file = 'data/features/static-' + args.static_txt + '.csv'
ewm_feat_file = 'data/features/dynamic-ewm-' + args.dynamic_txt + '-' + token_impute + '.csv'
sta_feat_file = 'data/features/dynamic-sta-' + args.dynamic_txt + '-' + token_impute + '.csv'
lstm_feat_file = 'data/features/dynamic-lstm-' + args.dynamic_txt + '-' + token_impute + '.csv'

# load DataFrame real-time data
df_static = pd.read_csv(df_static_file)
df_dynamic = pd.read_csv(df_dynamic_file)

# feature extraction: static data is always imputed, dynamic data only when
# requested via --if_impute
imputer = DataImputation()
df_static = imputer.impute_static_dataframe(df_static)
if impute_dynamic:
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)

feature_extraction(df_static, df_dynamic, type=args.type)