def fix_occurrences(df):
    """Convert occurrence columns to binary 0/1 indicators."""
    col_dict = generate_suffix_dict(df)
    df.loc[:, col_dict['occ']] = (
        df[col_dict['occ']]
        .fillna(0)
        .astype(bool)
        .astype(int)
    )
    return df
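# The generate_prefix_dict / generate_suffix_dict helpers live in utils and
# are not shown in this section. Minimal sketches of the assumed behavior:
# columns look like '<prefix>_<name>_<suffix>' (e.g. 'curr_urine_io',
# 'ema6_hr_vitals', 'age_enc'), and each helper buckets columns by the
# leading or trailing token. Names and the digit-stripping rule below are
# assumptions for illustration, not the project's actual code.
import re
from collections import defaultdict


def _sketch_prefix_dict(df):
    """Bucket columns by leading token, trailing digits stripped, so
    'ema6_hr_vitals' lands under 'ema' and 'tsl_hr_vitals' under 'tsl'."""
    d = defaultdict(list)
    for col in df.columns:
        d[re.sub(r'\d+$', '', col.split('_')[0])].append(col)
    return dict(d)


def _sketch_suffix_dict(df):
    """Bucket columns by trailing token, so 'curr_urine_io' lands under
    'io' and 'age_enc' under 'enc'."""
    d = defaultdict(list)
    for col in df.columns:
        d[col.split('_')[-1]].append(col)
    return dict(d)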
def scale(train, val, test):
    """
    SCALING PROTOCOL
    EMS - StandardScaled, not log-normalized, because outlier events
          are truly outliers
    EMA - StandardScaled
    TSL - log-transformed upstream (apply_postsum_transforms), then
          MinMaxScaled here
    """
    train, val, test = train.copy(), val.copy(), test.copy()
    pref_dict = generate_prefix_dict(train)
    suff_dict = generate_suffix_dict(train)

    curr_ios = list(set(pref_dict['curr'])
                    & (set(suff_dict['io']) | set(suff_dict['occ'])))
    curr_nums = list(set(pref_dict['curr'])
                     & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
    stand_cols = pref_dict['ems'] + pref_dict['ema'] + ['age_enc'] + curr_nums
    minmax_cols = pref_dict['tsl'] + curr_ios + ['time_of_day_enc']

    # Fit on train only; transform val/test with the train-fit scalers
    # so no validation/test statistics leak into the features.
    scaler = StandardScaler()
    train.loc[:, stand_cols] = scaler.fit_transform(train.loc[:, stand_cols])
    val.loc[:, stand_cols] = scaler.transform(val.loc[:, stand_cols])
    test.loc[:, stand_cols] = scaler.transform(test.loc[:, stand_cols])

    minmax = MinMaxScaler()
    train.loc[:, minmax_cols] = minmax.fit_transform(train.loc[:, minmax_cols])
    val.loc[:, minmax_cols] = minmax.transform(val.loc[:, minmax_cols])
    test.loc[:, minmax_cols] = minmax.transform(test.loc[:, minmax_cols])

    return train.round(5), val.round(5), test.round(5)
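# Usage sketch for the scaling protocol above (train/val/test are
# hypothetical splits that share the same columns):
#
#   train_s, val_s, test_s = scale(train, val, test)
#
# Only the training split is used to fit each scaler, so validation and
# test features end up expressed in train-derived units.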
def ffill_curr(df):
    df = df.copy()
    pref_dict = generate_prefix_dict(df)
    suff_dict = generate_suffix_dict(df)
    # Carry current-value columns forward indefinitely; imaging indicators
    # only persist for 6 hours.
    df.loc[:, pref_dict['curr']] = df[pref_dict['curr']].ffill()
    df.loc[:, suff_dict['img']] = df[suff_dict['img']].ffill(limit=6)
    return df
def generate_fold_datasets(x, y, fold_dict, train_means_dict, scalers_dict):
    """
    Usage:
        fold_dataset_generator = generate_fold_datasets(
            x, y, fold_dict, train_means_dict, scalers_dict)
        for fold_split in fold_dataset_generator:
            x_train, x_test, y_train, y_test = fold_split
            # Do something

    Or, if only one fold is wanted:
        fold_dataset_generator = generate_fold_datasets(
            x, y, fold_dict, train_means_dict, scalers_dict)
        x_train, x_test, y_train, y_test = next(fold_dataset_generator)
        # Do something
    """
    print('\n>>> Applying preprocessing...')
    suff_dict = generate_suffix_dict(x)
    pref_dict = generate_prefix_dict(x)

    for i, (fold_num, mrns) in enumerate(fold_dict.items()):
        print('Fold {}'.format(i))
        start = time.time()
        xf, yf = x.copy(), y.copy()

        # Fill NA: imaging indicators default to 0, everything else to the
        # training-fold means cached by cache_preprocessing_info
        train_means, scalers = train_means_dict[i], scalers_dict[i]
        xf.loc[:, suff_dict['img']] = xf.loc[:, suff_dict['img']].fillna(0)
        xf = xf.fillna(value=train_means.to_dict())
        mid = time.time()
        print('Filled NA in {}s'.format(round(mid - start, 1)))

        # Apply the scaler objects fit on this fold's training split
        curr_ios = list(set(pref_dict['curr'])
                        & (set(suff_dict['io']) | set(suff_dict['occ'])))
        curr_nums = list(set(pref_dict['curr'])
                         & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
        stand_cols = pref_dict['ema'] + ['age_enc'] + curr_nums
        minmax_cols = curr_ios + ['hsa_enc']
        robust_cols = pref_dict['tsl'] + pref_dict['ems']

        standard, minmax, robust = scalers
        xf.loc[:, stand_cols] = standard.transform(xf[stand_cols])
        xf.loc[:, minmax_cols] = minmax.transform(xf[minmax_cols])
        xf.loc[:, robust_cols] = robust.transform(xf[robust_cols])

        x_train, y_train = xf.drop(mrns), yf.drop(mrns)
        x_test, y_test = xf.loc[mrns], yf.loc[mrns]
        print('Scaled in {}s'.format(round(time.time() - mid, 1)))

        yield x_train, x_test, y_train, y_test
def apply_postsum_transforms(df):
    """
    TRANSFORMATIONS
    - take log(1 + x) of IOs
    - take log(1 + x) of ems
    - take log(1 + x) of tsl
    """
    pref_dict = generate_prefix_dict(df)
    suff_dict = generate_suffix_dict(df)
    log_plus_one = np.log1p  # log(1 + x); equivalent to np.log(s + 1)

    curr_ios = list(set(pref_dict['curr']) & set(suff_dict['io']))
    df.loc[:, curr_ios] = df[curr_ios].apply(log_plus_one)
    df.loc[:, pref_dict['ems']] = df[pref_dict['ems']].apply(log_plus_one)
    df.loc[:, pref_dict['tsl']] = df[pref_dict['tsl']].apply(log_plus_one)
    return df
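# Why log(1 + x): the IO, EMS, and TSL columns are non-negative and
# zero-heavy, so log1p keeps zeros at zero while compressing the long
# right tail. Illustrative values:
#
#   >>> np.log1p([0, 1, 9, 99])
#   array([0.        , 0.69314718, 2.30258509, 4.60517019])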
def cache_preprocessing_info(x, fold_dict):
    print('\n>>> Caching preprocessing...')
    suff_dict = generate_suffix_dict(x)
    pref_dict = generate_prefix_dict(x)
    train_means_dict = dict()
    scalers_dict = dict()

    for i, (fold_num, mrns) in enumerate(fold_dict.items()):
        print('Fold {}'.format(i))
        start = time.time()
        x_train = x.copy().drop(mrns)

        # Calculate train means (imaging indicators default to 0)
        x_train.loc[:, suff_dict['img']] = (
            x_train.loc[:, suff_dict['img']].fillna(0))
        train_means = x_train.mean(axis=0)
        x_train = x_train.fillna(value=train_means.to_dict())

        # Fit scaler objects on the training split only
        standard = StandardScaler()
        minmax = MinMaxScaler()
        robust = RobustScaler()
        curr_ios = list(set(pref_dict['curr'])
                        & (set(suff_dict['io']) | set(suff_dict['occ'])))
        curr_nums = list(set(pref_dict['curr'])
                         & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
        stand_cols = pref_dict['ema'] + ['age_enc'] + curr_nums
        minmax_cols = curr_ios + ['hsa_enc']
        robust_cols = pref_dict['tsl'] + pref_dict['ems']
        standard.fit(x_train[stand_cols])
        minmax.fit(x_train[minmax_cols])
        robust.fit(x_train[robust_cols])

        train_means_dict[i] = train_means
        scalers_dict[i] = [standard, minmax, robust]
        print('Finished in {}s'.format(round(time.time() - start, 1)))

    return train_means_dict, scalers_dict
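# The cache/generate pair above is meant to be used together: fit fill
# values and scalers once per fold, then stream leakage-free fold splits.
# Usage sketch (group_fold_dict as produced by generate_fold_dict in main):
#
#   train_means_dict, scalers_dict = cache_preprocessing_info(x, group_fold_dict)
#   for x_train, x_test, y_train, y_test in generate_fold_datasets(
#           x, y, group_fold_dict, train_means_dict, scalers_dict):
#       ...  # fit and evaluate one fold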
def scale(train, test):
    """Train/test variant of the scaling protocol above (no validation
    split); min-max and robust outputs are shifted toward [-1, 1]."""
    train, test = train.copy(), test.copy()
    pref_dict = generate_prefix_dict(train)
    suff_dict = generate_suffix_dict(train)

    curr_ios = list(set(pref_dict['curr'])
                    & (set(suff_dict['io']) | set(suff_dict['occ'])))
    curr_nums = list(set(pref_dict['curr'])
                     & (set(suff_dict['vitals']) | set(suff_dict['labs'])))
    stand_cols = pref_dict['ema'] + ['age_enc'] + curr_nums
    minmax_cols = curr_ios + ['time_of_day_enc', 'hsa_enc']
    robust_cols = pref_dict['tsl'] + pref_dict['ems']

    scaler = StandardScaler()
    train.loc[:, stand_cols] = scaler.fit_transform(train.loc[:, stand_cols])
    test.loc[:, stand_cols] = scaler.transform(test.loc[:, stand_cols])

    # MinMax output ([0, 1]) is mapped to [-1, 1] via 2*z - 1; the same
    # affine shift is applied to the robust-scaled columns.
    minmax = MinMaxScaler()
    train.loc[:, minmax_cols] = 2 * minmax.fit_transform(
        train.loc[:, minmax_cols]) - 1
    test.loc[:, minmax_cols] = 2 * minmax.transform(
        test.loc[:, minmax_cols]) - 1

    robust = RobustScaler()
    train.loc[:, robust_cols] = 2 * robust.fit_transform(
        train.loc[:, robust_cols]) - 1
    test.loc[:, robust_cols] = 2 * robust.transform(
        test.loc[:, robust_cols]) - 1

    return train, test
# In[3]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

from preprocessing_exp_weights import preprocess_exp_weights

(x, y, x_cols) = preprocess_exp_weights(rebuild=False,
                                        time_to_event=True,
                                        scale_feat=False,
                                        fill_null=False,
                                        custom_tag='noimg')
suff_dict = generate_suffix_dict(x)

# In[4]:

pref_dict = generate_prefix_dict(x)
suff_dict = generate_suffix_dict(x)
mid_dict = generate_midfix_dict(x)

# In[5]:

pref_dict.keys()
suff_dict.keys()
mid_dict.keys()
def standardize_times(df):
    start = time()
    df['min_datetime'] = (
        df.datetime
        .groupby(level=[0, 1])
        .transform(lambda x: x.min())
    )
    to_hour = lambda td: td.total_seconds() // 3600
    df['hour_since_adm'] = (
        (df.datetime - df.min_datetime)
        .transform(to_hour)
        .astype(int)
    )
    df['hsa_enc'] = df['hour_since_adm']

    # Encounters with no surgery are capped at 21 days (504 hours)
    nonsurg_cutoff = 21 * 24
    df['max_datetime_hour'] = (
        df.hour_since_adm
        .groupby(level=[0, 1])
        .transform(lambda x: min(nonsurg_cutoff, x.max()))
    )
    df['event_hour_enc'] = (
        (df.surg_datetime_enc - df.min_datetime)
        .transform(to_hour)
        .fillna(df['max_datetime_hour'])
    )
    df['time_to_event_enc'] = df.event_hour_enc - df.hour_since_adm - 1
    df['time_of_day_enc'] = (
        (df.hour_since_adm + df.min_datetime.apply(lambda dt: dt.hour)) % 24
    )

    df = filter_populations(df, surg_cutoff=21 * 24)

    # Filter out measurements after surgery
    df = df[df.hour_since_adm < df.event_hour_enc]
    df = df.drop(['datetime', 'min_datetime', 'max_datetime_hour',
                  'surg_datetime_enc', 'adm_datetime_enc',
                  'event_hour_enc'], axis=1)

    # Aggregate to one row per (mrn, id, hour): means for measurements,
    # sums for IO/occurrence counts, max for encounter-level constants
    col_dict = generate_suffix_dict(df)
    mean_columns = (
        df.reset_index()
        .loc[:, col_dict['vitals'] + col_dict['labs'] + col_dict['img']
             + ['mrn', 'id', 'hour_since_adm']]
        .groupby(['mrn', 'id', 'hour_since_adm'])
        .agg('mean')
    )
    sum_columns = (
        df.reset_index()
        .loc[:, col_dict['io'] + col_dict['occ']
             + ['mrn', 'id', 'hour_since_adm']]
        .groupby(['mrn', 'id', 'hour_since_adm'])
        .agg('sum')
    )
    enc = (
        df.reset_index()[col_dict['enc'] + ['mrn', 'id', 'hour_since_adm']]
        .groupby(['mrn', 'id', 'hour_since_adm'])
        .agg('max')
    )
    print('Finished standardizing times in {}s'.format(round(time() - start)))
    res = pd.concat([enc, mean_columns, sum_columns], axis=1)
    return res
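# Worked example of the time indexing above, with hypothetical timestamps:
# admission at 22:30, measurement at 01:15 the next day.
_adm = pd.Timestamp('2020-01-01 22:30')
_meas = pd.Timestamp('2020-01-02 01:15')
_hour_since_adm = int((_meas - _adm).total_seconds() // 3600)  # floor(2.75) = 2
_time_of_day = (_hour_since_adm + _adm.hour) % 24              # (2 + 22) % 24 = 0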
def summary_stats(df):
    """Build the hourly feature frame for one (mrn, id) group: current
    values, EMA deltas, exponential sums, time-since-last counters, and
    forward-filled imaging flags."""
    col_dict = generate_suffix_dict(df)
    max_hour = df.reset_index().hour_since_adm.max()
    df_copy = df.copy()
    curr = df_copy[col_dict['vitals'] + col_dict['labs'] + col_dict['io']
                   + col_dict['occ']].add_prefix('curr_')
    enc = df_copy[col_dict['enc']]
    df = (df.reset_index(level=[0, 1], drop=True)
          .reindex(np.arange(max_hour + 1)))

    # --- Optimized tsl: hours since the measurement was last observed,
    # via the recurrence tsl_t = (1 + tsl_{t-1}) * is_missing_t ---
    io_df = df[col_dict['io'] + col_dict['occ']].copy()
    num_df = df[col_dict['vitals'] + col_dict['labs']].copy()
    io_nan_mask = (io_df.fillna(0) == 0).astype(int).values
    num_nan_mask = num_df.isna().astype(int).values
    io_tsl_arr = np.zeros(io_df.shape)
    num_tsl_arr = np.zeros(num_df.shape)
    for i in range(io_df.shape[0]):
        if i == 0:
            io_tsl_arr[0, :] = io_nan_mask[0, :]
            num_tsl_arr[0, :] = num_nan_mask[0, :]
        else:
            io_tsl_arr[i, :] = (1 + io_tsl_arr[i - 1, :]) * io_nan_mask[i, :]
            num_tsl_arr[i, :] = (1 + num_tsl_arr[i - 1, :]) * num_nan_mask[i, :]
    io_df.loc[:, :] = io_tsl_arr
    num_df.loc[:, :] = num_tsl_arr
    # ---

    ems = pd.concat(
        (expsum(df[col_dict['io'] + col_dict['occ']].fillna(0), zerolifes=[z])
         for z in [6, 24, 72]),
        axis=1)

    # Take diff between curr and ema
    ema_vitals = pd.concat(
        ((df[col_dict['vitals']]
          - df[col_dict['vitals']].ewm(halflife=halflife).mean())
         .add_prefix('ema{}_'.format(halflife))
         for halflife in [6, 24, 72]),
        axis=1)
    ema_labs = pd.concat(
        ((df[col_dict['labs']]
          - df[col_dict['labs']].ewm(halflife=halflife).mean())
         .add_prefix('ema{}_'.format(halflife))
         for halflife in [12, 48, 144]),
        axis=1)
    ffill_img = pd.concat(
        (df[col_dict['img']].ffill(limit=lim).add_prefix('ind{}_'.format(lim))
         for lim in [24, 504]),
        axis=1).fillna(0)

    merged_df = pd.merge(
        pd.concat([enc, curr], axis=1),
        pd.concat([ema_vitals, ema_labs, ems,
                   num_df.add_prefix('tsl_'),
                   io_df.add_prefix('tsl_'),
                   ffill_img], axis=1),
        left_index=True, right_index=True, how='left')
    return merged_df
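# expsum is defined elsewhere in the project and not shown in this section.
# A minimal sketch of one plausible implementation, assuming "zerolife"
# behaves like an ewm half-life on a decayed running sum and that the
# 'ems{z}_' prefix is what generate_prefix_dict later picks up as 'ems'.
# The name expsum_sketch and the decay convention are assumptions.
def expsum_sketch(df, zerolifes):
    """Decayed running sum s_t = x_t + d * s_{t-1} with d = 0.5 ** (1 / z)."""
    frames = []
    for z in zerolifes:
        alpha = 1 - 0.5 ** (1 / z)
        # ewm(adjust=False).mean() computes y_t = (1 - alpha) * y_{t-1}
        # + alpha * x_t, which is the decayed sum scaled by alpha.
        frames.append((df.ewm(alpha=alpha, adjust=False).mean() / alpha)
                      .add_prefix('ems{}_'.format(z)))
    return pd.concat(frames, axis=1)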
def summary_stats(df): """ Calculate summary stats for each measurement of patients. """ col_dict = generate_suffix_dict(df) #df.iloc[:2,:] = df.iloc[:2,:].fillna(0) enc = df[col_dict['enc']].tail(1).squeeze() # Custom agg functions def slope_numeric(s): s = s.reset_index(level=[0, 1], drop=True) slope, r_value, std_err = linear_model(s) return slope def slope_rolling_sum(s): s = s.reset_index(level=[0, 1], drop=True).reindex(range(48)) s_rolling = s.rolling(6, min_periods=0).sum() #TODO: change to linreg #slope = linear_model(s_rolling) slope, r_value, std_err = linear_model(s_rolling) return slope def last(s): s = s.dropna() if len(s) == 0: return np.nan return s.iloc[-1] def tsl(s): s = s.dropna() if len(s) == 0: return 48 return 48 - max(s.reset_index().hour_since_adm) def tsl_occ(s): s = s.reset_index(level=[0, 1], drop=True).dropna() if len(s) == 0: return 48 nonzero = s[s > 0].index if len(nonzero) == 0: return 48 if max(nonzero) == max(nonzero): return 48 - max(nonzero) else: return 48 numeric_stats = [ last, tsl, slope_numeric, 'mean', 'std', 'min', 'max', 'skew' ] io_stats = [last, tsl_occ, slope_rolling_sum, 'sum'] occ_stats = [tsl_occ, slope_rolling_sum, 'sum'] numeric_summ = group_stats(df[col_dict['vitals'] + col_dict['labs']], numeric_stats) io_summ = group_stats(df[col_dict['io']], io_stats) occ_summ = group_stats(df[col_dict['occ']], occ_stats) return pd.concat([enc, numeric_summ, io_summ, occ_summ])
# In[132]:

enc[enc.sbo_poa].reset_index().mrn.nunique()

# In[133]:

454 / 3405

# # VARIABLE missingness

# In[244]:

sbo_presum = pd.read_pickle(
    'data/processed/sbo_exp_presumm_nan_ffill_unscaled_noimg.pickle')
suff_dict = generate_suffix_dict(sbo_presum)

# In[45]:

sbo.index.get_level_values(1).unique()

# In[55]:

from utils import generate_prefix_dict

# In[56]:

pref_dict = generate_prefix_dict(sbo)

# In[18]:
def main(x, y, run_inner_fold=True, compute_perm_imp=True):
    np.random.seed(449)
    time1 = time.time()
    x, y = x.copy(), y.copy()
    enc = (y.reset_index(level=2, drop=True)
           .reset_index()
           .drop_duplicates()
           .set_index(['mrn', 'id']))

    # For non-surgical encounters, censor time-to-event at the max horizon.
    # TODO: figure out if this is a problem
    y['hsa'] = y.index.get_level_values(2)
    y.loc[y.any_sbo_surg_enc == 0, 'time_to_event_enc'] = (
        y.time_to_event_enc.max()
        - y.loc[y.any_sbo_surg_enc == 0, 'hsa'])
    y = y.drop('hsa', axis=1)
    y.loc[y['time_to_event_enc'] == 0, 'time_to_event_enc'] = 0.01

    num_folds, num_lambdas, num_alphas = 5, 20, 3
    lambdas = np.array([np.exp(x) for x in np.linspace(-4, 6, num_lambdas)])
    # No need to add zero explicitly because the grid is rounded
    alphas = np.array([0.1 ** x
                       for x in np.linspace(0, 4, num_alphas)[::-1]]).round(3)

    print('\n>>> Generating {} group stratified folds...'.format(num_folds))
    group_fold_dict = generate_fold_dict(enc, num_folds, 0.01)
    inner_group_fold_dict = group_fold_dict.copy()
    del inner_group_fold_dict[num_folds - 1]
    x_inner = x.drop(group_fold_dict[num_folds - 1])
    y_inner = y.drop(group_fold_dict[num_folds - 1])
    time2 = time.time()
    print('Finished in {}s'.format(round(time2 - time1, 1)))

    if run_inner_fold:
        print('\n>>> Running inner CV with {} folds...'.format(num_folds - 1))
        inner_perf_arr, inner_betas_arr = inner_cv_cox(
            x=x_inner,
            y=y_inner,
            lambdas=lambdas,
            alphas=alphas,
            fold_dict=inner_group_fold_dict)
        time3 = time.time()
        print('Fit finished in {}s'.format(round(time3 - time2, 1)))
        lambda_opt, alpha_opt = get_best_hparams(inner_perf_arr, lambdas,
                                                 alphas)
    else:
        time3 = time.time()
        inner_perf_arr, inner_betas_arr = None, None
        lambda_opt, alpha_opt = 0.25, 0.0

    print('\n>>> Running outer CV with {} folds, '
          '\nlambda* = {}, alpha* = {}...'.format(
              num_folds, lambda_opt, alpha_opt))

    # Cache preprocessing
    train_means_dict, scalers_dict = cache_preprocessing_info(
        x, group_fold_dict)
    fold_generator = generate_fold_datasets(x, y, group_fold_dict,
                                            train_means_dict, scalers_dict)
    betas_arr, perf_arr = outer_cv_cox(x, y, lambda_opt, alpha_opt,
                                       group_fold_dict, fold_generator)
    time4 = time.time()
    print('Fit finished in {}s'.format(round(time4 - time3, 1)))

    if compute_perm_imp:
        print('\n>>> Computing permutation importance...')
        pref_dict = generate_prefix_dict(x)
        suff_dict = generate_suffix_dict(x)
        mid_dict = generate_midfix_dict(x)
        mid_dict['bp'] = mid_dict['bp_sys'] + mid_dict['bp_dia']
        fold_generator = generate_fold_datasets(x, y, group_fold_dict,
                                                train_means_dict,
                                                scalers_dict)
        perm_group_dict = mid_dict
        perm_imp_df = permutation_importance_cv(x, y, betas_arr,
                                                group_fold_dict,
                                                fold_generator,
                                                perm_group_dict)
    else:
        perm_imp_df = None
    time5 = time.time()
    print('Fit finished in {}s'.format(round(time5 - time4, 1)))

    print('\n>>> Make predictions...')
    fold_generator = generate_fold_datasets(x, y, group_fold_dict,
                                            train_means_dict, scalers_dict)
    y_train_pred, y_test_pred = make_predictions(x, y, betas_arr,
                                                 group_fold_dict,
                                                 fold_generator)
    time6 = time.time()
    print('Predictions finished in {}s'.format(round(time6 - time5, 1)))

    """
    Pipeline summary:
    epsilon, lambdas, alphas, num_folds
    generate_fold_dict -> group_fold_dict
    inner_cv_cox -> inner_perf_arr, inner_betas_arr
    outer_cv_cox -> betas_arr, perf_arr
    permutation_importance_cv -> perm_imp_df
    make_predictions -> y_train_pred, y_test_pred
    """
    betas_df = pd.DataFrame(betas_arr, columns=x.columns)
    result_list = [
        lambdas, alphas, lambda_opt, alpha_opt, group_fold_dict,
        inner_perf_arr, inner_betas_arr, betas_df, perf_arr, perm_imp_df,
        y_train_pred, y_test_pred, train_means_dict, scalers_dict
    ]
    return result_list
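# Usage sketch for the pipeline entry point (assumes x and y were built by
# the preprocess_exp_weights cell above; ordering mirrors result_list):
#
#   results = main(x, y, run_inner_fold=True, compute_perm_imp=True)
#   (lambdas, alphas, lambda_opt, alpha_opt, group_fold_dict,
#    inner_perf_arr, inner_betas_arr, betas_df, perf_arr, perm_imp_df,
#    y_train_pred, y_test_pred, train_means_dict, scalers_dict) = results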