def __init__(self, method='bernoulli'):
    self.method = method.lower()
    stopifnot(self.method in ['bernoulli', 'gaussian'])
    # Dispatch table: each method maps to its sklearn model and fit/predict functions
    self.fun_methods = {'bernoulli': {'mdl': BernoulliNB(fit_prior=False, binarize=0),
                                      'fit': self.fit_bernoulli,
                                      'predict': self.predict_bernoulli},
                        'gaussian': {'mdl': GaussianNB(),
                                     'fit': self.fit_gaussian,
                                     'predict': self.predict_gaussian}}
def fit(self, data, lbls, mbatch=100):
    self.n, self.p = data.shape
    stopifnot(len(lbls) == self.n)
    self.enc = normalize()
    self.enc.fit(data)  # Get the one-hot encoders
    # Make lists of the indices
    self.lst_enc = [self.enc.cenc, self.enc.nenc]
    self.lst_cidx = [self.enc.cidx, self.enc.nidx]
    self.lst_iter = [len(z) > 0 for z in self.lst_cidx]
    # For each feature we need: sum(x), sum(x**2), sum(x*y)
    self.fun_methods[self.method]['fit'](data, lbls, mbatch)
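# A minimal sketch of the streaming sufficient statistics the comment above
# refers to: for a Gaussian likelihood, per-class means and variances follow
# from accumulated sum(x), sum(x**2), and class counts, so the model can be
# fit one mini-batch at a time. The names below (acc_x, acc_x2, acc_n) are
# illustrative, not attributes of this class.
import numpy as np

def update_suff_stats(acc_x, acc_x2, acc_n, X_batch, y_batch):
    # Accumulate per-class sums for a binary label y in {0, 1}
    for cls in (0, 1):
        Xc = X_batch[y_batch == cls]
        acc_x[cls] += Xc.sum(axis=0)
        acc_x2[cls] += (Xc ** 2).sum(axis=0)
        acc_n[cls] += Xc.shape[0]
    return acc_x, acc_x2, acc_n

def finalize_moments(acc_x, acc_x2, acc_n):
    # mean = sum(x)/n; variance = sum(x**2)/n - mean**2
    mu = {c: acc_x[c] / acc_n[c] for c in (0, 1)}
    sig2 = {c: acc_x2[c] / acc_n[c] - mu[c] ** 2 for c in (0, 1)}
    return mu, sig2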
def x_batch(data, n, splits, cidx, enc, iter):
    stopifnot(len(cidx) == len(enc) == len(iter))
    xmat = []
    for ii in range(n):
        ridx = splits[ii]
        holder = []
        for jj, check in enumerate(iter):
            if check:
                holder.append(enc[jj].transform(data.iloc[ridx, cidx[jj]]))
        xmat.append(np.hstack(holder))
    return np.vstack(xmat)
def pred_batch(data, n, splits, mdls, cidx, enc, iter):
    stopifnot(len(cidx) == len(mdls) == len(enc) == len(iter))
    pmat = []
    for ii in range(n):
        ridx = splits[ii]
        holder = []
        for jj, check in enumerate(iter):
            if check:
                x_ii = enc[jj].transform(data.iloc[ridx, cidx[jj]])
                holder.append(mdls[jj].predict_proba(x_ii)[:, 1:])
        pmat.append(np.hstack(holder))
    return np.vstack(pmat)
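# Both helpers above take `splits`, a list of row-index arrays, and process one
# chunk at a time to bound memory. A minimal sketch of such a splitter, with
# the same shape as the idx_iter helper used elsewhere in this repo (the real
# implementation may shuffle or seed differently):
import numpy as np

def idx_iter_sketch(n, mbatch):
    # Consecutive chunks of at most mbatch row indices drawn from 0..n-1
    n_chunks = int(np.ceil(n / mbatch))
    return np.array_split(np.arange(n), n_chunks)

# e.g. splits = idx_iter_sketch(data.shape[0], 10000)
#      pmat = pred_batch(data, len(splits), splits, mdls, cidx, enc, iter)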
def __init__(self, mbatch=25000, method='lda'):
    self.mbatch = mbatch
    self.method = method
    stopifnot(self.method in ['lda', 'qda'])
    self.fun_methods = {'lda': {'fit': self.fit_lda, 'predict': self.predict_lda},
                        'qda': {'fit': self.fit_qda, 'predict': self.predict_qda}}
def fit(self, X, Y, lam1=0, lam2=0):
    n, p, k = X.shape + (Y.shape[1],)
    stopifnot(n == Y.shape[0])
    # Fit l2-regularized least squares along the columns of Y
    Bhat = np.apply_along_axis(least_squares, 0, Y, *(X, lam1, self.standardize, self.add_intercept))
    Eta = X.dot(Bhat[1:]) + Bhat[0].reshape([1, Bhat.shape[1]])
    # Recalibrate each column with a univariate logistic fit on its linear predictor
    What = Bhat.copy()
    for jj in range(k):
        w_jj = fast_logit(Y[:, jj], Eta[:, jj], lam2)
        What[0, jj] = w_jj[0] + w_jj[1] * Bhat[0, jj]
        What[1:, jj] = w_jj[1] * Bhat[1:, jj]
    # Zeta = X.dot(What[1:]) + What[0].reshape([1, What.shape[1]])
    self.weights = What[1:]
    self.intercept = What[0].reshape([1, What.shape[1]])
    self.p = p
    self.Bhat = Bhat
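# Why the two assignments inside the loop above amount to an affine
# recomposition: fast_logit fits a univariate calibration zeta = w0 + w1*eta
# on the least-squares predictor eta = b0 + X @ b, and
# zeta = (w0 + w1*b0) + X @ (w1*b), which is exactly how What is filled in.
# A small self-contained check (b0, b, w0, w1 below are arbitrary stand-ins):
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
b0, b = 0.5, rng.normal(size=3)
w0, w1 = -0.2, 1.7
eta = b0 + X @ b
assert np.allclose(w0 + w1 * eta, (w0 + w1 * b0) + X @ (w1 * b))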
def predict(self, data, mbatch=None):
    stopifnot(data.shape[1] == self.p)
    # Check that categories line up for predict
    new_vals = [list(np.setdiff1d(data.iloc[:, jj].unique(), uvals))
                for jj, uvals in zip(self.enc.cidx, self.enc.cenc.categories_)]
    diff_vals = np.where(np.array([len(z) for z in new_vals]) > 0)[0]
    if len(diff_vals) > 0:
        data = data.copy()  # protect columns from being overwritten
        print("new categorical values! Setting to default")
        for jj in diff_vals:
            cjj = self.enc.cidx[jj]  # column in reference
            data.iloc[:, cjj] = np.where(data.iloc[:, cjj].isin(new_vals[jj]),
                                         self.enc.cenc.categories_[jj][0],
                                         data.iloc[:, cjj])
    # Processing information
    n_pred = data.shape[0]
    if mbatch is None:
        mbatch = n_pred
    idx_splits = idx_iter(n_pred, mbatch)
    niter = len(idx_splits)
    return self.fun_methods[self.method]['predict'](data, niter, idx_splits)
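# A minimal usage sketch of the unseen-category fallback above, assuming the
# model was fit on a 'sex' column with categories ['F', 'M'] (names here are
# illustrative): any level not seen at fit time is remapped to the first
# fitted category before encoding, so transform() never raises on new levels.
#
#   X_new = X_test.copy()
#   X_new.loc[0, 'sex'] = 'U'    # level never seen during fit
#   phat = mdl.predict(X_new)    # prints the warning, maps 'U' -> 'F'
#
# The mbatch argument only bounds memory; predictions are identical whether
# mbatch is None (a single pass) or a smaller chunk size.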
matplotlib.use('Agg')  # no print-outs
matplotlib.rcParams['figure.max_open_warning'] = 25
from matplotlib import pyplot as plt
import seaborn as sns
from support.linreg_wAUC import linreg_wAUC, stochastc_wb_auc
import time as ti

###############################
# ---- STEP 1: LOAD DATA ---- #

dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
dir_figures = os.path.join(dir_base, '..', 'figures')
for z in [dir_base, dir_output]:
    stopifnot(os.path.exists(z), 'Path does not exist: ' + z)
for pp in [dir_figures]:
    if not os.path.exists(pp):
        print('making directory %s' % pp)
        os.mkdir(pp)
dir_auc = os.path.join(dir_output, 'linreg_wAUC')
if not os.path.exists(dir_auc):
    print('making AUC output folder')
    os.mkdir(dir_auc)
dir_weights = os.path.join(dir_output, 'weights')

# Labels
def fit(self, data, lbls, nepochs=100, mbatch=1000, val_prop=0.1, lr=0.001):
    n = data.shape[0]
    stopifnot(n == lbls.shape[0])
    if len(lbls.shape) == 1:
        lbls = lbls.reshape([n, 1])
    self.n_output = lbls.shape[1]
    # Resample the validation split until every outcome has at least one positive case
    check, rr = True, 0
    while check:
        rr += 1
        idx_train, idx_val = train_test_split(np.arange(n), test_size=val_prop, random_state=rr)
        check = not all(lbls.iloc[idx_val].apply(lambda x: x[~(x == -1)].sum(), axis=0) > 0)
    n_train, n_val = len(idx_train), len(idx_val)
    self.idx_train = idx_train
    self.idx_val = idx_val
    # Find encodings/normalization
    self.enc = col_encoder()
    self.enc.fit(data.iloc[idx_train])
    self.n_input = len(self.enc.cn_transform)
    Yval = lbls.iloc[idx_val].values  # Pre-compute for faster eval
    nY_val = np.apply_along_axis(func1d=lambda x: x[~(x == -1)].sum(), axis=0, arr=Yval)
    nY_train = lbls.iloc[idx_train].apply(lambda x: x[~(x == -1)].sum(), 0).values
    wY_train = (n_train / nY_train - 1).reshape([1, self.n_output])
    # Define architecture
    torch.manual_seed(1234)
    self.nnet = net_architecture(n_input=self.n_input, n_output=self.n_output)
    if self.device == 'cuda':
        self.nnet.cuda()
    # Create the loss class (instantiated per batch because the weights change each batch)
    loss_fun = nn.BCEWithLogitsLoss
    # Set up optimizer
    optimizer = torch.optim.Adagrad(params=self.nnet.parameters(), lr=lr)
    self.res = []
    nll_epoch = []
    tstart = time.time()
    for ii in range(nepochs):
        idx_batches = idx_iter(n_train, mbatch, ii)
        nbatch = len(idx_batches)
        print('---- Epoch %i of %i ----' % (ii + 1, nepochs))
        nll_batch = []
        nll_batch_cc = []
        for jj in range(nbatch):
            if (jj + 1) % 10 == 0:
                print('Batch %i of %i' % (jj + 1, nbatch))
            idx_jj = idx_train[idx_batches[jj]]
            optimizer.zero_grad()
            # --- Forward pass --- #
            out_jj = self.nnet.forward(self.transform(data.iloc[idx_jj]))
            Y_jj = lbls.iloc[idx_jj].values
            # Mask missing labels (-1) and up-weight positives by inverse class frequency
            W_jj = torch.from_numpy(np.where(Y_jj == -1, 0, 1) * ((Y_jj * wY_train) + 1)).to(self.device).float()
            Y_jj = torch.from_numpy(Y_jj).to(self.device).float()
            loss_jj = loss_fun(reduction='mean', weight=W_jj)(input=out_jj, target=Y_jj)
            # --- Backward pass --- #
            loss_jj.backward()
            optimizer.step()
            nll_batch.append(loss_jj.item())
            with torch.no_grad():
                loss_jj_cc = loss_fun(reduction='none', weight=W_jj)(
                    input=out_jj, target=Y_jj).mean(axis=0).cpu().detach().numpy()
                nll_batch_cc.append(loss_jj_cc)
        nll_epoch.append(np.mean(nll_batch))
        # df_nll = pd.DataFrame({'cn':lbls.columns, 'y1':nY_train, 'nll':np.vstack(nll_batch_cc).mean(axis=0)})
        if (ii + 1) % 10 == 0:
            # Check gradient stability
            for layer, param in self.nnet.named_parameters():
                print('layer: %s, std: %0.4f' % (layer, param.grad.std().item()))
            # Check for early stopping
            phat_val = self.predict(data.iloc[idx_val])
            holder = []
            for cc in range(Yval.shape[1]):
                idx_cc = ~(Yval[:, cc] == -1)
                act_cc = Yval[idx_cc, cc]
                pred_cc = sigmoid(phat_val[idx_cc, cc])
                holder.append([roc_auc_score(act_cc, pred_cc),
                               average_precision_score(act_cc, pred_cc),
                               log_loss(act_cc, pred_cc)])
            res_ii = pd.DataFrame(np.vstack(holder), index=lbls.columns,
                                  columns=['auc', 'ppv', 'nll']).reset_index()
            res_ii = pd.concat([pd.DataFrame({'iter': ii + 1, 'n': nY_val}), res_ii], axis=1)
            if ii > 10:
                self.res = pd.concat([self.res, res_ii], axis=0).reset_index(drop=True)
                val_score = self.res.drop(columns=['ppv', 'nll']).rename(columns={'index': 'cc'})
                val_score = val_score.sort_values(['cc', 'iter']).reset_index(drop=True)
                # Sample-weighted change in AUC per outcome between evaluations
                val_score['d_auc'] = (val_score.auc - val_score.groupby('cc').auc.shift(+1)) * val_score.n
                val_score = val_score.groupby('iter').d_auc.mean().reset_index().fillna(0)
                if not all(val_score.d_auc >= 0):
                    print('#### EARLY STOPPING AT ITERATION %i ####' % (ii + 1))
                    break
                else:
                    print(val_score)
            else:
                self.res = res_ii
    tend = time.time()
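# A minimal check of the weighting rule used above: with wY = n/nY - 1, the
# per-label weight np.where(Y == -1, 0, 1) * (Y * wY + 1) gives 0 for missing
# labels (-1), 1 for negatives, and n/nY for positives, so each outcome's
# positives and negatives contribute roughly equally to the weighted loss.
import numpy as np

n, nY = 1000, 50
wY = n / nY - 1
Y = np.array([-1.0, 0.0, 1.0])
W = np.where(Y == -1, 0, 1) * (Y * wY + 1)
assert np.allclose(W, [0.0, 1.0, n / nY])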
import numpy as np
import pandas as pd
import os
from support import support_funs as sf

# set up directories
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
sf.stopifnot(all([os.path.exists(x) for x in [dir_output]]))

di_lbls = {'cdarrest': 'cardiac arrest',
           'cnscva': 'CVA, stroke or hemorrhage',
           'cszre': 'seizure',
           'civhg': 'intraventricular hemorrhage',
           'death30yn': 'death in 30 days',
           'dehis': 'deep wound disruption',
           'neurodef': 'nerve injury',
           'oprenafl': 'acute renal failure',
           'orgspcssi': 'organ SSI',
           'othbleed': 'bleeding or transfusion',
           'othclab': 'central line infection',
           'othseshock': 'septic shock',
           'othsysep': 'sepsis',
           'othvt': 'ventricular tachycardia',
           'oupneumo': 'pneumonia',
           'readmission1': 'unplanned readmission',
           'reintub': 'unplanned reintubation',
           'renainsf': 'renal insufficiency',
           'reoperation': 'unplanned reoperation',
def predict(self, Xnew):
    stopifnot(Xnew.shape[1] == self.weights.shape[0])
    eta = Xnew.dot(self.weights) + self.intercept
    return eta
dir_NSQIP = find_dir_nsqip()
dir_output = os.path.join(dir_NSQIP, 'output')
dir_models = os.path.join(dir_output, 'models')
dir_figures = os.path.join(dir_NSQIP, 'figures')
lst_dir = [dir_figures, dir_output, dir_models]
assert all([os.path.exists(fold) for fold in lst_dir])

fn_X = 'X_imputed.csv'
fn_Y = 'y_agg.csv'
dat_X = pd.read_csv(os.path.join(dir_output, fn_X))  # ,usecols=['operyr','caseid','cpt']
dat_Y = pd.read_csv(os.path.join(dir_output, fn_Y))
print(dat_X.shape)
print(dat_Y.shape)
stopifnot(all(dat_X.caseid == dat_Y.caseid))
u_years = dat_X.operyr.unique()
# !! ENCODE CPT AS CATEGORICAL !! #
dat_X['cpt'] = 'c' + dat_X.cpt.astype(str)
cn_X = list(dat_X.columns[2:])
cn_Y = list(dat_Y.columns[2:])

###############################################
# ---- STEP 2: LEAVE-ONE-YEAR - CPT ONLY ---- #

holder_vv = []
holder_phat = []
for ii, vv in enumerate(cn_Y):
    print('##### ------- Outcome %s (%i of %i) -------- #####' % (vv, ii + 1, len(cn_Y)))
    tmp_ii = pd.concat([dat_Y.operyr, dat_Y[vv] == -1], axis=1)
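# A minimal sketch of the leave-one-year pattern named in STEP 2 above,
# assuming (as the year-ordered data suggests) that each operative year is
# scored by a model trained on the years before it. The mask construction is
# self-contained; the model call itself is left out.
import numpy as np

def loo_year_masks(operyr):
    # Yield (train_mask, test_mask) pairs: train on all years < yy, test on yy
    operyr = np.asarray(operyr)
    for yy in np.sort(np.unique(operyr))[1:]:
        yield operyr < yy, operyr == yy

# e.g. for mtrain, mtest in loo_year_masks(dat_X.operyr): ...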
# load necessary modules
import numpy as np
import pandas as pd
import os
import gc
from support import support_funs as sf

# set up directories
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
dir_data = os.path.join(dir_base, '..', 'data')
dir_figures = os.path.join(dir_base, '..', 'figures')
sf.stopifnot(all([os.path.exists(x) for x in [dir_output, dir_figures]]))

# manual list of columns to drop
fn = 'combined_raw.csv'
# load in the combined data file
dat = pd.read_csv(os.path.join(dir_output, fn), encoding='iso-8859-1')
dat.sort_values(by='operyr', inplace=True)
dat.reset_index(drop=True, inplace=True)
# dat.drop(columns=vv_drop, inplace=True)
gc.collect()  # needed!
# # load in the dictionary delvin collated
# df_desc = pd.read_csv(os.path.join(dir_output, 'master_key.csv')).rename(columns={'variable_label': 'desc'})

###################################################
### ---- (1) convert missing values to nas ---- ###
# load necessary modules
import numpy as np
import pandas as pd
import os
import gc
from support.support_funs import stopifnot, gg_color_hue
from plotnine import *
import seaborn as sns
from matplotlib import pyplot as plt

# set up directories
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
dir_figures = os.path.join(dir_base, '..', 'figures')
stopifnot(all([os.path.exists(x) for x in [dir_output, dir_figures]]))
# Load in the accuracy functions
from support.acc_funs import auc, plot_ppv

##############################################
### ---- (1) load in and process data ---- ###

# load delvin's output
df_scores = pd.read_csv(os.path.join(dir_output, 'maizlin_res_df.csv')).rename(
    columns={'actual_y': 'y', 'model_y': 'yhat'})
# load in associated x_data for each
cn_X = ['caseid', 'operyr', 'cpt', 'race', 'sex', 'age_days']
import os
import pandas as pd
from sklearn.model_selection import train_test_split as splitter
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics

# set up directories
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
dir_data = os.path.join(dir_base, '..', 'data')
dir_figures = os.path.join(dir_base, '..', 'figures')
from support import naive_bayes as mf
from support import support_funs as sf
import support.acc_funs as af
sf.stopifnot(all([os.path.exists(x) for x in [dir_output, dir_figures]]))

##############################################
### ---- (1) LOAD IN AND PROCESS DATA ---- ###

fn_y = 'y_bin.csv'
fn_X = 'X_preop.csv'
# if fn_X not in os.listdir(dir_output):
#     fn_X = 'X_preop.csv'
y_df = pd.read_csv(os.path.join(dir_output, fn_y))
X_df = pd.read_csv(os.path.join(dir_output, fn_X))
sf.stopifnot((y_df.shape[0] == X_df.shape[0]) & all(y_df.caseid == X_df.caseid))
# --- (!) Encode CPT as string --- #
X_df['cpt'] = X_df.cpt.astype(str)
# Set directories
dir_NSQIP = find_dir_nsqip()
dir_output = os.path.join(dir_NSQIP, 'output')
assert os.path.exists(dir_output)
dir_figures = os.path.join(dir_NSQIP, 'figures')
makeifnot(dir_figures)
dir_weights = os.path.join(dir_output, 'weights')
makeifnot(dir_weights)

fn_X = 'X_imputed.csv'
fn_Y = 'y_agg.csv'
dat_X = pd.read_csv(os.path.join(dir_output, fn_X))
dat_Y = pd.read_csv(os.path.join(dir_output, fn_Y))
print(dat_X.shape)
print(dat_Y.shape)
stopifnot(all(dat_X.caseid == dat_Y.caseid))
u_years = dat_X.operyr.unique()
# !! ENCODE CPT AS CATEGORICAL !! #
dat_X['cpt'] = 'c' + dat_X.cpt.astype(str)
cn_X = list(dat_X.columns[2:])
# Split Y into the aggregate vs non-aggregate outcomes
dat_agg = dat_Y.loc[:, dat_Y.columns.str.contains('^agg|caseid|operyr')]
dat_Y = dat_Y.loc[:, ~dat_Y.columns.str.contains('^agg')]
cn_Y = list(dat_Y.columns[2:])
cn_agg = list(dat_agg.columns[2:])
# # If we use 2012/13 as baseline years, what is the y-prop?
# prop_Y = dat_Y.groupby('operyr')[cn_Y].apply(lambda x: x[~(x==-1)].mean()).reset_index()
# prop_Y = prop_Y.melt('operyr', var_name='outcome')
# tmp = dat_Y.groupby('operyr')[cn_Y].apply(lambda x: (x==-1).sum()).reset_index().melt('operyr',