def __init__(self, method='bernoulli'):
    self.method = method.lower()
    stopifnot(self.method in ['bernoulli', 'gaussian'])
    # Dispatch dict: map each method name to its sklearn model and the
    # matching fit/predict routines
    self.fun_methods = {'bernoulli': {'mdl': BernoulliNB(fit_prior=False, binarize=0),
                                      'fit': self.fit_bernoulli,
                                      'predict': self.predict_bernoulli},
                        'gaussian': {'mdl': GaussianNB(),
                                     'fit': self.fit_gaussian,
                                     'predict': self.predict_gaussian}}
def fit(self, data, lbls, mbatch=100):
    self.n, self.p = data.shape
    stopifnot(len(lbls) == self.n)
    self.enc = normalize()
    self.enc.fit(data)  # Get the one-hot encoders
    # Make lists of the indices
    self.lst_enc = [self.enc.cenc, self.enc.nenc]
    self.lst_cidx = [self.enc.cidx, self.enc.nidx]
    self.lst_iter = [len(z) > 0 for z in self.lst_cidx]
    # For each feature we need: sum(x), sum(x**2), sum(x*y)
    self.fun_methods[self.method]['fit'](data, lbls, mbatch)
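# A hypothetical usage sketch of this wrapper; the enclosing class name is not
# shown in this excerpt, so `mbatch_NB` is a placeholder:
#   mdl = mbatch_NB(method='bernoulli')
#   mdl.fit(X_train, y_train, mbatch=5000)  # accumulate statistics in mini-batches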
def x_batch(data, n, splits, cidx, enc, iter):
    stopifnot(len(cidx) == len(enc) == len(iter))
    xmat = []
    for ii in range(n):
        ridx = splits[ii]
        holder = []
        for jj, check in enumerate(iter):
            if check:
                holder.append(enc[jj].transform(data.iloc[ridx, cidx[jj]]))
        xmat.append(np.hstack(holder))
    return np.vstack(xmat)
def pred_batch(data, n, splits, mdls, cidx, enc, iter):
    stopifnot(len(cidx) == len(mdls) == len(enc) == len(iter))
    pmat = []
    for ii in range(n):
        ridx = splits[ii]
        holder = []
        for jj, check in enumerate(iter):
            if check:
                x_ii = enc[jj].transform(data.iloc[ridx, cidx[jj]])
                holder.append(mdls[jj].predict_proba(x_ii)[:, 1:])
        pmat.append(np.hstack(holder))
    return np.vstack(pmat)
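# The `splits` argument to x_batch/pred_batch above is a list of row-index
# arrays, one per mini-batch. A minimal sketch of how such splits could be
# built (the repo's own idx_iter() helper may differ):
import numpy as np

def make_splits(n_obs, mbatch):
    # consecutive index chunks of at most mbatch rows each
    return [np.arange(i, min(i + mbatch, n_obs)) for i in range(0, n_obs, mbatch)]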
def __init__(self, mbatch=25000, method='lda'):
    self.mbatch = mbatch
    self.method = method
    stopifnot(self.method in ['lda', 'qda'])
    self.fun_methods = {
        'lda': {
            'fit': self.fit_lda,
            'predict': self.predict_lda
        },
        'qda': {
            'fit': self.fit_qda,
            'predict': self.predict_qda
        }
    }
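# The dict above is simple method dispatch: fit()/predict() can route to the
# lda or qda implementation without if/else chains, e.g. (hypothetical call):
#   self.fun_methods[self.method]['fit'](data, lbls)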
Example #6
def fit(self, X, Y, lam1=0, lam2=0):
    n, p, k = X.shape + (Y.shape[1],)
    stopifnot(n == Y.shape[0])
    # Fit l2-regularized least squares along the columns of Y
    Bhat = np.apply_along_axis(least_squares, 0, Y,
                               *(X, lam1, self.standardize, self.add_intercept))
    Eta = X.dot(Bhat[1:]) + Bhat[0].reshape([1, Bhat.shape[1]])
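    # Recalibrate each column's least-squares fit to the logistic scale:
    # fast_logit() regresses Y[:, jj] on the linear score Eta[:, jj] and
    # returns an (intercept, slope) pair (w0, w1); composing with
    # Eta = b0 + X.dot(B) gives the adjusted intercept w0 + w1*b0 and
    # slopes w1*B computed below.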
    What = Bhat.copy()
    for jj in range(k):
        w_jj = fast_logit(Y[:, jj], Eta[:, jj], lam2)
        What[0, jj] = w_jj[0] + w_jj[1] * Bhat[0, jj]
        What[1:, jj] = w_jj[1] * Bhat[1:, jj]
    # Zeta = X.dot(What[1:]) + What[0].reshape([1, What.shape[1]])
    self.weights = What[1:]
    self.intercept = What[0].reshape([1, What.shape[1]])
    self.p = p
    self.Bhat = Bhat
def predict(self, data, mbatch=None):
    stopifnot(data.shape[1] == self.p)
    # Check that categories line up for predict
    new_vals = [list(np.setdiff1d(data.iloc[:, jj].unique(), uvals)) for
                jj, uvals in zip(self.enc.cidx, self.enc.cenc.categories_)]
    diff_vals = np.where(np.array([len(z) for z in new_vals]) > 0)[0]
    if len(diff_vals) > 0:
        data = data.copy()  # protect columns from being overwritten
        print('New categorical values! Setting to default')
        for jj in diff_vals:
            cjj = self.enc.cidx[jj]  # column in reference
            data.iloc[:, cjj] = np.where(data.iloc[:, cjj].isin(new_vals[jj]),
                                         self.enc.cenc.categories_[jj][0],
                                         data.iloc[:, cjj])
    # Processing information
    n_pred = data.shape[0]
    if mbatch is None:
        mbatch = n_pred
    idx_splits = idx_iter(n_pred, mbatch)
    niter = len(idx_splits)
    return self.fun_methods[self.method]['predict'](data, niter, idx_splits)
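# Hypothetical usage: chunked prediction to bound memory on large test sets,
# e.g. phat = mdl.predict(X_test, mbatch=10000)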
import os
import matplotlib
matplotlib.use('Agg')  # non-interactive backend: figures are written to file, not displayed
matplotlib.rcParams['figure.max_open_warning'] = 25
from matplotlib import pyplot as plt
import seaborn as sns

from support.support_funs import stopifnot
from support.linreg_wAUC import linreg_wAUC, stochastc_wb_auc
import time as ti

###############################
# ---- STEP 1: LOAD DATA ---- #

dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
dir_figures = os.path.join(dir_base, '..', 'figures')
for z in [dir_base, dir_output]:
    stopifnot(os.path.exists(z), 'Path does not exist: ' + z)
for pp in [dir_figures]:
    if not os.path.exists(pp):
        print('making directory %s' % pp)
        os.mkdir(pp)

dir_auc = os.path.join(dir_output, 'linreg_wAUC')
if not os.path.exists(dir_auc):
    print('making AUC output folder')
    os.mkdir(dir_auc)

dir_weights = os.path.join(dir_output, 'weights')

# Labels
    def fit(self,
            data,
            lbls,
            nepochs=100,
            mbatch=1000,
            val_prop=0.1,
            lr=0.001):
        n = data.shape[0]
        stopifnot(n == lbls.shape[0])
        if len(lbls.shape) == 1:
            lbls = lbls.reshape([n, 1])
        self.n_output = lbls.shape[1]
        check, rr = True, 0
        while check:
            rr += 1
            idx_train, idx_val = train_test_split(np.arange(n),
                                                  test_size=val_prop,
                                                  random_state=rr)
            check = not all(lbls.iloc[idx_val].apply(
                lambda x: x[~(x == -1)].sum(), axis=0) > 0)
        n_train, n_val = len(idx_train), len(idx_val)
        self.idx_train = idx_train
        self.idx_val = idx_val
        # Find encodings/normalization
        self.enc = col_encoder()
        self.enc.fit(data.iloc[idx_train])
        self.n_input = len(self.enc.cn_transform)
        Yval = lbls.iloc[idx_val].values  # Pre-compute for faster eval
        nY_val = np.apply_along_axis(func1d=lambda x: x[~(x == -1)].sum(),
                                     axis=0,
                                     arr=Yval)
        nY_train = lbls.iloc[idx_train].apply(lambda x: x[~(x == -1)].sum(),
                                              0).values
        wY_train = (n_train / nY_train - 1).reshape([1, self.n_output])
        # Define architecture
        torch.manual_seed(1234)
        self.nnet = net_architecture(n_input=self.n_input,
                                     n_output=self.n_output)
        if self.device == 'cuda':
            self.nnet.cuda()
        # Create the loss function: not instantiated here because the
        # per-batch class weights change at every iteration
        loss_fun = nn.BCEWithLogitsLoss
        # Set up optimizer
        optimizer = torch.optim.Adagrad(params=self.nnet.parameters(), lr=lr)
        self.res = []
        nll_epoch = []
        tstart = time.time()
        for ii in range(nepochs):
            idx_batches = idx_iter(n_train, mbatch, ii)
            nbatch = len(idx_batches)
            print('---- Epoch %i of %i ----' % (ii + 1, nepochs))
            nll_batch = []
            nll_batch_cc = []
            for jj in range(nbatch):
                if (jj + 1) % 10 == 0:
                    print('Batch %i of %i' % (jj + 1, nbatch))
                idx_jj = idx_train[idx_batches[jj]]
                optimizer.zero_grad()
                # --- Forward pass --- #
                out_jj = self.nnet.forward(self.transform(data.iloc[idx_jj]))
                Y_jj = lbls.iloc[idx_jj].values
                W_jj = torch.from_numpy(
                    np.where(Y_jj == -1, 0, 1) * ((Y_jj * wY_train) + 1)).to(
                        self.device).float()
                Y_jj = torch.from_numpy(Y_jj).to(self.device).float()
                loss_jj = loss_fun(reduction='mean', weight=W_jj)(input=out_jj,
                                                                  target=Y_jj)
                # --- Backward pass --- #
                loss_jj.backward()
                optimizer.step()
                nll_batch.append(loss_jj.item())
                with torch.no_grad():
                    loss_jj_cc = loss_fun(reduction='none', weight=W_jj)(
                        input=out_jj,
                        target=Y_jj).mean(axis=0).cpu().detach().numpy()
                    nll_batch_cc.append(loss_jj_cc)
            nll_epoch.append(np.mean(nll_batch))
            #df_nll = pd.DataFrame({'cn':lbls.columns,'y1':nY_train,'nll':np.vstack(nll_batch_cc).mean(axis=0)})
            if (ii + 1) % 10 == 0:
                # Check gradient stability
                for layer, param in self.nnet.named_parameters():
                    print('layer: %s, std: %0.4f' %
                          (layer, param.grad.std().item()))
                # Check for early stopping
                phat_val = self.predict(data.iloc[idx_val])
                holder = []
                for cc in range(Yval.shape[1]):
                    idx_cc = ~(Yval[:, cc] == -1)  # rows where outcome cc is labelled
                    act_cc = Yval[idx_cc, cc]
                    pred_cc = sigmoid(phat_val[idx_cc, cc])
                    holder.append([
                        roc_auc_score(act_cc, pred_cc),
                        average_precision_score(act_cc, pred_cc),
                        log_loss(act_cc, pred_cc)
                    ])
                res_ii = pd.DataFrame(np.vstack(holder),
                                      index=lbls.columns,
                                      columns=['auc', 'ppv',
                                               'nll']).reset_index()
                res_ii = pd.concat(
                    [pd.DataFrame({
                        'iter': ii + 1,
                        'n': nY_val
                    }), res_ii], axis=1)
                if ii > 10:
                    self.res = pd.concat([self.res, res_ii],
                                         axis=0).reset_index(drop=True)
                    val_score = self.res.drop(columns=['ppv', 'nll']).rename(
                        columns={'index': 'cc'})
                    val_score = val_score.sort_values(
                        ['cc', 'iter']).reset_index(drop=True)
                    val_score['d_auc'] = (
                        val_score.auc -
                        val_score.groupby('cc').auc.shift(+1)) * val_score.n
                    val_score = val_score.groupby(
                        'iter').d_auc.mean().reset_index().fillna(0)
                    if not all(val_score.d_auc >= 0):
                        print('#### EARLY STOPPING AT ITERATION %i ####' %
                              (ii + 1))
                        break
                    else:
                        print(val_score)
                else:
                    self.res = res_ii

        tend = time.time()
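# net_architecture() is defined elsewhere in the repo; a minimal sketch that is
# consistent with how it is called above (raw logits out, since training uses
# BCEWithLogitsLoss). The actual architecture may differ:
import torch.nn as nn

class net_architecture(nn.Module):
    def __init__(self, n_input, n_output):
        super().__init__()
        # one hidden layer; the width (100) is an arbitrary choice for this sketch
        self.seq = nn.Sequential(nn.Linear(n_input, 100), nn.ReLU(),
                                 nn.Linear(100, n_output))

    def forward(self, x):
        return self.seq(x)  # logits, one column per outcome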
Example #10
import numpy as np
import pandas as pd
import os

from support import support_funs as sf

# set up directories
dir_base = os.getcwd()
dir_output = os.path.join(dir_base,'..','output')
sf.stopifnot(os.path.exists(dir_output))


di_lbls = {'cdarrest':'cardiac arrest',
           'cnscva':'CVA, stroke or hemorrhage',
           'cszre':'seizure',
           'civhg':'intraventricular hemorrhage',
           'death30yn':'death in 30 days',
           'dehis':'deep wound disruption',
           'neurodef':'nerve injury',
           'oprenafl':'acute renal failure',
           'orgspcssi':'organ SSI',
           'othbleed':'bleeding or transfusion',
           'othclab':'central line infection',
           'othseshock':'septic shock',
           'othsysep':'sepsis',
           'othvt':'ventricular tachycardia',
           'oupneumo':'pneumonia',
           'readmission1':'unplanned readmission',
           'reintub':'unplanned reintubation',
           'renainsf':'renal insufficiency',
           'reoperation':'unplanned reoperation',
Example #11
def predict(self, Xnew):
    stopifnot(Xnew.shape[1] == self.weights.shape[0])
    eta = Xnew.dot(self.weights) + self.intercept
    return eta
import os
import pandas as pd
# helper imports assumed; stopifnot/find_dir_nsqip come from the repo's support utilities
from support.support_funs import stopifnot, find_dir_nsqip

dir_NSQIP = find_dir_nsqip()
dir_output = os.path.join(dir_NSQIP, 'output')
dir_models = os.path.join(dir_output, 'models')
dir_figures = os.path.join(dir_NSQIP, 'figures')
lst_dir = [dir_figures, dir_output, dir_models]
assert all([os.path.exists(fold) for fold in lst_dir])

fn_X = 'X_imputed.csv'
fn_Y = 'y_agg.csv'
dat_X = pd.read_csv(os.path.join(dir_output, fn_X))
# (optional: usecols=['operyr','caseid','cpt'])
dat_Y = pd.read_csv(os.path.join(dir_output, fn_Y))
print(dat_X.shape)
print(dat_Y.shape)
stopifnot(all(dat_X.caseid == dat_Y.caseid))
u_years = dat_X.operyr.unique()
# !! ENCODE CPT AS CATEGORICAL !! #
dat_X['cpt'] = 'c' + dat_X.cpt.astype(str)
cn_X = list(dat_X.columns[2:])
cn_Y = list(dat_Y.columns[2:])

###############################################
# ---- STEP 2: LEAVE-ONE-YEAR - CPT ONLY ---- #

holder_vv = []
holder_phat = []
for ii, vv in enumerate(cn_Y):
    print('##### ------- Outcome %s (%i of %i) -------- #####' %
          (vv, ii + 1, len(cn_Y)))
    tmp_ii = pd.concat([dat_Y.operyr, dat_Y[vv] == -1], axis=1)
# load necessary modules
import numpy as np
import pandas as pd
import os
import gc

from support import support_funs as sf

# set up directories
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
dir_data = os.path.join(dir_base, '..', 'data')
dir_figures = os.path.join(dir_base, '..', 'figures')
sf.stopifnot(all([os.path.exists(x) for x in [dir_output, dir_figures]]))

# manual list of columns to drop

fn = 'combined_raw.csv'
# load in the combined data file
dat = pd.read_csv(os.path.join(dir_output, fn), encoding='iso-8859-1')
dat.sort_values(by='operyr', inplace=True)
dat.reset_index(drop=True, inplace=True)
#dat.drop(columns=vv_drop,inplace=True)
gc.collect()  # free memory after loading the large raw csv

# # load in the dictionary delvin collated
# df_desc = pd.read_csv(os.path.join(dir_output,'master_key.csv')).rename(columns={'variable_label':'desc'})

###################################################
### ---- (1) convert missing values to NAs ---- ###
# load necessary modules
import numpy as np
import pandas as pd
import os
import gc
from support.support_funs import stopifnot, gg_color_hue
from plotnine import *

import seaborn as sns
from matplotlib import pyplot as plt

# set up directories
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
dir_figures = os.path.join(dir_base, '..', 'figures')
stopifnot(all([os.path.exists(x) for x in [dir_output, dir_figures]]))

# Load in the accuracy functions
from support.acc_funs import auc, plot_ppv

##############################################
### ---- (1) load in and process data ---- ###

# load delvin's output
df_scores = pd.read_csv(os.path.join(
    dir_output, 'maizlin_res_df.csv')).rename(columns={
        'actual_y': 'y',
        'model_y': 'yhat'
    })
# load in associated x_data for each
cn_X = ['caseid', 'operyr', 'cpt', 'race', 'sex', 'age_days']
Example #15
import os
import pandas as pd
from sklearn.model_selection import train_test_split as splitter
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics

# set up directories
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, '..', 'output')
dir_data = os.path.join(dir_base, '..', 'data')
dir_figures = os.path.join(dir_base, '..', 'figures')

from support import naive_bayes as mf
from support import support_funs as sf
import support.acc_funs as af
sf.stopifnot(all([os.path.exists(x) for x in [dir_output, dir_figures]]))

##############################################
### ---- (1) LOAD IN AND PROCESS DATA ---- ###

fn_y = 'y_bin.csv'
fn_X = 'X_preop.csv'
#if fn_X not in os.listdir(dir_output):
#    fn_X = 'X_preop.csv'

y_df = pd.read_csv(os.path.join(dir_output, fn_y))
X_df = pd.read_csv(os.path.join(dir_output, fn_X))
sf.stopifnot((y_df.shape[0] == X_df.shape[0])
             & all(y_df.caseid == X_df.caseid))
# --- (!) Encode CPT as string --- #
X_df['cpt'] = X_df.cpt.astype(str)
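# cpt is a high-cardinality categorical code; a minimal sketch of one-hot
# encoding it with the OneHotEncoder imported above (handle_unknown='ignore'
# drops unseen test-time codes instead of raising):
ohe = OneHotEncoder(handle_unknown='ignore')
X_cpt = ohe.fit_transform(X_df[['cpt']])  # scipy sparse matrix: n x n_codes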
Example #16
# Set directories
import os
import pandas as pd
# helper imports assumed, mirroring the other scripts in this repo
from support.support_funs import stopifnot, makeifnot, find_dir_nsqip
dir_NSQIP = find_dir_nsqip()
dir_output = os.path.join(dir_NSQIP, 'output')
assert os.path.exists(dir_output)
dir_figures = os.path.join(dir_NSQIP, 'figures')
makeifnot(dir_figures)
dir_weights = os.path.join(dir_output, 'weights')
makeifnot(dir_weights)

fn_X = 'X_imputed.csv'
fn_Y = 'y_agg.csv'
dat_X = pd.read_csv(os.path.join(dir_output, fn_X))
dat_Y = pd.read_csv(os.path.join(dir_output, fn_Y))
print(dat_X.shape)
print(dat_Y.shape)
stopifnot(all(dat_X.caseid == dat_Y.caseid))
u_years = dat_X.operyr.unique()
# !! ENCODE CPT AS CATEGORICAL !! #
dat_X['cpt'] = 'c' + dat_X.cpt.astype(str)
cn_X = list(dat_X.columns[2:])

# Split Y into the agg vs not
dat_agg = dat_Y.loc[:, dat_Y.columns.str.contains('^agg|caseid|operyr')]
dat_Y = dat_Y.loc[:, ~dat_Y.columns.str.contains('^agg')]
cn_Y = list(dat_Y.columns[2:])
cn_agg = list(dat_agg.columns[2:])

# # If we use 2012/13 as baseline years, what is the y-prop?
# prop_Y = dat_Y.groupby('operyr')[cn_Y].apply(lambda x: x[~(x==-1)].mean()).reset_index()
# prop_Y = prop_Y.melt('operyr',var_name='outcome')
# tmp = dat_Y.groupby('operyr')[cn_Y].apply(lambda x: (x==-1).sum()).reset_index().melt('operyr',