def explore(self, X, y):
    """Run a hyper-parameter search for the MLP classifier.

    The search space depends on ``self.solver``:

    * ``'lbfgs'``: ``alpha`` and ``tol``
    * ``'adam'``:  ``alpha``, ``tol``, ``beta_1`` and ``beta_2``

    Every parameter is drawn from ``Log10Flat(-10, -0.001)``
    (NOTE(review): presumably a log10-uniform distribution -- confirm
    against the definition of ``Log10Flat``).

    Results are written below ``<self.path>/<solver>/<activation>``; the
    path is handed to ``tuning.tune`` unchanged.

    Args:
        X: training features, forwarded to ``tuning.tune``.
        y: training targets, forwarded to ``tuning.tune``.

    Raises:
        KeyError: if ``self.solver`` is neither ``'lbfgs'`` nor ``'adam'``.
    """
    path = os.path.join(self.path, self.solver, self.activation)
    # Parenthesised single-argument form prints identically on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on 3).
    print('--MLP: explore')

    # Reject unsupported solvers before building any distributions.
    if self.solver not in ('lbfgs', 'adam'):
        raise KeyError("explore solver '%s' not implemented" % self.solver)

    # Parameters shared by both supported solvers.
    param_dist = {
        'alpha': Log10Flat(-10, -0.001),
        'tol': Log10Flat(-10, -0.001),
    }
    if self.solver == 'adam':
        # Adam additionally exposes its two moment-decay rates.
        param_dist['beta_1'] = Log10Flat(-10, -0.001)
        param_dist['beta_2'] = Log10Flat(-10, -0.001)

    # jobs is pinned to 1: the original author's note blames a bug in
    # MLPClassifier for this (not verifiable from this file).
    tuning.tune(self, X, y, param_dist,
                path=path,
                jobs=1,
                random_state=self.random_state)
def explore(self, X, y):
    """Run a hyper-parameter search over ``reg_param`` and ``tol``.

    Both parameters are drawn from ``Log10Flat(-10, -0.001)`` and the
    search is delegated to ``tuning.tune``, which receives ``self.path``
    as output location together with ``self.jobs`` and
    ``self.random_state``.

    Args:
        X: training features, forwarded to ``tuning.tune``.
        y: training targets, forwarded to ``tuning.tune``.
    """
    out_dir = self.path

    # Search space, filled in the same order as the tuned parameters
    # are listed above.
    search_space = {}
    search_space['reg_param'] = Log10Flat(-10, -0.001)
    search_space['tol'] = Log10Flat(-10, -0.001)

    # Hand everything over to the shared tuning driver.
    tuning.tune(
        self, X, y, search_space,
        path=out_dir,
        jobs=self.jobs,
        random_state=self.random_state,
    )
def explore(self, X, y):
    """Run a hyper-parameter search for the SVC classifier.

    Only the ``'rbf'`` kernel is supported; for it ``C`` and ``gamma``
    are each drawn from ``Log10Flat(-10, -0.001)``.  Results are written
    below ``<self.path>/<kernel>/shrinking_<self.shrinking>``; the path
    is handed to ``tuning.tune`` unchanged.

    Args:
        X: training features, forwarded to ``tuning.tune``.
        y: training targets, forwarded to ``tuning.tune``.

    Raises:
        KeyError: if ``self.kernel`` is not ``'rbf'``.
    """
    path = os.path.join(self.path, self.kernel,
                        'shrinking_%s' % str(self.shrinking))
    # Parenthesised single-argument form prints identically on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on 3).
    print('--SVC: explore')

    # Guard clause: no search space is defined for other kernels.
    if self.kernel != 'rbf':
        raise KeyError("explore unknown kernel '%s' " % self.kernel)

    param_dist = {
        'C': Log10Flat(-10, -0.001),
        'gamma': Log10Flat(-10, -0.001),
    }

    tuning.tune(self, X, y, param_dist,
                path=path,
                jobs=self.jobs,
                random_state=self.random_state)
def explore(self, X, y):
    """Run a hyper-parameter search for the LDA classifier.

    The tuned parameter depends on ``self.solver``:

    * ``'svd'``: ``tol``
    * any other solver: ``shrinkage``

    In both cases the value is drawn from ``Log10Flat(-10, -0.001)``.
    Results are written below ``<self.path>/<solver>`` so that each core
    algorithm gets its own directory.

    Args:
        X: training features, forwarded to ``tuning.tune``.
        y: training targets, forwarded to ``tuning.tune``.
    """
    # Unique output path per core algorithm.
    path = os.path.join(self.path, self.solver)
    # Parenthesised single-argument form prints identically on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on 3).
    print("--LDA: explore")

    if self.solver == 'svd':
        param_dist = {'tol': Log10Flat(-10, -0.001)}
    else:
        # Every non-svd solver is tuned over its shrinkage parameter.
        param_dist = {'shrinkage': Log10Flat(-10, -0.001)}

    tuning.tune(self, X, y, param_dist,
                path=path,
                jobs=self.jobs,
                random_state=self.random_state)
def train(rseed):
    """Fit a regressor on the 'apacheiva' baseline and save feature importances.

    Steps, all for the data split selected by ``rseed``:

    1. Load the splits from ``load_eicu`` and the ``'apacheiva'`` column
       of each frame returned by ``aps_baselines``.
    2. Tune ``prob_regressor`` on train/valid, scoring with the negated
       ``soft_xent``.
    3. Fit on train only and print the Spearman rank correlation between
       the test-split baseline and the model's test predictions.
    4. Refit on train+valid, explain with ``OneDimExplainer`` and save
       the mean absolute attribution per column, stacked under the
       column names, to ``eicu_importance_<rseed>.npy``.

    Args:
        rseed: split seed, also used in the output file name.

    Returns:
        None.
    """
    # The loader also returns costs/groups/extras; they are not needed here.
    train_part, valid_part, test_part, costs, groups, extras = load_eicu(
        split_seed=rseed)
    feats_train, labels_train = train_part
    feats_valid, labels_valid = valid_part
    feats_test, labels_test = test_part

    # Regression targets: the 'apacheiva' column of each baseline frame.
    score_train, score_valid, score_test = (
        frame['apacheiva'].values
        for frame in aps_baselines(split_seed=rseed))

    # Train+valid pool used for the final fit.
    feats_fit = pd.concat([feats_train, feats_valid])
    labels_fit = np.hstack((labels_train, labels_valid))  # not used below
    score_fit = np.hstack((score_train, score_valid))

    def predfunc(m, X):
        return m.predict(X)

    def scorefunc(y, p):
        return -soft_xent(y, p)

    params = tune(feats_train, feats_valid, score_train, score_valid,
                  mtype=prob_regressor,
                  predfunc=predfunc,
                  scorefunc=scorefunc)

    # Report held-out performance of a model fitted on train only.
    holdout_model = prob_regressor(**params)
    holdout_model.fit(feats_train, score_train)
    rho = stats.spearmanr(score_test,
                          holdout_model.predict(feats_test)).correlation
    print(f'Rank correlation: {rho:.3f}')

    # Final model on train+valid, then global importances from its explainer.
    final_model = prob_regressor(**params)
    final_model.fit(feats_fit, score_fit)
    explainer = OneDimExplainer(final_model)
    attributions = explainer.shap_values(feats_fit)
    mean_abs_attr = np.abs(attributions).mean(0)

    column_names = np.array(feats_train.columns).astype('object')
    np.save(f'eicu_importance_{rseed}.npy',
            np.vstack((column_names, mean_abs_attr)))
def train(dname, mname, rseed, shuffle_params=None):
    """Train and evaluate one cost-aware model (or baseline) on one dataset.

    The model family is selected from ``mname`` by substring / equality
    tests ('gbm', 'linear', 'nn', 'node', 'cegb', 'cwcf', 'fixedmodel',
    'imputemodel', 'aps', 'apacheiii', 'apacheiva', 'qsofa', 'pact').
    Depends on the module-level names ``LOADERS``, ``TUNING`` and
    ``OUTPATH``.

    Args:
        dname: dataset key into ``LOADERS``; the code special-cases
            ``'icu'``, ``'outpatient'`` and ``'trauma'``.
        mname: model name (see above).
        rseed: split seed passed to the loaders; also part of the
            tuned-parameter pickle file name.
        shuffle_params: ``None`` or an indexable pair.  Two negative
            values go (negated) to ``cost_pair``; anything else goes to
            ``cost_swaps``.  Used for the cost-robustness experiments.

    Returns:
        The fitted optimizer object.  For the score baselines
        (aps/apache*/qsofa/pact) this is an identity lambda carrying
        ``model_costs``, ``model_scores`` and ``test_preds`` attributes.

    Raises:
        ValueError: if ``mname`` matches no known model family.
        SystemExit: after saving tuned parameters when ``TUNING == 'SAVE'``.
    """
    ##################
    # DATA
    ##################
    (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = \
        LOADERS[dname](split_seed=rseed)

    # PACT needs some of the extra (redundant) features that were dropped
    # from the main study, so reload the ED data with them included.
    if mname == 'pact':
        (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = \
            load_ed(name=config.ED_NAME, costtype=config.ED_COSTTYPE,
                    drop_redundant=False, split_seed=rseed)

    # Non-GBM methods need NaNs imputed and features scaled.  Skipped for
    # ICU data, where a preprocessing Pipeline takes care of this.
    if ('linear' in mname or 'nn' in mname or 'cwcf' in mname
            or 'node' in mname) and (dname != 'icu'):
        imputer = impute.SimpleImputer()
        scaler = preprocessing.StandardScaler()
        Xtrain_np = scaler.fit_transform(imputer.fit_transform(Xtrain))
        Xvalid_np = scaler.transform(imputer.transform(Xvalid))
        Xtest_np = scaler.transform(imputer.transform(Xtest))
        # Write back in place so column names and index are preserved.
        for df, npy in zip([Xtrain, Xvalid, Xtest],
                           [Xtrain_np, Xvalid_np, Xtest_np]):
            df.iloc[:] = npy

    # Concatenated data for training cost-aware models after tuning
    Xtv = pd.concat([Xtrain, Xvalid])
    ytv = np.hstack((ytrain, yvalid))

    # Grouped costs for datasets that feature them: the outpatient data,
    # or linear/NN on ICU (one-hot encoding of admission dx).
    if (dname == 'outpatient') or (
            dname == 'icu' and mname in ('linear', 'linearh', 'nn')):
        unique_costs = np.array(
            [costs[groups == g].mean() for g in np.unique(groups)])
    else:
        unique_costs = costs

    ##################
    # PARAMETER TUNING
    ##################
    # If we've precomputed best parameters, just load those
    if TUNING == 'LOAD' and (('gbm' in mname)
                             or (mname in ('fixedmodel', 'imputemodel'))
                             or ('linear' in mname) or ('nn' in mname)
                             or ('cegb' in mname)):
        # All GBM-backed methods share the parameters tuned for 'gbmsage'.
        loadname = 'gbmsage' if ((mname in ('fixedmodel', 'imputemodel'))
                                 or ('cegb' in mname)
                                 or ('gbmsage' in mname)) else mname
        # NOTE: pickle.load executes arbitrary code; OUTPATH must only
        # contain files written by this project.
        with open(f'{OUTPATH}/{loadname}-{dname}-{rseed}.pkl', 'rb') as w:
            model = pickle.load(w)
    # Otherwise do some parameter tuning
    else:
        # Tune GBM.  CEGB variants (e.g. grouped CEGB) are matched by
        # substring, consistent with every other branch in this function;
        # an exact match left `model` undefined for them.
        if ('gbm' in mname) or ('cegb' in mname) or (
                mname in ('fixedmodel', 'imputemodel')):
            model = tune(Xtrain, Xvalid, ytrain, yvalid)
        # Linear model needs the one-hot-encoding pipeline on ICU
        elif 'linear' in mname:
            if dname == 'icu':
                model = lintune(Xtrain, Xvalid, ytrain, yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                model = lintune(Xtrain, Xvalid, ytrain, yvalid)
        # NN model needs the one-hot-encoding pipeline on ICU
        elif 'nn' in mname:
            if dname == 'icu':
                model = tftune(Xtrain, Xvalid, ytrain, yvalid,
                               mfunc=icu_preprocessing(get_tf_model),
                               return_extras=False)
            else:
                model = tftune(Xtrain, Xvalid, ytrain, yvalid,
                               return_extras=False)
        # NODE model doesn't need tuning
        elif 'node' in mname:
            model = {}

    # If we indicated we want to save the model, do so and stop.
    if TUNING == 'SAVE' and (('gbm' in mname)
                             or (mname in ('cegb', 'fixedmodel', 'imputemodel'))
                             or ('linear' in mname) or ('nn' in mname)):
        with open(f'{OUTPATH}/{mname}-{dname}-{rseed}.pkl', 'wb') as w:
            pickle.dump(model, w)
        # Same effect as exit(), without relying on the `site` helper.
        raise SystemExit

    # Limit number of jobs for processor-hungry models
    print(mname)
    if mname not in ('qsofa', 'aps', 'apacheiii', 'apacheiva'):
        if (('gbm' in mname) or ('cegb' in mname) or ('linear' in mname)
                or ('imputemodel' in mname)):
            model['n_jobs'] = 4 if dname == 'trauma' else 2

    ##################
    # Setup for CoAI
    ##################
    # Instantiate predictive models
    if ('gbm' in mname) or ('cegb' in mname) or (
            mname in ('fixedmodel', 'imputemodel')):
        bst = lgb.LGBMClassifier(**model)
    elif 'linear' in mname:
        if dname == 'icu':
            bst = icu_preprocessing(FastLinearClassifier)(**model)
        else:
            bst = FastLinearClassifier(**model)
    elif 'nn' in mname:
        if dname == 'icu':
            bst = icu_preprocessing(get_fast_keras)(**model)
        else:
            bst = get_fast_keras(**model)
    elif 'node' in mname:
        if dname == 'icu':
            bst = icu_preprocessing(NodeClassifier)(
                experiment_name=f'trauma{rseed}', **model)
        else:
            bst = NodeClassifier(experiment_name=f'trauma{rseed}', **model)

    # Get our explainer (SAGE is the maintained path; the shap variants
    # are old and may not work perfectly).
    if ('sage' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        # n_permutations is set very low for NODE because it is only
        # being prototyped.
        exp = labelless_sage_wrapper(
            imputetype='marginal',
            refsize=64,
            batch_size=32,
            wrap_categorical=(dname == 'icu'),
            n_permutations=(128 if 'node' in mname else None))
    # Mostly deprecated
    elif mname == 'gbmshap':
        exp = OneDimExplainer
    elif mname == 'linearshap':
        exp = get_pipeline_explainer(LinearExplainer)

    # Prepare to perturb costs if required (robustness experiments)
    if shuffle_params is not None:
        # Two negative numbers indicate individual robustness
        if (shuffle_params[0] < 0) and (shuffle_params[1] < 0):
            costs, shuffle_costs = cost_pair(-shuffle_params[0],
                                             -shuffle_params[1], Xtrain)
        # Otherwise swap robustness: number of swaps and seed
        else:
            shuffle_costs = cost_swaps(costs, shuffle_params[0],
                                       shuffle_params[1])

    # Pick thresholds for CoAI
    dthresh = np.linspace(0, np.sum(unique_costs) + 1, 100)

    #####################
    # Actually train/test
    #####################
    if 'sage' in mname or 'shap' in mname:
        # Wrap model with CoAI
        if 'greedy' in mname:
            GRP = knapsack.GroupGreedy(bst, exp)
        else:
            GRP = knapsack.GroupOptimizer(
                bst, exp,
                scale_ints=1000 * 100 if ('sage' in mname) else 1000)

        # NN: pass epochs/verbosity; on ICU they are routed through the
        # preprocessing pipeline via the `model__` prefix.
        if 'nn' in mname:
            if dname == 'icu':
                GRP.fit(Xtv, ytv, costs, groups, dthresh,
                        model__epochs=10, model__verbose=False)
            else:
                GRP.fit(Xtv, ytv, costs, groups, dthresh,
                        epochs=10, verbose=False)
        # NODE: needs an eval set for its stopping time, so it is fitted
        # on train only.  Coarser thresholds and a short max_iter are
        # prototyping settings.
        elif 'node' in mname:
            dthresh = np.linspace(0, np.sum(unique_costs) + 1, 10)
            if dname == 'icu':
                GRP.fit(Xtrain, ytrain, costs, groups, dthresh,
                        model__eval_set=(Xvalid, yvalid),
                        model__max_iter=15)
            else:
                GRP.fit(Xtrain, ytrain, costs, groups, dthresh,
                        eval_set=(Xvalid, yvalid), max_iter=15)
        # All other CoAI methods get a standardized fit process
        else:
            GRP.fit(Xtv, ytv, costs, groups, dthresh)

        # Evaluate CoAI models
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # If costs get shuffled, each model's deployment cost will change
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)

    # Impute-CoAI with mean imputation
    elif 'fixed' in mname:
        bst = bst.fit(Xtv, ytv)
        GRP = knapsack.FixedModelExactRetainer(bst, exp)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)

    # Impute-CoAI with model-based imputation (IterativeImputer)
    elif 'impute' in mname:
        imputer = impute.IterativeImputer(random_state=0,
                                          estimator=linear_model.RidgeCV())
        bst = bst.fit(Xtv, ytv)
        imputer.fit(Xtv)
        GRP = knapsack.FixedModelImputer(bst, exp, imputer)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)

    # CEGB doesn't use an explainer
    elif 'cegb' in mname:
        GRP = cegb.CEGBOptimizer(model=bst, lambdas=np.logspace(-5, 5, 101))
        GRP.fit(Xtv, ytv, costs,
                groups=(groups if 'group' in mname else None))
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # Account for grouped costs if in outpatient data
        if dname == 'outpatient':
            GRP.recalculate_costs(costs, groups)
        # Account for any cost perturbations
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)

    elif 'cwcf' in mname:
        # On ICU data, encode categoricals as ordinal ints (saves memory,
        # handles groups) and fill the remaining NaNs.
        if dname == 'icu':
            types = Xtrain.dtypes
            for col in Xtrain.columns:
                if str(types[col]) == 'category':
                    l_enc = preprocessing.OrdinalEncoder(
                        handle_unknown='use_encoded_value',
                        unknown_value=np.nan)
                    # Missing values become their own 'UNK' category.
                    # Assign back instead of chained inplace=True: the
                    # `inplace` keyword of add_categories is gone in
                    # pandas 2.0 and chained in-place fills are not
                    # guaranteed to reach the parent frame.
                    for df in [Xtrain, Xvalid, Xtest]:
                        if 'UNK' not in df[col].cat.categories:
                            df[col] = df[col].cat.add_categories(['UNK'])
                        df[col] = df[col].fillna('UNK')
                    Xtrain[col] = l_enc.fit_transform(
                        np.array(Xtrain[col]).reshape(-1, 1))
                    Xvalid[col] = l_enc.transform(
                        np.array(Xvalid[col]).reshape(-1, 1))
                    Xtest[col] = l_enc.transform(
                        np.array(Xtest[col]).reshape(-1, 1))
                elif str(types[col]) == 'int64':
                    # mode() returns a Series; fillna needs the scalar,
                    # otherwise it aligns on index and fills almost nothing.
                    train_modes = Xtrain[col].mode()
                    if not train_modes.empty:
                        fill_value = train_modes.iloc[0]
                        for df in [Xtrain, Xvalid, Xtest]:
                            df[col] = df[col].fillna(fill_value)
                else:
                    # Training-split mean is used for all three splits.
                    fill_value = Xtrain[col].mean()
                    for df in [Xtrain, Xvalid, Xtest]:
                        df[col] = df[col].fillna(fill_value)

        # CWCF only takes nparrays for labels
        ytrain, yvalid, ytest = [np.array(x) for x in (ytrain, yvalid, ytest)]

        # So we know when jobs get farmed out to other processes
        print('Training CWCF...')

        # Trade-off grid: linear in total cost for the Lagrange variant,
        # logarithmic otherwise.  The same grid is used for every
        # supported dataset; an unsupported dname raises KeyError below.
        if 'lagrange' in mname:
            grid = np.linspace(0, np.sum(unique_costs), 17)[1:]
        else:
            grid = np.logspace(-14, 1, 16)
        data_lmbds = {'trauma': grid, 'icu': grid, 'outpatient': grid}

        # Each lambda is run twice to get some stability over reps
        # (can be turned down to 1 when prototyping).
        lmbds = np.hstack([data_lmbds[dname] for _ in range(2)])

        # Run CWCF in external processes.  The groups argument enables
        # experimental group handling (original note: not working yet).
        # More jobs than GPUs may be used to get through the lambda list
        # faster.
        GRP = cwcf.get_cwcf(Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest,
                            costs, lmbds,
                            gpus=np.random.permutation(8),
                            njobs=16,
                            dirname=config.CWCF_TMPDIR,
                            lagrange=('lagrange' in mname),
                            metric=roc_auc_score,
                            difficulty=1000,
                            groups=(groups if 'group' in mname else None))
        print('Done')  # Done with external process run

    # ICU baselines
    elif mname in ('aps', 'apacheiii', 'apacheiva'):
        strain, svalid, stest = aps_baselines(split_seed=rseed)
        preds = stest[mname]
        score = roc_auc_score(ytest, preds)
        cost = config.EICU_SCORE_COSTS[mname]
        # Identity lambda used as a bare attribute holder so baselines
        # expose the same fields as the optimizer objects.
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(preds)

    # `mname in ('qsofa')` was a substring test against a plain string
    # (no trailing comma), so e.g. '' or 'sofa' matched too.
    elif mname == 'qsofa':
        qpreds = qsofa_score(split_seed=rseed)
        score = roc_auc_score(ytest, qpreds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(qpreds)

    # Trauma baseline (PACT).  The resulting cost should be ignored for
    # now in favour of the hand-calculated one.
    elif mname == 'pact':
        cost, score, _, _, _, _, preds = pact_score(
            Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest, costs)
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array(cost), np.array(score)
        GRP.test_preds = np.array(preds)

    else:
        raise ValueError("Model name not found!")

    return GRP
def train(dname, mname, rseed, shuffle_params=None):
    """Tune (and optionally train/evaluate) one model on one dataset.

    This variant adds tuning for 'tab' and 'node' models.  ``mname`` is
    matched by substring / equality tests and must pass the entry
    assertion.  Depends on the module-level names ``LOADERS``,
    ``TUNING`` and ``OUTPATH``.

    Args:
        dname: dataset key into ``LOADERS``; the code special-cases
            ``'icu'``, ``'outpatient'`` and ``'trauma'``.
        mname: model name; must contain 'gbm', 'linear', 'nn', 'tab' or
            'node', or be one of 'cegb', 'fixedmodel', 'imputemodel'.
        rseed: split seed passed to the loader; also part of the
            tuned-parameter pickle file name.
        shuffle_params: ``None`` or an indexable pair.  Two negative
            values go (negated) to ``cost_pair``; anything else goes to
            ``cost_swaps``.

    Returns:
        The fitted optimizer object (or, for score baselines, an identity
        lambda carrying ``model_costs``, ``model_scores``, ``test_preds``).

    Raises:
        AssertionError: if ``mname`` is not a supported model family.
        ValueError: if no training branch matches ``mname``.
        SystemExit: after saving tuned parameters when ``TUNING == 'SAVE'``.
    """
    assert (('gbm' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel'))
            or ('linear' in mname) or ('nn' in mname) or ('tab' in mname)
            or ('node' in mname))

    ##################
    # DATA
    ##################
    (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = \
        LOADERS[dname](split_seed=rseed)

    # PACT needs some of the extra (redundant) features that were dropped
    # from the main study, so reload the ED data with them included.
    if mname == 'pact':
        (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = \
            load_ed(name=config.ED_NAME, costtype=config.ED_COSTTYPE,
                    drop_redundant=False, split_seed=rseed)

    # Non-GBM methods need NaNs imputed and features scaled.  Skipped for
    # ICU data, where a preprocessing Pipeline handles this.
    if ('linear' in mname or 'nn' in mname or 'cwcf' in mname
            or 'tab' in mname or 'node' in mname) and (dname != 'icu'):
        imputer = impute.SimpleImputer()
        scaler = preprocessing.StandardScaler()
        Xtrain_np = scaler.fit_transform(imputer.fit_transform(Xtrain))
        Xvalid_np = scaler.transform(imputer.transform(Xvalid))
        Xtest_np = scaler.transform(imputer.transform(Xtest))
        # Write back in place so column names and index are preserved.
        for df, npy in zip([Xtrain, Xvalid, Xtest],
                           [Xtrain_np, Xvalid_np, Xtest_np]):
            df.iloc[:] = npy

    # Concatenated data for post-tuning
    Xtv = pd.concat([Xtrain, Xvalid])
    ytv = np.hstack((ytrain, yvalid))

    # Grouped costs for datasets that feature them
    if (dname == 'outpatient') or (
            dname == 'icu' and mname in ('linear', 'linearh', 'nn')):
        unique_costs = np.array(
            [costs[groups == g].mean() for g in np.unique(groups)])
    else:
        unique_costs = costs

    ##################
    # PARAMETER TUNING
    ##################
    # If we've precomputed best parameters, just load those
    if TUNING == 'LOAD' and (('gbm' in mname)
                             or (mname in ('cegb', 'fixedmodel', 'imputemodel'))
                             or ('linear' in mname) or ('nn' in mname)
                             or ('tab' in mname)):
        loadname = 'gbmsage' if mname == 'cegb' else mname
        # NOTE: pickle.load executes arbitrary code; OUTPATH must only
        # contain files written by this project.
        with open(f'{OUTPATH}/{loadname}-{dname}-{rseed}.pkl', 'rb') as w:
            model = pickle.load(w)
    # Otherwise do some parameter tuning
    else:
        # Tune GBM
        if ('gbm' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
            model = tune(Xtrain, Xvalid, ytrain, yvalid)
        # Linear model needs the one-hot-encoding pipeline on ICU
        elif 'linear' in mname:
            if dname == 'icu':
                model = lintune(Xtrain, Xvalid, ytrain, yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                model = lintune(Xtrain, Xvalid, ytrain, yvalid)
        # NN model needs the one-hot-encoding pipeline on ICU
        elif 'nn' in mname:
            if dname == 'icu':
                model = tftune(Xtrain, Xvalid, ytrain, yvalid,
                               mfunc=icu_preprocessing(get_tf_model),
                               return_extras=False)
            else:
                model = tftune(Xtrain, Xvalid, ytrain, yvalid,
                               return_extras=False)
        elif 'node' in mname:
            model = nodetune(Xtrain, Xvalid, ytrain, yvalid,
                             mfunc=(icu_preprocessing(NodeClassifier)
                                    if dname == 'icu' else NodeClassifier))
            # Outside ICU, also record the tuned model's test AUC next to
            # its parameters.
            if dname != 'icu':
                bst = NodeClassifier(**model)
                bst.fit(Xtrain, ytrain, eval_set=(Xvalid, yvalid))
                iXtest = bst.dataset.transform(Xtest)
                preds = bst.predict_proba(iXtest)[:, 1]
                score = roc_auc_score(ytest, preds)
                model['test_score'] = score
        elif 'tab' in mname:
            # Categorical columns per dataset (only trauma is listed).
            cat_name_map = {
                'trauma': [
                    'agencylevelfromscene', 'agencymodefromscene', 'ageunits',
                    'causecode', 'ethnicity', 'formfromscene', 'race',
                    'residencestate', 'scenedestinationreason',
                    'scenerespassisted', 'sex'
                ]
            }
            # Positions and cardinalities of those columns, in column
            # order so that the two lists line up.
            cat_idx_map = {
                'trauma': [
                    i for i, c in enumerate(Xtrain.columns)
                    if c in cat_name_map['trauma']
                ]
            }
            cat_dim_map = {
                'trauma': [
                    Xtrain[c].unique().shape[0] for c in Xtrain.columns
                    if c in cat_name_map['trauma']
                ]
            }
            if dname == 'icu':
                model = tabtune(Xtrain.values, Xvalid.values, ytrain, yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                # cat_idxs must be the integer positions (cat_idx_map),
                # matching cat_dims element for element; the column-name
                # list was being passed here while cat_idx_map was
                # computed and never used.
                model = tabtune(Xtrain.values, Xvalid.values, ytrain, yvalid,
                                cat_idxs=cat_idx_map.get(dname, []),
                                cat_dims=cat_dim_map.get(dname, []),
                                cat_emb_dim=2,
                                return_score=True)

    # If we indicated we want to save the model, do so and stop.
    if TUNING == 'SAVE' and (('gbm' in mname)
                             or (mname in ('cegb', 'fixedmodel', 'imputemodel'))
                             or ('linear' in mname) or ('nn' in mname)
                             or ('tab' in mname) or ('node' in mname)):
        with open(f'{OUTPATH}/{mname}-{dname}-{rseed}.pkl', 'wb') as w:
            pickle.dump(model, w)
        # Same effect as exit(), without relying on the `site` helper.
        raise SystemExit

    ##################
    # Setup for CoAI
    ##################
    # Instantiate predictive models
    if ('gbm' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        bst = lgb.LGBMClassifier(**model)
    elif 'linear' in mname:
        if dname == 'icu':
            bst = icu_preprocessing(FastLinearClassifier)(**model)
        else:
            bst = FastLinearClassifier(**model)
    elif 'nn' in mname:
        if dname == 'icu':
            bst = icu_preprocessing(get_fast_keras)(**model)
        else:
            bst = get_fast_keras(**model)

    # Get our explainer (SAGE is the maintained path; shap is old and may
    # not work perfectly).
    if ('sage' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        exp = labelless_sage_wrapper(imputetype='marginal',
                                     refsize=64,
                                     batch_size=32,
                                     wrap_categorical=(dname == 'icu'))
    elif mname == 'gbmshap':
        exp = OneDimExplainer
    elif mname == 'linearshap':
        exp = get_pipeline_explainer(LinearExplainer)

    # Prepare to shuffle costs if required
    if shuffle_params is not None:
        if (shuffle_params[0] < 0) and (shuffle_params[1] < 0):
            costs, shuffle_costs = cost_pair(-shuffle_params[0],
                                             -shuffle_params[1], Xtrain)
        else:
            shuffle_costs = cost_swaps(costs, shuffle_params[0],
                                       shuffle_params[1])

    # Pick thresholds for CoAI
    dthresh = np.linspace(0, np.sum(unique_costs) + 1, 100)

    #####################
    # Actually train/test
    #####################
    if 'sage' in mname or 'shap' in mname:
        GRP = knapsack.GroupOptimizer(
            bst, exp, scale_ints=1000 * 100 if ('sage' in mname) else 1000)
        # NN: pass epochs/verbosity; on ICU they are routed through the
        # preprocessing pipeline via the `model__` prefix.
        if 'nn' in mname:
            if dname == 'icu':
                GRP.fit(Xtv, ytv, costs, groups, dthresh,
                        model__epochs=10, model__verbose=False)
            else:
                GRP.fit(Xtv, ytv, costs, groups, dthresh,
                        epochs=10, verbose=False)
        else:
            GRP.fit(Xtv, ytv, costs, groups, dthresh)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # If costs get shuffled, each model's deployment cost will change
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)

    elif 'fixed' in mname:
        bst = bst.fit(Xtv, ytv)
        GRP = knapsack.FixedModelExactRetainer(bst, exp)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)

    elif 'impute' in mname:
        imputer = impute.IterativeImputer(random_state=0,
                                          estimator=linear_model.RidgeCV())
        bst = bst.fit(Xtv, ytv)
        imputer.fit(Xtv)
        GRP = knapsack.FixedModelImputer(bst, exp, imputer)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)

    elif mname == 'cegb':
        GRP = cegb.CEGBOptimizer(model=bst, lambdas=np.logspace(-5, 5, 101))
        GRP.fit(Xtv, ytv, costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        if dname == 'outpatient':
            GRP.recalculate_costs(costs, groups)
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)

    # NOTE: the branches below are only reachable when the entry
    # assertion is disabled (python -O); they are kept as in the original.
    elif mname == 'cwcf':
        # CWCF only takes nparrays for labels
        ytrain, yvalid, ytest = [np.array(x) for x in (ytrain, yvalid, ytest)]
        print('Training CWCF...')
        # Each lambda is run twice.
        lmbds = np.hstack([np.logspace(-14, 1, 16) for _ in range(2)])
        GRP = cwcf.get_cwcf(Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest,
                            costs, lmbds,
                            gpus=list(range(8)),
                            njobs=32,
                            dirname=config.CWCF_TMPDIR,
                            metric=roc_auc_score,
                            difficulty=1000)
        print('Done')

    elif mname in ('aps', 'apacheiii', 'apacheiva'):
        strain, svalid, stest = aps_baselines()
        mpreds = bootstrap_set(stest, rseed=rseed)
        preds = mpreds[mname]
        score = roc_auc_score(ytest, preds)
        cost = config.EICU_SCORE_COSTS[mname]
        # Identity lambda used as a bare attribute holder so baselines
        # expose the same fields as the optimizer objects.
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(preds)

    # `mname in ('qsofa')` was a substring test against a plain string
    # (no trailing comma); compare for equality instead.
    elif mname == 'qsofa':
        qtest = qsofa_score()
        qpreds = bootstrap_set(qtest, rseed=rseed)
        score = roc_auc_score(ytest, qpreds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(qpreds)

    elif mname == 'pact':
        cost, score, _, _, _, _, preds = pact_score(
            Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest, costs)
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array(cost), np.array(score)
        GRP.test_preds = np.array(preds)

    else:
        raise ValueError("Model name not found!")

    return GRP
for item in RESULTS: management.print_tuples(item) else: RESULTS = management.list_values(DB_QUEUE, sys.argv[2], sys.argv[3], sys.argv[4]) for item in RESULTS: management.print_tuples(item) elif COMMAND == "tune": if len(sys.argv) != 5: print("Usage: \"" + sys.argv[0] + " tune <table> <benchmark> <scenario>\"") print( "Returns the optimums values in the table, for a given scenario.") else: management.print_tuples( tuning.tune(DB_QUEUE, sys.argv[2], sys.argv[3], sys.argv[4])) elif COMMAND == "quartiles": if len(sys.argv) != 5: print("Usage: \"" + sys.argv[0] + " quartiles <table> <benchmark> <scenario>\"") print( "Returns the quartiles for the data in the table, for a given scenario." ) else: management.print_tuples( statistical_analysis.get_quartiles(DB_QUEUE, sys.argv[2], sys.argv[3], sys.argv[4])) elif COMMAND == "histogram": if len(sys.argv) != 5: print("Usage: \"" + sys.argv[0] + " histogram <table> <benchmark> <scenario>\"")