Example #1
 def explore(self, X, y):
     path = os.path.join(self.path, self.solver, self.activation)
     print('--MLP: explore')
     # specify parameters for exploration
     if self.solver == 'lbfgs':
         param_dist = {
             'alpha': Log10Flat(-10, -0.001),
             'tol': Log10Flat(-10, -0.001),
         }
     elif self.solver == 'adam':
         param_dist = {
             'alpha': Log10Flat(-10, -0.001),
             'tol': Log10Flat(-10, -0.001),
             'beta_1': Log10Flat(-10, -0.001),
             'beta_2': Log10Flat(-10, -0.001),
         }
     else:
         raise KeyError("explore solver '%s' not implementd" % self.solver)
     # tune classifier
     tuning.tune(
         self,
         X,
         y,
         param_dist,
         path=path,
         jobs=1,  # forced to 1: works around a parallelism bug in MLPClassifier
         random_state=self.random_state)
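All of these explore methods draw hyperparameters from Log10Flat(-10, -0.001), whose definition isn't shown on this page. Judging by the name and by how scipy-style distributions are usually handed to random-search tuners, it plausibly samples values flat in log10-space. A minimal sketch under that assumption (the real Log10Flat may differ):

import numpy as np

class Log10Flat:
    # Hypothetical sketch: values are flat in log10-space, i.e.
    # 10**u with u drawn uniformly from [lo, hi].
    def __init__(self, lo, hi):
        self.lo, self.hi = lo, hi

    def rvs(self, random_state=None):
        # scipy-style rvs() so instances can sit in a param_dist dict
        rng = np.random.default_rng(random_state)
        return 10.0 ** rng.uniform(self.lo, self.hi)

With lo=-10, hi=-0.001 this spans roughly 1e-10 up to about 1, matching the ranges used above.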
Example #2
 def explore(self, X, y):
     path = self.path
     # specify parameters for exploration
     param_dist = {
         'reg_param': Log10Flat(-10, -0.001),
         'tol': Log10Flat(-10, -0.001),
     }
     # tune classifier
     tuning.tune(self,
                 X,
                 y,
                 param_dist,
                 path=path,
                 jobs=self.jobs,
                 random_state=self.random_state)
Example #3
 def explore(self, X, y):
     path = os.path.join(self.path, self.kernel,
                         'shrinking_%s' % str(self.shrinking))
     print('--SVC: explore')
     # specify parameters for exploration
     if self.kernel == 'rbf':
         param_dist = {
             'C': Log10Flat(-10, -0.001),
             'gamma': Log10Flat(-10, -0.001),
         }
     else:
         raise KeyError("explore unknown kernel '%s'" % self.kernel)
     # tune classifier
     tuning.tune(self,
                 X,
                 y,
                 param_dist,
                 path=path,
                 jobs=self.jobs,
                 random_state=self.random_state)
Example #4
 def explore(self, X, y):
     # generate unique path dependent on core algorithm
     path = os.path.join(self.path, self.solver)
     print "--LDA: explore"
     # specify parameters for exploration
     if self.solver == 'svd':
         param_dist = {'tol': Log10Flat(-10, -0.001)}
     else:
         param_dist = {
             'shrinkage': Log10Flat(-10, -0.001),
         }
     # tune classifier
     tuning.tune(self,
                 X,
                 y,
                 param_dist,
                 path=path,
                 jobs=self.jobs,
                 random_state=self.random_state)
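Examples #1, #3, and #4 all repeat the same if/elif dispatch from a solver or kernel name to a parameter dictionary. A hypothetical refactor, reusing Log10Flat, tuning.tune, and the attributes assumed above, replaces the chain with a lookup table:

import os

# Hypothetical sketch: table-driven parameter spaces instead of if/elif chains.
PARAM_DISTS = {
    'lbfgs': {'alpha': Log10Flat(-10, -0.001), 'tol': Log10Flat(-10, -0.001)},
    'adam': {
        'alpha': Log10Flat(-10, -0.001),
        'tol': Log10Flat(-10, -0.001),
        'beta_1': Log10Flat(-10, -0.001),
        'beta_2': Log10Flat(-10, -0.001),
    },
}

def explore(self, X, y):
    try:
        param_dist = PARAM_DISTS[self.solver]
    except KeyError:
        raise KeyError("explore solver '%s' not implemented" % self.solver)
    tuning.tune(self, X, y, param_dist,
                path=os.path.join(self.path, self.solver),
                jobs=self.jobs,
                random_state=self.random_state)

Adding a new solver then only requires a new table entry, not a new branch.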
Example #5
def train(rseed):
    (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = \
        load_eicu(split_seed=rseed)

    strain, svalid, stest = [
        s['apacheiva'].values for s in aps_baselines(split_seed=rseed)
    ]  # alternatively: [extras['apacheiva'][k] for k in ['train', 'valid', 'test']]

    Xtv = pd.concat([Xtrain, Xvalid])
    ytv = np.hstack((ytrain, yvalid))
    stv = np.hstack((strain, svalid))

    model = tune(Xtrain,
                 Xvalid,
                 strain,
                 svalid,
                 mtype=prob_regressor,
                 predfunc=lambda m, X: m.predict(X),
                 scorefunc=lambda y, p: -soft_xent(y, p))

    # Print performance
    vbst = prob_regressor(**model)
    vbst.fit(Xtrain, strain)
    print(
        f'Rank correlation: {stats.spearmanr(stest,vbst.predict(Xtest)).correlation:.3f}'
    )

    bst = prob_regressor(**model)
    bst.fit(Xtv, stv)
    exp = OneDimExplainer(bst)
    shaps = exp.shap_values(Xtv)
    global_importance = np.abs(shaps).mean(0)
    np.save(
        f'eicu_importance_{rseed}.npy',
        np.vstack(
            (np.array(Xtrain.columns).astype('object'), global_importance)))
Example #6
def train(dname, mname, rseed, shuffle_params=None):
    #     ICU preprocessing is now in its own function
    #     mtype = MTYPES[mname]
    #     kwargs = {}
    #     if dname=='icu' and ('linear' in mname or 'nn' in mname or 'cwcf' in mname): kwargs['onehot']=True

    #     CWCF now runs in parallel across several GPUs
    #     if mname=='cwcf' and 'CUDA_VISIBLE_DEVICES' not in os.environ:
    #         ngpu = len(tf.config.list_physical_devices('GPU'))
    #         cur_gpu = rseed%ngpu if rseed is not None else 0
    #         os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
    #         os.environ["CUDA_VISIBLE_DEVICES"]=str(cur_gpu)

    ##################
    # DATA
    ##################
    # Load data we're using
    (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = \
        LOADERS[dname](split_seed=rseed)  # ,**kwargs

    # If we're using PACT we need some of the extra (redundant) features that were unused in our study
    if mname == 'pact':
        (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = \
            load_ed(name=config.ED_NAME,
                    costtype=config.ED_COSTTYPE,
                    drop_redundant=False,
                    split_seed=rseed)
    #     print([(n,c) for n,c in zip(Xtrain.columns,costs) if c>0.01])

    #     Xtrain_raw,Xvalid_raw,Xtest_raw = Xtrain,Xvalid,Xtest

    #     For bootstrapping, we don't do this anymore and do train/test splits instead
    #     Xtrain_raw, ytrain = bootstrap_set(Xtrain,ytrain,rseed=rseed)
    #     Xvalid_raw, yvalid = bootstrap_set(Xvalid,yvalid,rseed=rseed)
    #     Xtest_raw, ytest = bootstrap_set(Xtest,ytest,rseed=rseed)

    # If we're using a non-GBM AI method, we need to impute NaNs and scale
    # Don't do this if using ICU data because we're using a Pipeline in that case
    # that handles this stuff
    if ('linear' in mname or 'nn' in mname or 'cwcf' in mname
            or 'node' in mname) and (dname != 'icu'):
        imputer = impute.SimpleImputer()
        scaler = preprocessing.StandardScaler()
        Xtrain_np = scaler.fit_transform(imputer.fit_transform(Xtrain))
        Xvalid_np = scaler.transform(imputer.transform(Xvalid))
        Xtest_np = scaler.transform(imputer.transform(Xtest))

        for df, npy in zip([Xtrain, Xvalid, Xtest],
                           [Xtrain_np, Xvalid_np, Xtest_np]):
            df.iloc[:] = npy

    # Hackier code for preprocessing features, can probably remove:
    #     Xtrain, Xvalid, Xtest = [pd.DataFrame(data=npy, columns=df.columns, index=df.index)
    #                              for df, npy in zip([Xtrain_raw, Xvalid_raw, Xtest_raw],
    #                                                 [Xtrain_np, Xvalid_np, Xtest_np])]
    # else:
    #     (Xtrain, Xvalid, Xtest) = Xtrain_raw, Xvalid_raw, Xtest_raw

    # Concatenated data for training cost-aware models after tuning
    Xtv = pd.concat([Xtrain, Xvalid])
    ytv = np.hstack((ytrain, yvalid))

    # Grouped costs for datasets that feature them:
    # the outpatient dataset, or linear/NN on ICU (one-hot encoding of admission dx)
    if (dname == 'outpatient') or (dname == 'icu'
                                   and mname in ('linear', 'linearh', 'nn')):
        unique_costs = np.array(
            [costs[groups == g].mean() for g in np.unique(groups)])
    else:
        unique_costs = costs

    ##################
    # PARAMETER TUNING
    ##################
    # If we've precomputed best parameters, just load those
    if TUNING == 'LOAD' and (('gbm' in mname) or
                             (mname in ('fixedmodel', 'imputemodel')) or
                             ('linear' in mname) or ('nn' in mname) or
                             ('cegb' in mname)):
        loadname = 'gbmsage' if ((mname in ('fixedmodel', 'imputemodel')) or
                                 ('cegb' in mname) or
                                 ('gbmsage' in mname)) else mname
        with open(f'{OUTPATH}/{loadname}-{dname}-{rseed}.pkl', 'rb') as w:
            model = pickle.load(w)
    # Otherwise do some parameter tuning
    else:
        # Tune GBM
        if ('gbm' in mname) or (mname
                                in ('cegb', 'fixedmodel', 'imputemodel')):
            model = tune(Xtrain, Xvalid, ytrain, yvalid)
        # Linear model needs onehotencoding pipeline if we're doing ICU
        elif ('linear' in mname):
            if (dname == 'icu'):
                model = lintune(Xtrain,
                                Xvalid,
                                ytrain,
                                yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                model = lintune(Xtrain, Xvalid, ytrain, yvalid)
        # NN model needs onehotencoding pipeline if we're doing ICU
        elif 'nn' in mname:
            if (dname == 'icu'):
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               mfunc=icu_preprocessing(get_tf_model),
                               return_extras=False)
            else:
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               return_extras=False)
        # NODE model doesn't need tuning
        elif 'node' in mname:
            model = {}
    # If we indicated we want to save the model, do so
    # print(model)
    if TUNING == 'SAVE' and (('gbm' in mname) or
                             (mname in ('cegb', 'fixedmodel', 'imputemodel'))
                             or ('linear' in mname) or ('nn' in mname)):
        with open(f'{OUTPATH}/{mname}-{dname}-{rseed}.pkl', 'wb') as w:
            pickle.dump(model, w)
            exit()

    # Limit number of jobs for processor-hungry models
    print(mname)
    if mname not in ('qsofa', 'aps', 'apacheiii', 'apacheiva'):
        if (('gbm' in mname) or ('cegb' in mname) or ('linear' in mname)
                or ('imputemodel' in mname)):
            model['n_jobs'] = 4 if dname == 'trauma' else 2
        # else: model['n_jobs'] = 10

    ##################
    # Setup for CoAI
    ##################
    # Instantiate predictive models
    if ('gbm' in mname) or ('cegb' in mname) or (mname in ('fixedmodel',
                                                           'imputemodel')):
        bst = lgb.LGBMClassifier(**model)
    elif 'linear' in mname:
        bst = icu_preprocessing(FastLinearClassifier)(
            **model) if dname == 'icu' else FastLinearClassifier(**model)
    elif 'nn' in mname:
        bst = icu_preprocessing(get_fast_keras)(
            **model) if dname == 'icu' else get_fast_keras(**model)
    elif 'node' in mname:
        if dname == 'icu':
            bst = icu_preprocessing(NodeClassifier)(
                experiment_name=f'trauma{rseed}', **model)
        else:
            bst = NodeClassifier(experiment_name=f'trauma{rseed}', **model)

    # Get our explainer (using SAGE entirely now, shap code is old & may not work perfectly)
    if ('sage' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        #sage_params={'imputetype':'default'}
        #if 'gbm' in mname: sage_params={'imputetype':'marginal'}

        # SAGE explainer. N_permutations set super low for NODE bc we're
        # just testing it right now
        exp = labelless_sage_wrapper(
            imputetype='marginal',
            refsize=64,
            batch_size=32,
            wrap_categorical=(dname == 'icu'),
            n_permutations=(128 if 'node' in mname else None))

        # NODE debugging line:
        # print(dict(imputetype=('default' if 'node' in mname else 'marginal'),
        #            refsize=(1 if 'node' in mname else 64)))

    # Mostly deprecated
    elif mname == 'gbmshap':
        exp = OneDimExplainer
    elif mname == 'linearshap':
        exp = get_pipeline_explainer(LinearExplainer)

    # Prepare to perturb costs if required (robustness experiments)
    if shuffle_params is not None:
        # Negative numbers indicate individual robustness
        if ((shuffle_params[0] < 0) and (shuffle_params[1] < 0)):
            costs, shuffle_costs = cost_pair(-shuffle_params[0],
                                             -shuffle_params[1], Xtrain)
        # Positive values indicate swap robustness: number of swaps and a seed
        else:
            shuffle_costs = cost_swaps(costs, shuffle_params[0],
                                       shuffle_params[1])
    # Pick thresholds for CoAI
    dthresh = np.linspace(0, np.sum(unique_costs) + 1, 100)

    #####################
    # Actually train/test
    #####################
    if 'sage' in mname or 'shap' in mname:
        # Wrap model with CoAI
        if 'greedy' in mname:
            GRP = knapsack.GroupGreedy(bst, exp)
        else:
            GRP = knapsack.GroupOptimizer(bst,
                                          exp,
                                          scale_ints=1000 * 100 if
                                          ('sage' in mname) else 1000)
        # NN needs preprocessing pipeline if ICU, also pass # epochs, verbosity
        if 'nn' in mname:
            if dname == 'icu':
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        model__epochs=10,
                        model__verbose=False)
            else:
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        epochs=10,
                        verbose=False)
        # NODE needs preprocessing for ICU.
        # Also requires eval set for stopping time
        # Current max_iter is short for prototyping
        elif 'node' in mname:
            dthresh = np.linspace(0, np.sum(unique_costs) + 1, 10)
            if dname == 'icu':
                GRP.fit(Xtrain,
                        ytrain,
                        costs,
                        groups,
                        dthresh,
                        model__eval_set=(Xvalid, yvalid),
                        model__max_iter=15)
            else:
                GRP.fit(Xtrain,
                        ytrain,
                        costs,
                        groups,
                        dthresh,
                        eval_set=(Xvalid, yvalid),
                        max_iter=15)
        # All other CoAI methods get a standardized fit process
        else:
            GRP.fit(Xtv, ytv, costs, groups, dthresh)
        # Evaluate CoAI models
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # If costs get shuffled, each model's deployment cost will change
        if shuffle_params: GRP.recalculate_costs(shuffle_costs)

    # Impute-CoAI with mean imputation
    elif 'fixed' in mname:
        bst = bst.fit(Xtv, ytv)
        GRP = knapsack.FixedModelExactRetainer(bst, exp)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params: GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    # Impute-CoAI with model-based imputation (IterativeImputer)
    elif 'impute' in mname:
        imputer = impute.IterativeImputer(random_state=0,
                                          estimator=linear_model.RidgeCV())
        bst = bst.fit(Xtv, ytv)
        imputer.fit(Xtv)
        GRP = knapsack.FixedModelImputer(bst, exp, imputer)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params: GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    # GRP.fit(Xtv, ytv, costs, groups, dthresh) if mname == 'default' else GRP.fit(Xtv, ytv, costs, dthresh)

    # CEGB doesn't use an explainer
    elif ('cegb' in mname):
        GRP = cegb.CEGBOptimizer(model=bst, lambdas=np.logspace(-5, 5, 101))
        GRP.fit(Xtv, ytv, costs, groups=(groups if 'group' in mname else None))
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # Account for grouped costs if in outpatient data
        if (dname == 'outpatient'): GRP.recalculate_costs(costs, groups)
        # Account for any cost perturbations
        if shuffle_params: GRP.recalculate_costs(shuffle_costs)
    elif ('cwcf' in mname):
        # Lots of preprocessing if using ICU data to encode categoricals
        # as ordinal ints (save memory, handle groups, etc)
        if dname == 'icu':
            types = Xtrain.dtypes
            for col in Xtrain.columns:
                if str(types[col]) == 'category':
                    l_enc = preprocessing.OrdinalEncoder(
                        handle_unknown='use_encoded_value',
                        unknown_value=np.nan)
                    for df in [Xtrain, Xvalid, Xtest]:
                        if 'UNK' not in df[col].cat.categories:
                            df[col].cat.add_categories(['UNK'], inplace=True)
                        df[col].fillna('UNK', inplace=True)
                    Xtrain[col] = l_enc.fit_transform(
                        np.array(Xtrain[col]).reshape(-1, 1))
                    Xvalid[col] = l_enc.transform(
                        np.array(Xvalid[col]).reshape(-1, 1))
                    Xtest[col] = l_enc.transform(
                        np.array(Xtest[col]).reshape(-1, 1))

                # Old mode imputation code, better now (it broke on dtype):
                #     for df in [Xtrain, Xvalid, Xtest]:
                #         if df[col].isna().any():
                #             df[col][df[col].isna()] = Xtrain[col].mode().iloc[0]
                #     Xtrain[col] = Xtrain[col].fillna(Xtrain[col].mode().iloc[0])
                #     Xvalid[col] = Xvalid[col].fillna(Xtrain[col].mode().iloc[0])
                #     Xtest[col] = Xtest[col].fillna(Xtrain[col].mode().iloc[0])
                elif str(types[col]) == 'int64':
                    # .mode() returns a Series; use its first value as the fill
                    Xtrain[col].fillna(Xtrain[col].mode().iloc[0], inplace=True)
                    Xvalid[col].fillna(Xtrain[col].mode().iloc[0], inplace=True)
                    Xtest[col].fillna(Xtrain[col].mode().iloc[0], inplace=True)
                else:
                    Xtrain[col].fillna(Xtrain[col].mean(), inplace=True)
                    Xvalid[col].fillna(Xtrain[col].mean(), inplace=True)
                    Xtest[col].fillna(Xtrain[col].mean(), inplace=True)

        # CWCF only takes nparrays for labels
        ytrain, yvalid, ytest = [np.array(x) for x in (ytrain, yvalid, ytest)]
        print('Training CWCF...')  # so we know when jobs get farmed out to other processes
        # Used to turn "groups" down to 6 for outpatient just to prototype group support
        if 'lagrange' in mname:
            data_lmbds = {
                'trauma': np.linspace(0, np.sum(unique_costs), 17)[1:],
                'icu': np.linspace(0, np.sum(unique_costs), 17)[1:],
                'outpatient': np.linspace(0, np.sum(unique_costs), 17)[1:]
            }
        else:
            data_lmbds = {
                'trauma': np.logspace(-14, 1, 16),
                'icu': np.logspace(-14, 1, 16),
                'outpatient': np.logspace(-14, 1, 16)
            }
        # This is usually range(2) to get some stability over reps -- doesn't matter as much for outpatient
        # Can turn down to 1 when prototyping
        lmbds = np.hstack([data_lmbds[dname] for _ in range(2)])
        # Old single threaded mode
        #         GRP = cwcf.CWCFClassifier(costs=costs,dirname=config.CWCF_TMPDIR)
        #         GRP.fit(Xtrain,Xvalid,Xtest,ytrain,yvalid,ytest)
        #         print([x.shape for x in (Xtrain,Xvalid,Xtest,ytrain,yvalid,ytest,costs,lmbds)])

        # Run CWCF - groups argument does experimental groups handling (not working yet)
        # More jobs (even more than GPUs) can be used - gets you through the lambda list faster
        # Set up right now for L3 gpus 1-6.
        GRP = cwcf.get_cwcf(Xtrain,
                            Xvalid,
                            Xtest,
                            ytrain,
                            yvalid,
                            ytest,
                            costs,
                            lmbds,
                            gpus=np.random.permutation(8),
                            njobs=16,
                            dirname=config.CWCF_TMPDIR,
                            lagrange=('lagrange' in mname),
                            metric=roc_auc_score,
                            difficulty=1000,
                            groups=(groups if 'group' in mname else None))
        print('Done')  # Done with external process run
    # ICU baselines
    elif mname in ('aps', 'apacheiii', 'apacheiva'):
        strain, svalid, stest = aps_baselines(split_seed=rseed)
        mpreds = stest
        #         mpreds = bootstrap_set(mpreds,rseed=rseed)
        preds = mpreds[mname]
        score = roc_auc_score(ytest, preds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(preds)
    elif mname == 'qsofa':
        qtest = qsofa_score(split_seed=rseed)
        qpreds = qtest  #bootstrap_set(qtest,rseed=rseed)
        score = roc_auc_score(ytest, qpreds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(qpreds)
    # Trauma baseline (PACT)
    # Should ignore the resulting cost for now and just use
    # the hand-calculated one
    elif mname == 'pact':
        cost, score, _, _, _, _, preds = pact_score(Xtrain, Xvalid, Xtest,
                                                    ytrain, yvalid, ytest,
                                                    costs)
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array(cost), np.array(score)
        GRP.test_preds = np.array(preds)
    else:
        raise ValueError("Model name not found!")

    # Done
    return GRP  #(GRP.model_costs, GRP.model_scores)
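The score baselines above (aps/apacheiii/apacheiva, qsofa, pact) attach model_costs, model_scores, and test_preds to a bare lambda purely as an attribute carrier. A hypothetical, more idiomatic stand-in is types.SimpleNamespace; the cost, score, and preds names below stand in for the values computed in those branches:

import numpy as np
from types import SimpleNamespace

# Sketch: a namespace object instead of `GRP = lambda x: x` plus attribute
# assignment; downstream code reading GRP.model_costs etc. is unaffected.
GRP = SimpleNamespace(model_costs=np.array([cost]),
                      model_scores=np.array([score]),
                      test_preds=np.array(preds))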
Example #7
def train(dname, mname, rseed, shuffle_params=None):
    assert (('gbm' in mname)
            or (mname in ('cegb', 'fixedmodel', 'imputemodel'))
            or ('linear' in mname) or ('nn' in mname) or ('tab' in mname)
            or ('node' in mname))

    ##################
    # DATA
    ##################
    # Load data we're using
    (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = \
        LOADERS[dname](split_seed=rseed)  # ,**kwargs

    # If we're using PACT we need some of the extra (redundant) features that were unused in our study
    if mname == 'pact':
        (Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest), costs, groups, extras = \
            load_ed(name=config.ED_NAME,
                    costtype=config.ED_COSTTYPE,
                    drop_redundant=False,
                    split_seed=rseed)

    # If we're using a non-GBM AI method, we need to impute NaNs and scale
    # Don't do this if using ICU data because we're using a Pipeline in that case
    # that handles this stuff
    if ('linear' in mname or 'nn' in mname or 'cwcf' in mname or 'tab' in mname
            or 'node' in mname) and (dname != 'icu'):
        imputer = impute.SimpleImputer()
        scaler = preprocessing.StandardScaler()
        Xtrain_np = scaler.fit_transform(imputer.fit_transform(Xtrain))
        Xvalid_np = scaler.transform(imputer.transform(Xvalid))
        Xtest_np = scaler.transform(imputer.transform(Xtest))

        for df, npy in zip([Xtrain, Xvalid, Xtest],
                           [Xtrain_np, Xvalid_np, Xtest_np]):
            df.iloc[:] = npy
    #     Xtrain, Xvalid, Xtest = [pd.DataFrame(data=npy, columns=df.columns, index=df.index)
    #                              for df, npy in zip([Xtrain_raw, Xvalid_raw, Xtest_raw],
    #                                                 [Xtrain_np, Xvalid_np, Xtest_np])]
    # else:
    #     (Xtrain, Xvalid, Xtest) = Xtrain_raw, Xvalid_raw, Xtest_raw

    # Concatenated data for post-tuning
    Xtv = pd.concat([Xtrain, Xvalid])
    ytv = np.hstack((ytrain, yvalid))

    # Grouped costs for datasets that feature them
    if (dname == 'outpatient') or (dname == 'icu'
                                   and mname in ('linear', 'linearh', 'nn')):
        unique_costs = np.array(
            [costs[groups == g].mean() for g in np.unique(groups)])
    else:
        unique_costs = costs

    ##################
    # PARAMETER TUNING
    ##################
    # If we've precomputed best parameters, just load those
    if TUNING == 'LOAD' and (
        ('gbm' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')) or
        ('linear' in mname) or ('nn' in mname) or ('tab' in mname)):
        loadname = 'gbmsage' if mname == 'cegb' else mname
        with open(f'{OUTPATH}/{loadname}-{dname}-{rseed}.pkl', 'rb') as w:
            model = pickle.load(w)
    # Otherwise do some parameter tuning
    else:
        # Tune GBM
        if ('gbm' in mname) or (mname
                                in ('cegb', 'fixedmodel', 'imputemodel')):
            model = tune(Xtrain, Xvalid, ytrain, yvalid)
        # Linear model needs onehotencoding pipeline if we're doing ICU
        elif ('linear' in mname):
            if (dname == 'icu'):
                model = lintune(Xtrain,
                                Xvalid,
                                ytrain,
                                yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                model = lintune(Xtrain, Xvalid, ytrain, yvalid)
        # NN model needs onehotencoding pipeline if we're doing ICU
        elif 'nn' in mname:
            if (dname == 'icu'):
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               mfunc=icu_preprocessing(get_tf_model),
                               return_extras=False)
            else:
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               return_extras=False)
        elif 'node' in mname:
            model = nodetune(Xtrain,
                             Xvalid,
                             ytrain,
                             yvalid,
                             mfunc=(icu_preprocessing(NodeClassifier)
                                    if dname == 'icu' else NodeClassifier))
            if dname != 'icu':
                bst = NodeClassifier(**model)
                bst.fit(Xtrain, ytrain, eval_set=(Xvalid, yvalid))
                iXtest = bst.dataset.transform(Xtest)
                preds = bst.predict_proba(iXtest)[:, 1]
                score = roc_auc_score(ytest, preds)
                model['test_score'] = score
        elif ('tab' in mname):
            cat_name_map = {
                'trauma': [
                    'agencylevelfromscene', 'agencymodefromscene', 'ageunits',
                    'causecode', 'ethnicity', 'formfromscene', 'race',
                    'residencestate', 'scenedestinationreason',
                    'scenerespassisted', 'sex'
                ]
            }
            cat_idx_map = {
                'trauma': [
                    i for i, c in enumerate(Xtrain.columns)
                    if c in cat_name_map['trauma']
                ]
            }
            cat_dim_map = {
                'trauma': [
                    Xtrain[c].unique().shape[0]
                    for i, c in enumerate(Xtrain.columns)
                    if c in cat_name_map['trauma']
                ]
            }
            if (dname == 'icu'):
                model = tabtune(Xtrain.values,
                                Xvalid.values,
                                ytrain,
                                yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                model = tabtune(Xtrain.values,
                                Xvalid.values,
                                ytrain,
                                yvalid,
                                cat_idxs=cat_idx_map.get(dname, []),  # indices (cat_idx_map), not names
                                cat_dims=cat_dim_map.get(dname, []),
                                cat_emb_dim=2,
                                return_score=True)
    # If we indicated we want to save the model, do so
    if TUNING == 'SAVE' and (('gbm' in mname) or
                             (mname in ('cegb', 'fixedmodel', 'imputemodel'))
                             or ('linear' in mname) or ('nn' in mname) or
                             ('tab' in mname) or ('node' in mname)):
        with open(f'{OUTPATH}/{mname}-{dname}-{rseed}.pkl', 'wb') as w:
            pickle.dump(model, w)
            exit()

    ##################
    # Setup for CoAI
    ##################
    # Instantiate predictive models
    if ('gbm' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        bst = lgb.LGBMClassifier(**model)
    elif 'linear' in mname:
        bst = icu_preprocessing(FastLinearClassifier)(
            **model) if dname == 'icu' else FastLinearClassifier(**model)
    elif 'nn' in mname:
        bst = icu_preprocessing(get_fast_keras)(
            **model) if dname == 'icu' else get_fast_keras(**model)

    # Get our explainer (using SAGE entirely now, shap is old & may not work perfectly)
    if ('sage' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        exp = labelless_sage_wrapper(imputetype='marginal',
                                     refsize=64,
                                     batch_size=32,
                                     wrap_categorical=(dname == 'icu'))
    elif mname == 'gbmshap':
        exp = OneDimExplainer
    elif mname == 'linearshap':
        exp = get_pipeline_explainer(LinearExplainer)

    # Prepare to shuffle costs if required
    if shuffle_params is not None:
        if ((shuffle_params[0] < 0) and (shuffle_params[1] < 0)):
            costs, shuffle_costs = cost_pair(-shuffle_params[0],
                                             -shuffle_params[1], Xtrain)
        else:
            shuffle_costs = cost_swaps(costs, shuffle_params[0],
                                       shuffle_params[1])
    # Pick thresholds for CoAI
    dthresh = np.linspace(0, np.sum(unique_costs) + 1, 100)

    #####################
    # Actually train/test
    #####################
    if 'sage' in mname or 'shap' in mname:
        GRP = knapsack.GroupOptimizer(bst,
                                      exp,
                                      scale_ints=1000 * 100 if
                                      ('sage' in mname) else 1000)
        if 'nn' in mname:
            if dname == 'icu':
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        model__epochs=10,
                        model__verbose=False)
            else:
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        epochs=10,
                        verbose=False)
        else:
            GRP.fit(Xtv, ytv, costs, groups, dthresh)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        if shuffle_params: GRP.recalculate_costs(shuffle_costs)
    elif 'fixed' in mname:
        bst = bst.fit(Xtv, ytv)
        GRP = knapsack.FixedModelExactRetainer(bst, exp)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params: GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    elif 'impute' in mname:
        imputer = impute.IterativeImputer(random_state=0,
                                          estimator=linear_model.RidgeCV())
        bst = bst.fit(Xtv, ytv)
        imputer.fit(Xtv)
        GRP = knapsack.FixedModelImputer(bst, exp, imputer)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params: GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    elif mname == 'cegb':
        GRP = cegb.CEGBOptimizer(model=bst, lambdas=np.logspace(-5, 5, 101))
        GRP.fit(Xtv, ytv, costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        if dname == 'outpatient': GRP.recalculate_costs(costs, groups)
        if shuffle_params: GRP.recalculate_costs(shuffle_costs)
    elif mname == 'cwcf':
        ytrain, yvalid, ytest = [np.array(x) for x in (ytrain, yvalid, ytest)]
        print('Training CWCF...')
        lmbds = np.hstack([np.logspace(-14, 1, 16) for _ in range(2)])
        GRP = cwcf.get_cwcf(Xtrain,
                            Xvalid,
                            Xtest,
                            ytrain,
                            yvalid,
                            ytest,
                            costs,
                            lmbds,
                            gpus=list(range(8)),
                            njobs=32,
                            dirname=config.CWCF_TMPDIR,
                            metric=roc_auc_score,
                            difficulty=1000)
        print('Done')
    elif mname in ('aps', 'apacheiii', 'apacheiva'):
        strain, svalid, stest = aps_baselines()
        mpreds = stest
        mpreds = bootstrap_set(mpreds, rseed=rseed)
        preds = mpreds[mname]
        score = roc_auc_score(ytest, preds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(preds)
    elif mname == 'qsofa':
        qtest = qsofa_score()
        qpreds = bootstrap_set(qtest, rseed=rseed)
        score = roc_auc_score(ytest, qpreds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(qpreds)
    elif mname == 'pact':
        cost, score, _, _, _, _, preds = pact_score(Xtrain, Xvalid, Xtest,
                                                    ytrain, yvalid, ytest,
                                                    costs)
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array(cost), np.array(score)
        GRP.test_preds = np.array(preds)
    else:
        raise ValueError("Model name not found!")

    # Done
    return GRP
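Both train variants impute and scale by hand for non-GBM models on non-ICU data, while the comments note that the ICU path wraps the same steps in a Pipeline. A minimal sketch of that idea, with a placeholder final estimator (the real code builds its models via icu_preprocessing):

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression  # placeholder model

# Folding impute + scale into one estimator keeps train/valid/test
# preprocessing consistent through a single fit/predict interface.
pipe = Pipeline([('impute', SimpleImputer()),
                 ('scale', StandardScaler()),
                 ('model', LogisticRegression(max_iter=1000))])
# pipe.fit(Xtrain, ytrain); pipe.predict_proba(Xtest)[:, 1]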
Example #8
        for item in RESULTS:
            management.print_tuples(item)
    else:
        RESULTS = management.list_values(DB_QUEUE, sys.argv[2], sys.argv[3],
                                         sys.argv[4])
        for item in RESULTS:
            management.print_tuples(item)
elif COMMAND == "tune":
    if len(sys.argv) != 5:
        print("Usage: \"" + sys.argv[0] +
              " tune <table> <benchmark> <scenario>\"")
        print(
            "Returns the optimums values in the table, for a given scenario.")
    else:
        management.print_tuples(
            tuning.tune(DB_QUEUE, sys.argv[2], sys.argv[3], sys.argv[4]))
elif COMMAND == "quartiles":
    if len(sys.argv) != 5:
        print("Usage: \"" + sys.argv[0] +
              " quartiles <table> <benchmark> <scenario>\"")
        print(
            "Returns the quartiles for the data in the table, for a given scenario."
        )
    else:
        management.print_tuples(
            statistical_analysis.get_quartiles(DB_QUEUE, sys.argv[2],
                                               sys.argv[3], sys.argv[4]))
elif COMMAND == "histogram":
    if len(sys.argv) != 5:
        print("Usage: \"" + sys.argv[0] +
              " histogram <table> <benchmark> <scenario>\"")