def multiple_imputation(train_df, test_df, add_indicator=False, estimator=None,
                        imputation_order='ascending', initial_strategy='mean',
                        max_iter=10, missing_values=np.nan,
                        n_nearest_features=None, random_state=None,
                        sample_posterior=False):
    '''
    Fill missing values with a multivariate (iterative) imputer that is fitted
    on the training data only and then applied to both train and test data.

    Inputs:
        train_df: DataFrame used to fit the imputer (and imputed itself)
        test_df: DataFrame imputed with the model fitted on train_df; it must
                 have the same columns as train_df
        add_indicator, estimator, imputation_order, initial_strategy, max_iter,
        missing_values, n_nearest_features, random_state, sample_posterior:
                 forwarded unchanged to impute.IterativeImputer
    Returns: (train_df, test_df) as new DataFrames with missing values filled.
             The original column names and row indexes are kept; when
             add_indicator is True the extra indicator columns are named
             "<column>_missing".
    '''
    # Forward the caller's settings; the previous version silently ignored
    # them and always used the library defaults.
    imp_model = impute.IterativeImputer(
        add_indicator=add_indicator, estimator=estimator,
        imputation_order=imputation_order,
        initial_strategy=initial_strategy, max_iter=max_iter,
        missing_values=missing_values,
        n_nearest_features=n_nearest_features,
        random_state=random_state, sample_posterior=sample_posterior)

    columns = list(train_df.columns)
    train_values = imp_model.fit_transform(train_df)
    test_values = imp_model.transform(test_df)

    out_columns = columns
    if add_indicator and imp_model.indicator_ is not None:
        # The imputer appends one indicator column per feature that had
        # missing values during fit; label them so the frame can be built.
        out_columns = columns + [f'{columns[i]}_missing'
                                 for i in imp_model.indicator_.features_]

    new_train_df = pd.DataFrame(train_values, columns=out_columns,
                                index=train_df.index)
    new_test_df = pd.DataFrame(test_values, columns=out_columns,
                               index=test_df.index)

    return new_train_df, new_test_df
Exemple #2
0
def data_pipeline(train: pd.DataFrame,
                  test: pd.DataFrame,
                  random_state=42,
                  **kwargs) -> tuple:
    """Impute and standardise train/test data with a train-fitted pipeline.

    The pipeline (iterative imputation followed by standard scaling) is fitted
    on ``train`` only and then applied to ``test``, so no statistics are
    learned from the test data.

    Args:
        train (pd.DataFrame): Data used to fit the pipeline.
        test (pd.DataFrame): Data that is only transformed, never fitted on.
        random_state (int, optional): Passed to the iterative imputer.
            Defaults to 42.
        **kwargs: Extra keyword arguments passed to the iterative imputer.

    Returns:
        tuple: ``(train, test)`` as new DataFrames built from the transformed
        arrays (default integer column labels and row index).
    """
    imputation_step = impute.IterativeImputer(random_state=random_state,
                                              **kwargs)
    pipeline = make_pipeline(imputation_step, StandardScaler())

    transformed_train = pd.DataFrame(pipeline.fit_transform(train))
    transformed_test = pd.DataFrame(pipeline.transform(test))
    return transformed_train, transformed_test
Exemple #3
0
def multiple_imputation(train_df, test_df, continuous_columns, estimator=None,
                        max_iter=10, n_nearest_features=None):
    '''
    Fill missing values in the continuous columns with a multivariate
    (iterative) imputer fitted on the training data only.

    Inputs:
        train_df: DataFrame used to fit the imputer (and imputed itself)
        test_df: DataFrame imputed with the model fitted on train_df
        continuous_columns: iterable of column names to impute; they must be
                            present in both DataFrames
        estimator: estimator passed to the iterative imputer
        max_iter: maximum number of imputation rounds
        n_nearest_features: number of other features used to estimate the
                            missing values of each column
    Returns: (train_df, test_df) as new DataFrames. The imputed columns are
             placed after the untouched columns; row indexes are preserved.
    '''
    imp_model = impute.IterativeImputer(estimator=estimator,
                                        max_iter=max_iter,
                                        missing_values=np.nan,
                                        n_nearest_features=n_nearest_features)

    columns = list(continuous_columns)
    new_train = imp_model.fit_transform(train_df[columns])
    new_test = imp_model.transform(test_df[columns])

    # Rebuild the imputed frames on the ORIGINAL indexes: join() aligns on the
    # index, so a default RangeIndex would mis-align (or yield NaN) whenever
    # the inputs are not indexed 0..n-1.
    imputed_train = pd.DataFrame(new_train, columns=columns,
                                 index=train_df.index)
    imputed_test = pd.DataFrame(new_test, columns=columns,
                                index=test_df.index)

    # drop() needs a flat list of labels; the old `[columns]` was nested.
    train_df = train_df.drop(columns=columns).join(imputed_train)
    test_df = test_df.drop(columns=columns).join(imputed_test)

    return train_df, test_df
Exemple #4
0
def fill_missing_values(train_data, test_data):
    """Impute the Titanic-style columns of the train and test frames.

    Every imputer is fitted on the training data only and then applied to the
    test data, so no statistics are learned from the test set.

    Steps:
        * 'embarked' is filled with the most frequent training value,
        * 'cabin' is dropped,
        * 'fare'/'TicketClass' and then 'age'/'parch'/'sibsp'/'fare'/
          'TicketClass' are filled with iterative imputers.

    Args:
        train_data: DataFrame containing the columns named above.
        test_data: DataFrame with the same columns.

    Returns:
        (train_data_filled, test_data_filled): imputed copies; the inputs are
        not modified.
    """
    fare_cols = ['fare', 'TicketClass']
    age_cols = ['age', 'parch', 'sibsp', 'fare', 'TicketClass']

    train_data_filled = train_data.copy()
    test_data_filled = test_data.copy()

    # Embarked: most frequent value learned from the training set.
    embarked_imputer = impute.SimpleImputer(missing_values=np.nan,
                                            strategy='most_frequent')
    train_data_filled[['embarked']] = embarked_imputer.fit_transform(
        train_data[['embarked']])
    test_data_filled[['embarked']] = embarked_imputer.transform(
        test_data[['embarked']])

    # Cabin is removed from both frames.
    train_data_filled = train_data_filled.drop(['cabin'], axis=1)
    test_data_filled = test_data_filled.drop(['cabin'], axis=1)

    # Fare: iterative imputation from fare and ticket class.
    fare_imputer = impute.IterativeImputer(missing_values=np.nan,
                                           random_state=42)
    train_data_filled[fare_cols] = fare_imputer.fit_transform(
        train_data[fare_cols])
    test_data_filled[fare_cols] = fare_imputer.transform(test_data[fare_cols])

    # Age: iterative imputation from the related numeric columns.
    # NOTE: this step reads the raw input frames and writes 'fare' and
    # 'TicketClass' again, so it overwrites the result of the fare step above.
    age_imputer = impute.IterativeImputer(missing_values=np.nan,
                                          random_state=42)
    train_data_filled[age_cols] = age_imputer.fit_transform(
        train_data[age_cols])
    test_data_filled[age_cols] = age_imputer.transform(test_data[age_cols])

    return train_data_filled, test_data_filled
 def _model_impute(self):
     for col in self.target:
         m_impute = impute.IterativeImputer(estimator=self.estimator,
                                            random_state=42)
         m_impute.fit(self.df[col].values)
         self.output_df.loc[:, col] = m_impute.fit_transform(
             self.df[col].values)
     return self.output_df
Exemple #6
0
def fixData(trainFileName,
            testFileName,
            features,
            imputer="simple",
            strategy="mean"):
    """Load, one-hot encode and impute the train and test CSV files.

    Args:
        trainFileName: Path of the training CSV; must contain a "Survived"
            column.
        testFileName: Path of the test CSV; must contain a "PassengerId"
            column.
        features: Iterable of column names to keep. "Embarked" and "Sex" are
            one-hot encoded, so they are expected to be among them.
        imputer: Imputation method, case-insensitive: "simple", "knn" or
            "iterative".
        strategy: Strategy of the simple imputer, or initial strategy of the
            iterative imputer.

    Returns:
        (dummied_test, dummied_train, trainSurvived, passengerID): the imputed
        test and train arrays (test first), the "Survived" column of the
        training file and the "PassengerId" column of the test file.

    Raises:
        ValueError: If ``imputer`` is not one of the supported methods.
    """
    print("Fixing Data\n")

    # Validate the method before doing any file I/O; an unknown name used to
    # fall through and crash later with a NameError on `imp`.
    method = imputer.lower()
    if method == "simple":
        imp = impute.SimpleImputer(missing_values=np.nan, strategy=strategy)
    elif method == "knn":
        imp = impute.KNNImputer(missing_values=np.nan)
    elif method == "iterative":
        imp = impute.IterativeImputer(missing_values=np.nan,
                                      initial_strategy=strategy)
    else:
        raise ValueError(
            "You did not enter a correct imputation method. "
            "Correct imputation methods include: "
            "\"Simple\", \"KNN\", \"Iterative\"")

    # Read files into pandas frames
    training_data = pd.read_csv(trainFileName)
    testing_data = pd.read_csv(testFileName)

    featuresForDummies = ["Embarked", "Sex"]

    trainSurvived = training_data["Survived"]
    passengerID = testing_data["PassengerId"]

    selected = list(features)
    tr_data = pd.get_dummies(training_data[selected],
                             columns=featuresForDummies)
    te_data = pd.get_dummies(testing_data[selected],
                             columns=featuresForDummies)
    # Give the test frame exactly the training columns: a category missing
    # from one file would otherwise produce differently shaped matrices.
    te_data = te_data.reindex(columns=tr_data.columns, fill_value=0)

    # Learn the imputation from the training data only, then apply it to the
    # test data, so nothing is fitted on the test set.
    dummied_train = imp.fit_transform(tr_data)
    dummied_test = imp.transform(te_data)

    # Test array first, as callers expect.
    return (dummied_test, dummied_train, trainSurvived, passengerID)
Exemple #7
0
    def fit(self, X, y=None):
        """Fit the ordinal encoder and the configured imputer on ``X``.

        Sets ``self.ordinal_encoder`` and, when ``self.imputer_type`` is
        "knn" or "iterative", ``self.imputer``. ``y`` is ignored.

        Returns:
            self
        """
        X = X.copy()
        column_names = X.columns.values

        # We already handled this in the preprocessing notebook: null values
        # must not increment the encoding values in the middle of the dataset;
        # we want a contiguous range of integer codes, without holes, for the
        # imputer to work on. This matters for both imputers: the iterative
        # one works with numeric values that may well be fractional, so its
        # result is rounded in the end, and the KNN one works with integers
        # but imputes the mean of the n neighbours, which may again be
        # fractional. So that rounding never lands on the code that belongs to
        # the null value, the ordinal encoder is fed the null values first,
        # via an all-NaN row that is removed again after encoding.
        nan_row = pd.DataFrame(index=pd.Index([-1]),
                               columns=column_names,
                               data=[[np.nan] * len(column_names)])
        with_nan_row = pd.concat([nan_row, X])

        self.ordinal_encoder = ce.ordinal.OrdinalEncoder(
            handle_missing="return_nan", handle_unknown="return_nan")
        encoded = self.ordinal_encoder.fit_transform(with_nan_row)

        # Drop the helper row again.
        encoded = encoded[1:]

        if self.imputer_type == "knn":
            self.imputer = impute.KNNImputer()
            self.imputer.fit(encoded)
            return self

        if self.imputer_type == "iterative":
            self.imputer = impute.IterativeImputer(
                max_iter=20,
                random_state=42,
                initial_strategy="most_frequent",
                min_value=encoded.min(),
                max_value=encoded.max())

            try:
                self.imputer.fit(encoded)
            except (ValueError, np.linalg.LinAlgError):
                # Message (Slovak): "An error was trapped where the function
                # objected to NaNs. The error is odd, though, because it only
                # objects the first time, and then it works..." - so the fit
                # is simply retried once.
                print(
                    "Jeden error bol trapnuty, kedy funkcii vadili NaNs. Tento error je ale divny, lebo mu to vadi",
                    "len prvy krat, a potom to uz ide...")
                self.imputer.fit(encoded)

        return self
Exemple #8
0
def impute_fit_transform(train: pd.DataFrame,
                         test: pd.DataFrame,
                         random_state=42,
                         **kwargs) -> tuple:
    """Fit an iterative imputer on the train data and apply it to both sets.

    Args:
        train (pd.DataFrame): Data the imputer is fitted on; also transformed.
        test (pd.DataFrame): Data that is only transformed.
        random_state (int, optional): Passed to the imputer. Defaults to 42.
        **kwargs: Extra keyword arguments passed to the imputer.

    Returns:
        tuple: ``(train, test)`` as new DataFrames built from the imputed
        arrays (default integer column labels and row index).
    """
    fitted = impute.IterativeImputer(random_state=random_state,
                                     **kwargs).fit(train)
    imputed_train = pd.DataFrame(fitted.transform(train))
    imputed_test = pd.DataFrame(fitted.transform(test))
    return imputed_train, imputed_test
Exemple #9
0
def train(dname, mname, rseed, shuffle_params=None):
    """Train and evaluate one model/method on one dataset split.

    Args:
        dname: Dataset key used to index ``LOADERS``; the code special-cases
            'icu', 'outpatient' and 'trauma'.
        mname: Method name. Substrings ('gbm', 'linear', 'nn', 'node', 'sage',
            'shap', 'greedy', 'fixed', 'impute', 'cegb', 'cwcf', 'lagrange',
            'group') and exact names ('pact', 'qsofa', 'aps', 'apacheiii',
            'apacheiva', 'fixedmodel', 'imputemodel', ...) select the branches
            below.
        rseed: Seed passed to the data loaders as ``split_seed``; also part of
            the tuned-parameter file names.
        shuffle_params: Optional pair for the cost-robustness experiments.
            Two negative values select ``cost_pair``; anything else selects
            ``cost_swaps``.

    Returns:
        The fitted/scored object ``GRP``. For the baseline scores this is a
        plain function object carrying ``model_costs``, ``model_scores`` and
        ``test_preds`` attributes.

    Raises:
        ValueError: If ``mname`` matches none of the training branches.
    """
    # NOTE: ICU preprocessing is handled by icu_preprocessing, and CWCF is
    # launched through cwcf.get_cwcf, so no per-model device setup is done
    # here.

    ##################
    # DATA
    ##################
    # Load data we're using
    ((Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest),
     costs, groups, extras) = LOADERS[dname](split_seed=rseed)

    # If we're using PACT we need some of the extra (redundant) features that
    # were unused in our study
    if mname == 'pact':
        ((Xtrain, ytrain), (Xvalid, yvalid), (Xtest, ytest),
         costs, groups, extras) = load_ed(name=config.ED_NAME,
                                          costtype=config.ED_COSTTYPE,
                                          drop_redundant=False,
                                          split_seed=rseed)

    # No bootstrapping here anymore: train/test splits are used instead.

    # If we're using a non-GBM AI method, we need to impute NaNs and scale.
    # Don't do this if using ICU data because we're using a Pipeline in that
    # case that handles this stuff.
    if ('linear' in mname or 'nn' in mname or 'cwcf' in mname
            or 'node' in mname) and (dname != 'icu'):
        imputer = impute.SimpleImputer()
        scaler = preprocessing.StandardScaler()
        Xtrain_np = scaler.fit_transform(imputer.fit_transform(Xtrain))
        Xvalid_np = scaler.transform(imputer.transform(Xvalid))
        Xtest_np = scaler.transform(imputer.transform(Xtest))

        # Write the processed values back into the existing frames
        for df, npy in zip([Xtrain, Xvalid, Xtest],
                           [Xtrain_np, Xvalid_np, Xtest_np]):
            df.iloc[:] = npy

    # Concatenated data for training cost-aware models after tuning
    Xtv = pd.concat([Xtrain, Xvalid])
    ytv = np.hstack((ytrain, yvalid))

    # Grouped costs for datasets that feature it:
    # outpatient dataset, or linear/NN on ICU (one-hot encoding of
    # admission dx)
    if (dname == 'outpatient') or (
            dname == 'icu' and mname in ('linear', 'linearh', 'nn')):
        unique_costs = np.array(
            [costs[groups == g].mean() for g in np.unique(groups)])
    else:
        unique_costs = costs

    ##################
    # PARAMETER TUNING
    ##################
    # If we've precomputed best parameters, just load those
    if TUNING == 'LOAD' and (('gbm' in mname) or
                             (mname in ('fixedmodel', 'imputemodel')) or
                             ('linear' in mname) or ('nn' in mname) or
                             ('cegb' in mname)):
        loadname = 'gbmsage' if ((mname in ('fixedmodel', 'imputemodel')) or
                                 ('cegb' in mname) or
                                 ('gbmsage' in mname)) else mname
        # NOTE(security): pickle.load executes arbitrary code; only point
        # OUTPATH at files written by this pipeline.
        with open(f'{OUTPATH}/{loadname}-{dname}-{rseed}.pkl', 'rb') as w:
            model = pickle.load(w)
    # Otherwise do some parameter tuning
    else:
        # Tune GBM
        if ('gbm' in mname) or (mname
                                in ('cegb', 'fixedmodel', 'imputemodel')):
            model = tune(Xtrain, Xvalid, ytrain, yvalid)
        # Linear model needs onehotencoding pipeline if we're doing ICU
        elif 'linear' in mname:
            if dname == 'icu':
                model = lintune(Xtrain,
                                Xvalid,
                                ytrain,
                                yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                model = lintune(Xtrain, Xvalid, ytrain, yvalid)
        # NN model needs onehotencoding pipeline if we're doing ICU
        elif 'nn' in mname:
            if dname == 'icu':
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               mfunc=icu_preprocessing(get_tf_model),
                               return_extras=False)
            else:
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               return_extras=False)
        # NODE model doesn't need tuning
        elif 'node' in mname:
            model = {}

    # If we indicated we want to save the model, do so. The process exits
    # right after the parameters are written.
    if TUNING == 'SAVE' and (('gbm' in mname) or
                             (mname in ('cegb', 'fixedmodel', 'imputemodel'))
                             or ('linear' in mname) or ('nn' in mname)):
        with open(f'{OUTPATH}/{mname}-{dname}-{rseed}.pkl', 'wb') as w:
            pickle.dump(model, w)
            exit()

    # Limit number of jobs for processor-hungry models
    print(mname)
    if mname not in ('qsofa', 'aps', 'apacheiii', 'apacheiva'):
        if (('gbm' in mname) or ('cegb' in mname) or ('linear' in mname)
                or ('imputemodel' in mname)):
            model['n_jobs'] = 4 if dname == 'trauma' else 2

    ##################
    # Setup for CoAI
    ##################
    # Instantiate predictive models
    if ('gbm' in mname) or ('cegb' in mname) or (mname in ('fixedmodel',
                                                           'imputemodel')):
        bst = lgb.LGBMClassifier(**model)
    elif 'linear' in mname:
        if dname == 'icu':
            bst = icu_preprocessing(FastLinearClassifier)(**model)
        else:
            bst = FastLinearClassifier(**model)
    elif 'nn' in mname:
        if dname == 'icu':
            bst = icu_preprocessing(get_fast_keras)(**model)
        else:
            bst = get_fast_keras(**model)
    elif 'node' in mname:
        if dname == 'icu':
            bst = icu_preprocessing(NodeClassifier)(
                experiment_name=f'trauma{rseed}', **model)
        else:
            bst = NodeClassifier(experiment_name=f'trauma{rseed}', **model)

    # Get our explainer (using SAGE entirely now, shap code is old & may not
    # work perfectly)
    if ('sage' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        # SAGE explainer. N_permutations set super low for NODE bc we're
        # just testing it right now
        exp = labelless_sage_wrapper(
            imputetype='marginal',
            refsize=64,
            batch_size=32,
            wrap_categorical=(dname == 'icu'),
            n_permutations=(128 if 'node' in mname else None))
    # Mostly deprecated
    elif mname == 'gbmshap':
        exp = OneDimExplainer
    elif mname == 'linearshap':
        exp = get_pipeline_explainer(LinearExplainer)

    # Prepare to perturb costs if required (robustness experiments)
    if shuffle_params is not None:
        # Negative numbers indicate individual robustness
        if (shuffle_params[0] < 0) and (shuffle_params[1] < 0):
            costs, shuffle_costs = cost_pair(-shuffle_params[0],
                                             -shuffle_params[1], Xtrain)
        # Positive indicate swap robustness - # swaps and seed
        else:
            shuffle_costs = cost_swaps(costs, shuffle_params[0],
                                       shuffle_params[1])

    # Pick thresholds for CoAI
    dthresh = np.linspace(0, np.sum(unique_costs) + 1, 100)

    #####################
    # Actually train/test
    #####################
    if 'sage' in mname or 'shap' in mname:
        # Wrap model with CoAI
        if 'greedy' in mname:
            GRP = knapsack.GroupGreedy(bst, exp)
        else:
            GRP = knapsack.GroupOptimizer(bst,
                                          exp,
                                          scale_ints=1000 * 100 if
                                          ('sage' in mname) else 1000)
        # NN needs preprocessing pipeline if ICU, also pass # epochs,
        # verbosity
        if 'nn' in mname:
            if dname == 'icu':
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        model__epochs=10,
                        model__verbose=False)
            else:
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        epochs=10,
                        verbose=False)
        # NODE needs preprocessing for ICU.
        # Also requires eval set for stopping time
        # Current max_iter is short for prototyping
        elif 'node' in mname:
            dthresh = np.linspace(0, np.sum(unique_costs) + 1, 10)
            if dname == 'icu':
                GRP.fit(Xtrain,
                        ytrain,
                        costs,
                        groups,
                        dthresh,
                        model__eval_set=(Xvalid, yvalid),
                        model__max_iter=15)
            else:
                GRP.fit(Xtrain,
                        ytrain,
                        costs,
                        groups,
                        dthresh,
                        eval_set=(Xvalid, yvalid),
                        max_iter=15)
        # All other CoAI methods get a standardized fit process
        else:
            GRP.fit(Xtv, ytv, costs, groups, dthresh)
        # Evaluate CoAI models
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # If costs get shuffled, each model's deployment cost will change
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)

    # Impute-CoAI with mean imputation
    elif 'fixed' in mname:
        bst = bst.fit(Xtv, ytv)
        GRP = knapsack.FixedModelExactRetainer(bst, exp)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    # Impute-CoAI with model-based imputation (IterativeImputer)
    elif 'impute' in mname:
        imputer = impute.IterativeImputer(random_state=0,
                                          estimator=linear_model.RidgeCV())
        bst = bst.fit(Xtv, ytv)
        imputer.fit(Xtv)
        GRP = knapsack.FixedModelImputer(bst, exp, imputer)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params:
            GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    # CEGB doesn't use an explainer
    elif 'cegb' in mname:
        GRP = cegb.CEGBOptimizer(model=bst, lambdas=np.logspace(-5, 5, 101))
        GRP.fit(Xtv, ytv, costs, groups=(groups if 'group' in mname else None))
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        # Account for grouped costs if in outpatient data
        if dname == 'outpatient':
            GRP.recalculate_costs(costs, groups)
        # Account for any cost perturbations
        if shuffle_params:
            GRP.recalculate_costs(shuffle_costs)
    elif 'cwcf' in mname:
        # Lots of preprocessing if using ICU data to encode categoricals
        # as ordinal ints (save memory, handle groups, etc)
        if dname == 'icu':
            types = Xtrain.dtypes
            for col in Xtrain.columns:
                if str(types[col]) == 'category':
                    l_enc = preprocessing.OrdinalEncoder(
                        handle_unknown='use_encoded_value',
                        unknown_value=np.nan)
                    for df in (Xtrain, Xvalid, Xtest):
                        # Assign the results back instead of relying on
                        # inplace=True: Categorical.add_categories no longer
                        # accepts it, and an in-place fillna on df[col] may
                        # act on a temporary copy.
                        if 'UNK' not in df[col].cat.categories:
                            df[col] = df[col].cat.add_categories(['UNK'])
                        df[col] = df[col].fillna('UNK')
                    Xtrain[col] = l_enc.fit_transform(
                        np.array(Xtrain[col]).reshape(-1, 1))
                    Xvalid[col] = l_enc.transform(
                        np.array(Xvalid[col]).reshape(-1, 1))
                    Xtest[col] = l_enc.transform(
                        np.array(Xtest[col]).reshape(-1, 1))
                elif str(types[col]) == 'int64':
                    # mode() returns a Series; fillna needs the scalar, or it
                    # aligns on the index and only fills the row labelled 0.
                    train_mode = Xtrain[col].mode().iloc[0]
                    for df in (Xtrain, Xvalid, Xtest):
                        df[col] = df[col].fillna(train_mode)
                else:
                    train_mean = Xtrain[col].mean()
                    for df in (Xtrain, Xvalid, Xtest):
                        df[col] = df[col].fillna(train_mean)

        # CWCF only takes nparrays for labels
        ytrain, yvalid, ytest = [np.array(x) for x in (ytrain, yvalid, ytest)]
        # So we know when jobs get farmed out to other processes
        print('Training CWCF...')
        # Used to turn "groups" down to 6 for outpatient just to prototype
        # group support
        if 'lagrange' in mname:
            data_lmbds = {
                'trauma': np.linspace(0, np.sum(unique_costs), 17)[1:],
                'icu': np.linspace(0, np.sum(unique_costs), 17)[1:],
                'outpatient': np.linspace(0, np.sum(unique_costs), 17)[1:]
            }
        else:
            data_lmbds = {
                'trauma': np.logspace(-14, 1, 16),
                'icu': np.logspace(-14, 1, 16),
                'outpatient': np.logspace(-14, 1, 16)
            }
        # This is usually range(2) to get some stability over reps -- doesn't
        # matter as much for outpatient. Can turn down to 1 when prototyping.
        lmbds = np.hstack([data_lmbds[dname] for _ in range(2)])

        # Run CWCF - groups argument does experimental groups handling (not
        # working yet). More jobs (even more than GPUs) can be used - gets
        # you through the lambda list faster.
        # Set up right now for L3 gpus 1-6.
        GRP = cwcf.get_cwcf(Xtrain,
                            Xvalid,
                            Xtest,
                            ytrain,
                            yvalid,
                            ytest,
                            costs,
                            lmbds,
                            gpus=np.random.permutation(8),
                            njobs=16,
                            dirname=config.CWCF_TMPDIR,
                            lagrange=('lagrange' in mname),
                            metric=roc_auc_score,
                            difficulty=1000,
                            groups=(groups if 'group' in mname else None))
        print('Done')  # Done with external process run
    # ICU baselines
    elif mname in ('aps', 'apacheiii', 'apacheiva'):
        strain, svalid, stest = aps_baselines(split_seed=rseed)
        mpreds = stest
        preds = mpreds[mname]
        score = roc_auc_score(ytest, preds)
        cost = config.EICU_SCORE_COSTS[mname]
        # A function object is used as a simple attribute container
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(preds)
    # Exact match: `mname in ('qsofa')` was a substring test on a string
    elif mname == 'qsofa':
        qtest = qsofa_score(split_seed=rseed)
        qpreds = qtest
        score = roc_auc_score(ytest, qpreds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(qpreds)
    # Trauma baseline (PACT)
    # Should ignore the resulting cost for now and just use
    # the hand-calculated one
    elif mname == 'pact':
        cost, score, _, _, _, _, preds = pact_score(Xtrain, Xvalid, Xtest,
                                                    ytrain, yvalid, ytest,
                                                    costs)
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array(cost), np.array(score)
        GRP.test_preds = np.array(preds)
    else:
        raise ValueError("Model name not found!")

    # Done
    return GRP
Exemple #10
0
 def iterative_imputer(self, col, how="mean"):
     """Impute ``col`` with an iterative imputer started from ``how``.

     Nothing happens when ``how`` is not one of ``self.strategies``;
     otherwise a new iterative imputer using ``how`` as its initial strategy
     is handed to ``self.impute`` together with ``col``.
     """
     if how not in self.strategies:
         return
     model = impute.IterativeImputer(initial_strategy=how)
     self.impute(col, model)
Exemple #11
0
def _make_imputer(kind):
    """Return a fresh, unfitted imputer for the ``--imp`` option value.

    Args:
        kind: ``'simple'`` for mean imputation or ``'iter'`` for an
            ``IterativeImputer`` (50 iterations, 20 nearest features).

    Raises:
        ValueError: if ``kind`` is neither ``'simple'`` nor ``'iter'``.
    """
    if kind == 'simple':
        return impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    if kind == 'iter':
        return impute.IterativeImputer(missing_values=np.nan,
                                       max_iter=50,
                                       n_nearest_features=20)
    raise ValueError(
        "unknown imputer {!r}; expected 'simple' or 'iter'".format(kind))


def main():
    """Run the feature-selection pipeline on ``../X_train.csv`` / ``../X_test.csv``.

    Steps, all fitted on the training data and applied to both sets:
    impute missing values, drop near-constant and extreme-variance columns,
    mask outliers and re-impute, scale, drop highly correlated columns, then
    use two rounds of ElasticNet to select features. Results are printed.

    Command-line options: ``--uc`` (upper correlation), ``--df`` (outlier
    factor), ``--imp`` (``simple`` or ``iter``), ``--of`` (output name),
    ``--norm`` (``robust`` or ``standard``), ``-h``/``--help``.

    NOTE(review): ``outfile`` is parsed and printed but nothing in this
    function writes to it.

    Raises:
        ValueError: if ``--imp`` is neither ``simple`` nor ``iter``.
    """
    # read in data
    xtrain = pd.read_csv('../X_train.csv',
                         index_col='id',
                         dtype={'id': np.int32})
    ytrain = pd.read_csv('../y_train.csv',
                         index_col='id',
                         dtype={'id': np.int32})
    xtest = pd.read_csv('../X_test.csv',
                        index_col='id',
                        dtype={'id': np.int32})

    # defaults, overridable from the command line
    lowerVar = 1e-8
    upperVar = 1e100
    upperCorr = 0.7
    stdFactor = 2.5
    imputer = 'simple'
    outfile = "prediction"
    norm = 'robust'

    # read parameters
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "h", ["uc=", "df=", "imp=", "of=", "help", "norm="])
    except getopt.GetoptError:
        help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in {'-h', '--help'}:
            help()
            sys.exit()
        elif opt == '--uc':
            upperCorr = float(arg)
        elif opt == '--df':
            stdFactor = float(arg)
        elif opt == '--imp':
            imputer = arg
        elif opt == '--of':
            outfile = arg
        elif opt == '--norm':
            norm = arg

    print("Selected parameters:")
    print("  lower variance: {:e}".format(lowerVar))
    print("  upper variance: {:e}".format(upperVar))
    print("  upper covariance: {:e}".format(upperCorr))
    print("  std deviation factor: {:f}".format(stdFactor))
    print("  imputer: ", imputer)
    print("  output: ", outfile)
    print("  norm: ", norm)

    #------------------------------------------------------------------
    # impute
    imp = _make_imputer(imputer)
    imp.fit(xtrain)

    xtrain.loc[:, :] = imp.transform(xtrain)
    xtest.loc[:, :] = imp.transform(xtest)

    #--------------------------------------------------------------------
    # drop features because of variance
    thresholder = VarianceThreshold(threshold=lowerVar)
    thresholder.fit(xtrain)
    print("Will drop because of variance < {:e}: ".format(lowerVar))
    print(xtrain.columns[np.invert(thresholder.get_support())].tolist())

    xtrain2 = pd.DataFrame(
        data=thresholder.transform(xtrain),
        index=xtrain.index.tolist(),
        columns=xtrain.columns[thresholder.get_support()].tolist())
    xtest2 = pd.DataFrame(
        data=thresholder.transform(xtest),
        index=xtest.index.tolist(),
        columns=xtest.columns[thresholder.get_support()].tolist())

    # drop features with absurdly high variance
    var = xtrain2.var()

    tooHigh = var[var > upperVar].index.tolist()
    print("dropping because of variance > {:e}: ".format(upperVar))
    print(tooHigh)

    xtrain3 = xtrain2.drop(columns=tooHigh)
    xtest3 = xtest2.drop(columns=tooHigh)

    #----------------------------------------------------------------------
    # remove outliers: values outside center +/- stdFactor * scale (both
    # taken from a RobustScaler fitted on the training data) become NaN
    # and are then re-imputed
    scaler = RobustScaler()
    scaler.fit(xtrain3)

    upper = scaler.center_ + stdFactor * scaler.scale_
    lower = scaler.center_ - stdFactor * scaler.scale_

    xtrain3[xtrain3 > upper] = np.nan
    xtrain3[xtrain3 < lower] = np.nan

    xtest3[xtest3 > upper] = np.nan
    xtest3[xtest3 < lower] = np.nan

    imp = _make_imputer(imputer)
    imp.fit(xtrain3)

    xtrain3.loc[:, :] = imp.transform(xtrain3)
    xtest3.loc[:, :] = imp.transform(xtest3)

    #----------------------------------------------------------------------
    # normalize data (anything other than 'standard' falls back to robust)
    if norm == 'standard':
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()

    scaler.fit(xtrain3)

    xtrain3.loc[:, :] = scaler.transform(xtrain3)
    xtest3.loc[:, :] = scaler.transform(xtest3)

    # drop highly correlated features; only the strict upper triangle is
    # inspected so that one column of each correlated pair is kept.
    # ``bool`` is used because the ``np.bool`` alias was removed from NumPy.
    corr = xtrain3.corr().abs()
    corr_triu = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [
        column for column in corr_triu.columns
        if any(corr_triu[column] > upperCorr)
    ]

    print("Will drop due to covariance > {:e}: ".format(upperCorr))
    print(to_drop)
    xtrain4 = xtrain3.drop(columns=to_drop)
    xtest4 = xtest3.drop(columns=to_drop)

    # max_iter is passed as an int: scikit-learn expects an integer here
    print("Using ElasticNet to determine features:")
    netCV = linear_model.ElasticNetCV(l1_ratio=[0.1, 0.3, 0.5, 0.75, 0.85],
                                      alphas=(0., 0.1, 0.2, 0.25, 0.3, 0.35,
                                              0.4),
                                      cv=5,
                                      n_jobs=2,
                                      max_iter=5000)
    netCV.fit(xtrain4, ytrain.values.ravel())

    print("Selected nr of featrues: ", np.count_nonzero(netCV.coef_))
    print("Selected alpha: ", netCV.alpha_)
    print("Selected l1_ratio: ", netCV.l1_ratio_)
    net = linear_model.ElasticNet(l1_ratio=netCV.l1_ratio_,
                                  alpha=netCV.alpha_,
                                  max_iter=5000)

    score = cross_val_score(net,
                            xtrain4,
                            ytrain.values.ravel(),
                            cv=5,
                            scoring='r2')
    print("Score of ElasticNet: ", score)

    net.fit(xtrain4, ytrain.values.ravel())
    print("Selected nr of features after cv: ", np.count_nonzero(net.coef_))

    # keep only the columns the fitted net gave a non-zero coefficient
    selected = np.abs(net.coef_) > 0
    xtrain5 = xtrain4.loc[:, selected]
    xtest5 = xtest4.loc[:, selected]

    print("Retained features: ", len(xtest5.columns.tolist()))
    print(xtest5.columns.tolist())

    print("Testing new elasticNet:")
    netCV2 = linear_model.ElasticNetCV(
        l1_ratio=[0., 0.1, 0.3, 0.5, 0.75, 0.85, 0.9, 1.],
        alphas=(0., 0.1, 0.2, 0.25, 0.3, 0.35, 0.4),
        cv=5,
        n_jobs=2,
        max_iter=5000)
    netCV2.fit(xtrain5, ytrain.values.ravel())

    print("Selected nr of features: ", np.count_nonzero(netCV2.coef_))
    print("Selected alpha: ", netCV2.alpha_)
    print("Selected l1_ration: ", netCV2.l1_ratio_)

    net2 = linear_model.ElasticNet(l1_ratio=netCV2.l1_ratio_,
                                   alpha=netCV2.alpha_,
                                   max_iter=5000)
    score = cross_val_score(net2,
                            xtrain5,
                            ytrain.values.ravel(),
                            cv=5,
                            scoring='r2')
    print("Score of new elasticNet: ", score)
    net2.fit(xtrain5, ytrain.values.ravel())
    print("Would use: ", np.count_nonzero(net2.coef_), " features")
def ex_1():
    """Explore imputation, outlier detection and classification on 'diabetes'.

    Fetches the OpenML 'diabetes' dataset, splits it 80/20, replaces 0.0
    entries in the 'mass' and 'skin' columns by the training mean, runs an
    IsolationForest on two columns, trains an SVC and a RandomForest, and
    plots histograms, box plots and the forest's feature importances.
    Everything is printed or plotted; nothing is returned.
    """
    X, y = datasets.fetch_openml('diabetes', as_frame=True, return_X_y=True)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
    # untouched copy of the training data for the IterativeImputer experiment
    X_train_2 = X_train.copy()

    plt.figure()
    X_train.boxplot()
    X_train.hist(bins=20)
    plt.figure()
    sns.boxplot(x=X_train['mass'])

    # 0.0 is treated as the missing-value marker in both columns
    imputer_mass = impute.SimpleImputer(missing_values=0.0, strategy='mean')
    imputer_skin = impute.SimpleImputer(missing_values=0.0, strategy='mean')

    X_train[['mass']] = imputer_mass.fit_transform(X_train[['mass']])
    X_train[['skin']] = imputer_skin.fit_transform(X_train[['skin']])

    # each test column is transformed by the imputer fitted on the same
    # training column ('skin' previously went through the 'mass' imputer)
    X_test[['mass']] = imputer_mass.transform(X_test[['mass']])
    X_test[['skin']] = imputer_skin.transform(X_test[['skin']])

    # Anomaly detection, i.e. finding outlying data points

    X_train_isolation = X_train.values
    X_train_isolation = X_train_isolation[:, [1, 5]]
    X_test_isolation = X_test.values
    X_test_isolation = X_test_isolation[:, [1, 5]]

    isolation_forest = ensemble.IsolationForest(contamination=0.05)
    isolation_forest.fit(X_train_isolation)
    y_predicted_outliers = isolation_forest.predict(X_test_isolation)
    print(y_predicted_outliers)

    plot_iris2d(X_test_isolation, y_predicted_outliers)

    clf = svm.SVC(random_state=42)
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    print(metrics.classification_report(y_test, y_predicted))

    X_train.hist()

    imputer_it = impute.IterativeImputer(missing_values=0.0)

    # NOTE(review): each call passes a single column, so the imputer has no
    # other features to model the missing values from -- confirm intended.
    X_train_2[['mass']] = imputer_it.fit_transform(X_train_2[['mass']])
    X_train_2[['skin']] = imputer_it.fit_transform(X_train_2[['skin']])

    X_train_2.hist(bins=20)
    plt.figure()
    X_train_2.boxplot()

    clf_rf = ensemble.RandomForestClassifier(random_state=42)
    clf_rf.fit(X_train, y_train)
    y_predicted = clf_rf.predict(X_test)
    print(metrics.classification_report(y_test, y_predicted))

    importances = clf_rf.feature_importances_
    # spread of each feature's importance across the individual trees
    std = np.std([tree.feature_importances_ for tree in clf_rf.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(X.shape[1]):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the impurity-based feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(X.shape[1]), indices)
    plt.xlim([-1, X.shape[1]])
    plt.show()
Exemple #13
0
# Attribute groups; each group gets its own way of imputing missing values.

oxygen_attr = [
    "mean_oxygen",
    "std_oxygen",
    "kurtosis_oxygen",
    "skewness_oxygen",
]
glucose_attr = [
    "mean_glucose",
    "std_glucose",
    "kurtosis_glucose",
    "skewness_glucose",
]

vztahy_attr = ["relationship", "marital-status"]
work_attr = ["workclass", "occupation", "hours-per-week-cat", "income"]
edu_attr = ["education", "education-num"]

impute_col_transf = compose.ColumnTransformer(
    transformers=[
        # oxygen and glucose statistics: iterative imputation
        (
            "oxygen_n_glucose_impute",
            KeepDataFrame(impute.IterativeImputer(max_iter=50)),
            oxygen_attr + glucose_attr,
        ),
        # relationship, work and education groups: custom KNN imputing
        (
            "vztahy_impute",
            CustomCatImputing(imputer_type="knn"),
            vztahy_attr,
        ),
        (
            "work_impute",
            CustomCatImputing(imputer_type="knn"),
            work_attr,
        ),
        (
            "edu_impute",
            CustomCatImputing(imputer_type="knn"),
            edu_attr,
        ),
        # sex: most frequent value; age: SimpleImputer with its defaults
        (
            "sex_impute",
            KeepDataFrame(impute.SimpleImputer(strategy="most_frequent")),
            ["sex"],
        ),
        (
            "age_impute",
            KeepDataFrame(impute.SimpleImputer()),
            ["age"],
        ),
    ]
)

# Attribute groups for the column transformer below, which is used when the
# whole dataset should be imputed with SimpleImputer only.
most_freq_attr = ["sex"] + edu_attr + work_attr + vztahy_attr
mean_attr = ["age"] + oxygen_attr + glucose_attr

simple_impute_col_transf = compose.ColumnTransformer(transformers=[(
Exemple #14
0
def train(dname, mname, rseed, shuffle_params=None):
    """Load a dataset, tune or load a model, and fit a cost-aware optimizer.

    Relies on the module-level ``LOADERS``, ``TUNING`` and ``OUTPATH``.

    Args:
        dname: Dataset name; key into ``LOADERS`` and part of the pickle
            file name.
        mname: Model name; substrings such as ``'gbm'``, ``'linear'``,
            ``'nn'``, ``'tab'``, ``'node'``, ``'sage'``, ``'shap'`` select
            the tuning routine, predictor, explainer and training branch.
        rseed: Seed passed to the loader as ``split_seed`` and used in the
            pickle file name.
        shuffle_params: Optional pair. When both entries are negative a new
            cost pair is built with ``cost_pair``; otherwise ``cost_swaps``
            is applied to the loaded costs.

    Returns:
        The fitted optimizer / retainer object, or for the score baselines
        a placeholder carrying ``model_costs``, ``model_scores`` and
        ``test_preds``.

    Raises:
        ValueError: if no training branch matches ``mname``.
    """
    # NOTE(review): this assert rejects 'cwcf', 'aps', 'apacheiii',
    # 'apacheiva', 'qsofa' and 'pact', so the branches for those names
    # below are only reachable when asserts are disabled -- confirm.
    assert (('gbm' in mname)
            or (mname in ('cegb', 'fixedmodel', 'imputemodel'))
            or ('linear' in mname) or ('nn' in mname) or ('tab' in mname)
            or ('node' in mname))

    ##################
    # DATA
    ##################
    # Load data we're using
    (Xtrain,
     ytrain), (Xvalid,
               yvalid), (Xtest, ytest), costs, groups, extras = LOADERS[dname](
                   split_seed=rseed)

    # If we're using PACT we need some of the extra (redundant) features that were unused in our study
    if mname == 'pact':
        (Xtrain, ytrain), (Xvalid,
                           yvalid), (Xtest,
                                     ytest), costs, groups, extras = load_ed(
                                         name=config.ED_NAME,
                                         costtype=config.ED_COSTTYPE,
                                         drop_redundant=False,
                                         split_seed=rseed)

    # If we're using a non-GBM AI method, we need to impute NaNs and scale
    # Don't do this if using ICU data because we're using a Pipeline in that case
    # that handles this stuff
    if ('linear' in mname or 'nn' in mname or 'cwcf' in mname or 'tab' in mname
            or 'node' in mname) and (dname != 'icu'):
        imputer = impute.SimpleImputer()
        scaler = preprocessing.StandardScaler()
        Xtrain_np = scaler.fit_transform(imputer.fit_transform(Xtrain))
        Xvalid_np = scaler.transform(imputer.transform(Xvalid))
        Xtest_np = scaler.transform(imputer.transform(Xtest))

        # Write the processed values back into the existing frames so that
        # column names and indices are preserved
        for df, npy in zip([Xtrain, Xvalid, Xtest],
                           [Xtrain_np, Xvalid_np, Xtest_np]):
            df.iloc[:] = npy

    # Concatenated data for post-tuning
    Xtv = pd.concat([Xtrain, Xvalid])
    ytv = np.hstack((ytrain, yvalid))

    # Grouped costs for datasets that feature it
    unique_costs = np.array([
        costs[groups == g].mean() for g in np.unique(groups)
    ]) if (dname == 'outpatient') or (
        dname == 'icu' and mname in ('linear', 'linearh', 'nn')) else costs

    ##################
    # PARAMETER TUNING
    ##################
    # If we've precomputed best parameters, just load those
    if TUNING == 'LOAD' and (
        ('gbm' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')) or
        ('linear' in mname) or ('nn' in mname) or ('tab' in mname)):
        loadname = 'gbmsage' if mname == 'cegb' else mname
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load parameter files this project wrote itself.
        with open(f'{OUTPATH}/{loadname}-{dname}-{rseed}.pkl', 'rb') as fh:
            model = pickle.load(fh)
    # Otherwise do some parameter tuning
    else:
        # Tune GBM
        if ('gbm' in mname) or (mname
                                in ('cegb', 'fixedmodel', 'imputemodel')):
            model = tune(Xtrain, Xvalid, ytrain, yvalid)
        # Linear model needs onehotencoding pipeline if we're doing ICU
        elif ('linear' in mname):
            if (dname == 'icu'):
                model = lintune(Xtrain,
                                Xvalid,
                                ytrain,
                                yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                model = lintune(Xtrain, Xvalid, ytrain, yvalid)
        # NN model needs onehotencoding pipeline if we're doing ICU
        elif 'nn' in mname:
            if (dname == 'icu'):
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               mfunc=icu_preprocessing(get_tf_model),
                               return_extras=False)
            else:
                model = tftune(Xtrain,
                               Xvalid,
                               ytrain,
                               yvalid,
                               return_extras=False)
        elif 'node' in mname:
            model = nodetune(Xtrain,
                             Xvalid,
                             ytrain,
                             yvalid,
                             mfunc=(icu_preprocessing(NodeClassifier)
                                    if dname == 'icu' else NodeClassifier))
            if dname != 'icu':
                # Refit with the tuned parameters to record a test score
                bst = NodeClassifier(**model)
                bst.fit(Xtrain, ytrain, eval_set=(Xvalid, yvalid))
                iXtest = bst.dataset.transform(Xtest)
                preds = bst.predict_proba(iXtest)[:, 1]
                score = roc_auc_score(ytest, preds)
                model['test_score'] = score
        elif ('tab' in mname):
            cat_name_map = {
                'trauma': [
                    'agencylevelfromscene', 'agencymodefromscene', 'ageunits',
                    'causecode', 'ethnicity', 'formfromscene', 'race',
                    'residencestate', 'scenedestinationreason',
                    'scenerespassisted', 'sex'
                ]
            }
            # Positions and cardinalities of the categorical columns, both
            # listed in Xtrain column order
            cat_idx_map = {
                'trauma': [
                    i for i, c in enumerate(Xtrain.columns)
                    if c in cat_name_map['trauma']
                ]
            }
            cat_dim_map = {
                'trauma': [
                    Xtrain[c].unique().shape[0]
                    for i, c in enumerate(Xtrain.columns)
                    if c in cat_name_map['trauma']
                ]
            }
            if (dname == 'icu'):
                model = tabtune(Xtrain.values,
                                Xvalid.values,
                                ytrain,
                                yvalid,
                                mfunc=icu_preprocessing(get_linear_model))
            else:
                # cat_idxs takes the column positions (cat_idx_map); the
                # data is passed as a bare array, so names cannot index it
                model = tabtune(Xtrain.values,
                                Xvalid.values,
                                ytrain,
                                yvalid,
                                cat_idxs=cat_idx_map.get(dname, []),
                                cat_dims=cat_dim_map.get(dname, []),
                                cat_emb_dim=2,
                                return_score=True)
    # If we indicated we want to save the model, do so
    if TUNING == 'SAVE' and (('gbm' in mname) or
                             (mname in ('cegb', 'fixedmodel', 'imputemodel'))
                             or ('linear' in mname) or ('nn' in mname) or
                             ('tab' in mname) or ('node' in mname)):
        with open(f'{OUTPATH}/{mname}-{dname}-{rseed}.pkl', 'wb') as fh:
            pickle.dump(model, fh)
            # In SAVE mode the process stops once the parameters are written
            exit()

    ##################
    # Setup for CoAI
    ##################
    # Instantiate predictive models
    if ('gbm' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        bst = lgb.LGBMClassifier(**model)
    elif 'linear' in mname:
        bst = icu_preprocessing(FastLinearClassifier)(
            **model) if dname == 'icu' else FastLinearClassifier(**model)
    elif 'nn' in mname:
        bst = icu_preprocessing(get_fast_keras)(
            **model) if dname == 'icu' else get_fast_keras(**model)

    # Get our explainer (using SAGE entirely now, shap is old & may not work perfectly)
    if ('sage' in mname) or (mname in ('cegb', 'fixedmodel', 'imputemodel')):
        exp = labelless_sage_wrapper(imputetype='marginal',
                                     refsize=64,
                                     batch_size=32,
                                     wrap_categorical=(dname == 'icu'))
    elif mname == 'gbmshap':
        exp = OneDimExplainer
    elif mname == 'linearshap':
        exp = get_pipeline_explainer(LinearExplainer)

    # Prepare to shuffle costs if required
    if shuffle_params is not None:
        if ((shuffle_params[0] < 0) and (shuffle_params[1] < 0)):
            costs, shuffle_costs = cost_pair(-shuffle_params[0],
                                             -shuffle_params[1], Xtrain)
        else:
            shuffle_costs = cost_swaps(costs, shuffle_params[0],
                                       shuffle_params[1])
    # Pick thresholds for CoAI
    dthresh = np.linspace(0, np.sum(unique_costs) + 1, 100)

    #####################
    # Actually train/test
    #####################
    if 'sage' in mname or 'shap' in mname:
        GRP = knapsack.GroupOptimizer(bst,
                                      exp,
                                      scale_ints=1000 * 100 if
                                      ('sage' in mname) else 1000)
        if 'nn' in mname:
            if dname == 'icu':
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        model__epochs=10,
                        model__verbose=False)
            else:
                GRP.fit(Xtv,
                        ytv,
                        costs,
                        groups,
                        dthresh,
                        epochs=10,
                        verbose=False)
        else:
            GRP.fit(Xtv, ytv, costs, groups, dthresh)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        if shuffle_params: GRP.recalculate_costs(shuffle_costs)
    elif 'fixed' in mname:
        bst = bst.fit(Xtv, ytv)
        GRP = knapsack.FixedModelExactRetainer(bst, exp)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params: GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    elif 'impute' in mname:
        imputer = impute.IterativeImputer(random_state=0,
                                          estimator=linear_model.RidgeCV())
        bst = bst.fit(Xtv, ytv)
        imputer.fit(Xtv)
        GRP = knapsack.FixedModelImputer(bst, exp, imputer)
        GRP.fit(Xtv, ytv, costs, dthresh)
        if shuffle_params: GRP.refit(Xtv, ytv, shuffle_costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
    elif mname == 'cegb':
        GRP = cegb.CEGBOptimizer(model=bst, lambdas=np.logspace(-5, 5, 101))
        GRP.fit(Xtv, ytv, costs)
        GRP.score_models_proba(Xtest, ytest, roc_auc_score)
        if dname == 'outpatient': GRP.recalculate_costs(costs, groups)
        if shuffle_params: GRP.recalculate_costs(shuffle_costs)
    elif mname == 'cwcf':
        ytrain, yvalid, ytest = [np.array(x) for x in (ytrain, yvalid, ytest)]
        print('Training CWCF...')
        lmbds = np.hstack([np.logspace(-14, 1, 16) for _ in range(2)])
        GRP = cwcf.get_cwcf(Xtrain,
                            Xvalid,
                            Xtest,
                            ytrain,
                            yvalid,
                            ytest,
                            costs,
                            lmbds,
                            gpus=list(range(8)),
                            njobs=32,
                            dirname=config.CWCF_TMPDIR,
                            metric=roc_auc_score,
                            difficulty=1000)
        print('Done')
    elif mname in ('aps', 'apacheiii', 'apacheiva'):
        strain, svalid, stest = aps_baselines()
        mpreds = stest
        mpreds = bootstrap_set(mpreds, rseed=rseed)
        preds = mpreds[mname]
        score = roc_auc_score(ytest, preds)
        cost = config.EICU_SCORE_COSTS[mname]
        # A lambda is used as a bare object to hang result attributes on
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(preds)
    # ``==`` rather than ``in ('qsofa')``: the parenthesised string is not a
    # tuple, so ``in`` would be a substring test
    elif mname == 'qsofa':
        qtest = qsofa_score()
        qpreds = bootstrap_set(qtest, rseed=rseed)
        score = roc_auc_score(ytest, qpreds)
        cost = config.EICU_SCORE_COSTS[mname]
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array([cost]), np.array([score])
        GRP.test_preds = np.array(qpreds)
    elif mname == 'pact':
        cost, score, _, _, _, _, preds = pact_score(Xtrain, Xvalid, Xtest,
                                                    ytrain, yvalid, ytest,
                                                    costs)
        GRP = lambda x: x
        GRP.model_costs, GRP.model_scores = np.array(cost), np.array(score)
        GRP.test_preds = np.array(preds)
    else:
        raise ValueError("Model name not found!")

    # Done
    return GRP