from functools import partial

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# set_random_seed, balanced_sample, get_from_file_autoencoder and
# recode_variables are project helpers (set_random_seed is imported from
# project.helper_functions in the script further below).


def get_data_for_var(var,
                     X_train_original,
                     y_train_original,
                     encoded,
                     db_path,
                     model_path,
                     age=False):
    set_random_seed()

    # rows where the target variable is missing are excluded from training
    train_nan = y_train_original[var].isnull()

    X_train_bal, y_train_bal, y_train_ind = balance_x_y(
        X_train_original.loc[~train_nan, :],
        y_train_original.loc[~train_nan, var])

    if encoded:
        # project the features into the autoencoder's latent space
        trained_autoencoder = get_from_file_autoencoder(db_path, model_path)
        X_train_bal = trained_autoencoder.encoder.predict(X_train_bal)
        if age:
            # append standardized age as an extra feature; column 15 follows
            # the encoder's (assumed) 15 latent dimensions
            X_train_bal = pd.DataFrame(X_train_bal)
            X_train_bal[15] = StandardScaler().fit_transform(
                y_train_original.loc[y_train_ind, ['Age at recruitment']])
            #X_train_bal[16] = y_train_original.loc[y_train_ind, ['Sites']].reset_index(drop=True)

            X_train_bal = X_train_bal.values
    return X_train_bal, y_train_bal
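
# A minimal usage sketch; the variable name and paths are hypothetical
# stand-ins, not values taken from this project:
#
#     X_bal, y_bal = get_data_for_var('some_target_var',
#                                     X_train, y_train,
#                                     encoded=True,
#                                     db_path='./results/models.db',
#                                     model_path='./results/autoencoder',
#                                     age=True)
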
def impute_random(arr_in):
    """Replace NaNs with values drawn at random from the observed entries."""
    set_random_seed()
    arr = arr_in.copy()  # do not modify the input in place
    arr_is_nan = np.isnan(arr)  # boolean mask: True where the value is NaN
    n_nan = arr_is_nan.sum()
    arr_without_nan = arr[~arr_is_nan]
    arr_fill = np.random.choice(arr_without_nan, size=n_nan)
    arr[arr_is_nan] = arr_fill
    return arr
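
# For illustration (values arbitrary; the filled entries depend on the seed):
#
#     >>> impute_random(np.array([1.0, np.nan, 3.0]))
#     array([1., 3., 3.])  # the NaN was replaced by a randomly drawn observed value
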
def balance_x_y(x, y):
    set_random_seed()
    y_ind = y.index.values  # keep the original index for later lookups
    # reset to a positional index so the 0/1 mask from balanced_sample
    # aligns with both x and y
    x = x.reset_index(drop=True)
    y = y.reset_index(drop=True)
    select = balanced_sample(y)
    y = y[select == 1]
    x = x.loc[select == 1]
    y_ind = y_ind[select == 1]
    return x, y, y_ind
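
# balanced_sample is not defined in this listing. A minimal sketch of the
# behavior assumed here (a 0/1 mask that downsamples every class to the
# size of the rarest one) could look like:
#
#     def balanced_sample(y):
#         counts = y.value_counts()
#         n_min = counts.min()
#         select = np.zeros(len(y), dtype=int)
#         for label in counts.index:
#             idx = np.flatnonzero(y.values == label)
#             keep = np.random.choice(idx, size=n_min, replace=False)
#             select[keep] = 1
#         return select
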
def prepare_x_y(volumes, y, recodings):
    set_random_seed()
    volumes_ = volumes.iloc[:, 1:]  # 0th column is eid, not needed here

    y_selected = (
        y.iloc[:, 1:]  # 0th column is eid, not needed here
        # negative values encode the reason a measurement is missing
        .applymap(lambda x: np.nan if x < 0 else x)
        # fill NaNs column-wise, then recode each row's answers
        .apply(impute_random)
        .apply(partial(recode_variables, recodings=recodings), axis=1))
    return volumes_, y_selected
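
# For illustration (toy frames; recode_variables and `recodings` are
# project-specific and not shown in this listing):
#
#     y_raw = pd.DataFrame({'eid': [1, 2, 3],
#                           'mood': [2.0, -1.0, 0.0]})  # -1 encodes a missing reason
#     vols = pd.DataFrame({'eid': [1, 2, 3], 'v1': [0.1, 0.2, 0.3]})
#     volumes_, y_sel = prepare_x_y(vols, y_raw, recodings)
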
def train_model(model, X_train, y_train, hyper_param_grid, n_bootstrap=100):
    """Nested cross-validation with bootstrapping.

    An outer 5-split ShuffleSplit estimates generalization; within each
    outer fold the training indices are bootstrap-resampled n_bootstrap
    times and a GridSearchCV (the inner CV) is fitted on each sample.
    """
    ls_df_best_ests = []
    set_random_seed()
    outer_fold = ShuffleSplit(n_splits=5, train_size=.8)
    for n_fold, (train_outer_index, test_outer_index) in enumerate(
            outer_fold.split(X_train, y_train)):

        X_outer_test = pd.DataFrame(X_train).iloc[test_outer_index]
        y_outer_test = y_train.iloc[test_outer_index]

        for _ in range(n_bootstrap):
            # draw a bootstrap sample (with replacement) from the
            # outer-fold training indices
            train_bootstrap_indx = resample(train_outer_index, replace=True)
            X_bootstrap_train = pd.DataFrame(
                X_train).iloc[train_bootstrap_indx]
            y_bootstrap_train = y_train.iloc[train_bootstrap_indx]

            LR = GridSearchCV(model,
                              param_grid=hyper_param_grid,
                              cv=7,
                              n_jobs=3,
                              return_train_score=True)

            LR.fit(X_bootstrap_train, y_bootstrap_train)
            best_ind = LR.best_index_
            df = pd.DataFrame(LR.cv_results_)

            # keep only the per-split train/test score columns; the
            # aggregate (mean/std/rank) columns are dropped to save memory
            columns = [
                col for col in df.columns
                if (('test' in col or 'train' in col)
                    and not ('std' in col or 'mean' in col or 'rank' in col))
            ]

            df = df.loc[best_ind, columns]
            df['best_est'] = LR.best_estimator_
            df['outer_test_score'] = LR.best_estimator_.score(
                X_outer_test, y_outer_test)
            df['inner_score'] = LR.best_estimator_.score(
                X_bootstrap_train, y_bootstrap_train)

            df['n_fold'] = n_fold
            # classification_report keys per-class metrics by the string
            # form of each label; class labels are assumed to be the
            # consecutive floats 0.0, 1.0, ...
            class_rep = classification_report(
                y_outer_test,
                LR.best_estimator_.predict(X_outer_test),
                output_dict=True)
            for i in range(len(np.unique(y_outer_test))):
                df[f'f1_class_{i}'] = class_rep[str(float(i))]['f1-score']

            ls_df_best_ests.append(df)

    return ls_df_best_ests
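
# A minimal usage sketch (the estimator and grid are illustrative choices,
# not taken from this project):
#
#     from sklearn.linear_model import LogisticRegression
#
#     results = train_model(LogisticRegression(max_iter=1000),
#                           X_train_bal, pd.Series(y_train_bal),
#                           hyper_param_grid={'C': [0.01, 0.1, 1.0]},
#                           n_bootstrap=10)
#     df_results = pd.concat(results, axis=1).T  # one row per bootstrap fit

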
import sys
from itertools import product

from keras import metrics
from keras.optimizers import RMSprop, SGD, Adam, Adagrad

from project.models.autoencoders import SymmetricAutoencoder
from project.helper_functions import set_random_seed, create_hyperparam_list, z_standardize

import joblib

data_path = sys.argv[1]  # e.g. './data/discovery_data/train_dump_sMRI_socialbrain_sym_r2.5_s5'
save_path = sys.argv[2]  # e.g. './results/save/unsupervised/'

set_random_seed()

vols_raw = joblib.load(data_path)
vols_standard = z_standardize(vols_raw)
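
# z_standardize (from project.helper_functions) presumably scales each
# column to zero mean and unit variance; an equivalent sketch:
#
#     def z_standardize(arr):
#         return (arr - arr.mean(axis=0)) / arr.std(axis=0)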

# Every value is wrapped in a list so the dict can be expanded into a
# hyperparameter grid (one configuration per combination of values).
default_init_params = {
    'units': [[15, 36]],  # layer sizes; 15 is presumably the latent dimension
    'activations': [[None, None]],
    'use_biases': [[False]],
    'reg_l1': [0],
    'reg_l2': [0],
    'reg_cov': [0],
    'tied_weights': [False],
    'input_shape': [(36, )],
    'batch_size': [36],
}
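
# The grid is presumably expanded with create_hyperparam_list (imported
# above; its signature is not shown here). The equivalent expansion with
# the already-imported itertools.product would be:
#
#     keys, values = zip(*default_init_params.items())
#     param_list = [dict(zip(keys, combo)) for combo in product(*values)]
#     # with the defaults above this yields a single configuration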