from functools import partial

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# set_random_seed, balanced_sample, recode_variables and
# get_from_file_autoencoder are project-internal helpers; set_random_seed is
# imported from project.helper_functions in the training script further down,
# and the others are assumed to come from the same package.


def get_data_for_var(var, X_train_original, y_train_original, encoded,
                     db_path, model_path, age=False):
    """Build a class-balanced training set for one target variable."""
    set_random_seed()
    train_nan = y_train_original[var].isnull()  # drop subjects missing the target
    X_train_bal, y_train_bal, y_train_ind = balance_x_y(
        X_train_original.loc[train_nan == False, :],
        y_train_original.loc[train_nan == False, var])
    if encoded:
        # replace raw volumes with the autoencoder's latent representation
        trained_autoencoder = get_from_file_autoencoder(db_path, model_path)
        X_train_bal = trained_autoencoder.encoder.predict(X_train_bal)
    if age:
        # append standardized age as an extra input column
        X_train_bal = pd.DataFrame(X_train_bal)
        X_train_bal[15] = StandardScaler().fit_transform(
            y_train_original.loc[y_train_ind, ['Age at recruitment']])
        # X_train_bal[16] = y_train_original.loc[y_train_ind, ['Sites']].reset_index(drop=True)
        X_train_bal = X_train_bal.values
    return X_train_bal, y_train_bal
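# Usage sketch (hypothetical target name and paths; illustration only):
#
#   X_bal, y_bal = get_data_for_var('Smoking status', X_train, y_train,
#                                   encoded=True, db_path='./db',
#                                   model_path='./models/autoencoder',
#                                   age=True)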
def impute_random(arr_in):
    """Fill NaNs with random draws from the observed values of the same array."""
    set_random_seed()
    arr = arr_in.copy()  # copy so the input is not changed in place
    arr_is_nan = np.isnan(arr)  # boolean mask: True where nan
    n_nan = arr_is_nan.sum()
    arr_without_nan = arr[~arr_is_nan]
    arr_fill = np.random.choice(arr_without_nan, size=n_nan)
    arr[arr_is_nan] = arr_fill
    return arr
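# REPL illustration (not part of the pipeline; the exact fill values depend
# on the seed set by set_random_seed):
#
#   >>> impute_random(np.array([1.0, np.nan, 3.0, np.nan, 5.0]))
#   # -> the two NaNs are replaced by random picks from the observed
#   #    values {1.0, 3.0, 5.0}; the non-NaN entries are left untouched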
def balance_x_y(x, y):
    set_random_seed()
    y_ind = y.index.values
    x = x.reset_index(drop=True)  # temp
    y = y.reset_index(drop=True)  # temp
    select = balanced_sample(y)
    y = y[select == 1]
    x = x.loc[select == 1]
    y_ind = y_ind[select == 1]
    return x, y, y_ind
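# `balanced_sample` is a project-internal helper not shown in this section.
# A hypothetical sketch of the behaviour balance_x_y relies on, assuming
# simple random undersampling down to the rarest class:
#
#   def balanced_sample(y):
#       counts = y.value_counts()
#       n_min = counts.min()                      # size of the rarest class
#       select = np.zeros(len(y), dtype=int)
#       for label in counts.index:
#           idx = np.flatnonzero(y.values == label)
#           select[np.random.choice(idx, size=n_min, replace=False)] = 1
#       return select                             # 1 keeps the row, 0 drops it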
def prepare_x_y(volumes, y, recodings):
    set_random_seed()
    volumes_ = volumes.iloc[:, 1:]  # 0th column is eid, not needed here
    y_selected = (
        y.iloc[:, 1:]  # 0th column is eid, not needed here
        .applymap(lambda x: np.nan if x < 0 else x)  # values below zero code for missing reasons
        .apply(impute_random)
        .apply(partial(recode_variables, recodings=recodings), axis=1))
    return volumes_, y_selected
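# Worked REPL example of the negative-code step (illustration only; the
# values and column name are made up). Codes below zero become NaN, which
# impute_random then fills column-wise:
#
#   >>> y_demo = pd.Series([0.0, -1.0, 2.0, -3.0], name='var')
#   >>> y_demo.map(lambda x: np.nan if x < 0 else x)
#   0    0.0
#   1    NaN
#   2    2.0
#   3    NaN
#   Name: var, dtype: float64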
def train_model(model, X_train, y_train, hyper_param_grid, n_bootstrap=100):
    """Nested evaluation: 5 outer shuffle splits, bootstrapped grid searches inside."""
    ls_df_best_ests = []
    set_random_seed()
    outer_fold = ShuffleSplit(n_splits=5, train_size=.8)
    for n_fold, (train_outer_index, test_outer_index) in enumerate(
            outer_fold.split(X_train, y_train)):
        X_outer_test = pd.DataFrame(X_train).iloc[test_outer_index]
        y_outer_test = y_train.iloc[test_outer_index]
        for _ in range(n_bootstrap):
            # draw a bootstrap sample (with replacement) from the outer training split
            train_bootstrap_indx = resample(train_outer_index, replace=True)
            X_bootstrap_train = pd.DataFrame(
                X_train).iloc[train_bootstrap_indx]
            y_bootstrap_train = y_train.iloc[train_bootstrap_indx]
            LR = GridSearchCV(model, param_grid=hyper_param_grid, cv=7,
                              n_jobs=3, return_train_score=True)
            LR.fit(X_bootstrap_train, y_bootstrap_train)
            best_ind = LR.best_index_
            df = pd.DataFrame(LR.cv_results_)
            # keep only the per-split train/test scores to save memory
            columns = [
                col for col in df.columns
                if (('test' in col or 'train' in col)
                    and not ('std' in col or 'mean' in col or 'rank' in col))
            ]
            df = df.loc[best_ind, columns]
            df['best_est'] = LR.best_estimator_
            df['outer_test_score'] = LR.best_estimator_.score(
                X_outer_test, y_outer_test)
            df['inner_score'] = LR.best_estimator_.score(
                X_bootstrap_train, y_bootstrap_train)
            df['n_fold'] = n_fold
            # per-class f1 on the held-out outer test split
            class_rep = classification_report(
                y_outer_test,
                LR.best_estimator_.predict(X_outer_test),
                output_dict=True)
            for i in range(len(np.unique(y_outer_test))):
                df[f'f1_class_{i}'] = class_rep[str(float(i))]['f1-score']
            ls_df_best_ests.append(df)
    return ls_df_best_ests
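# Usage sketch (hypothetical model and grid; train_model accepts any
# estimator that GridSearchCV can wrap):
#
#   from sklearn.linear_model import LogisticRegression
#   grid = {'C': [0.01, 0.1, 1.0, 10.0]}
#   results = train_model(LogisticRegression(max_iter=1000),
#                         X_train_bal, pd.Series(y_train_bal),
#                         hyper_param_grid=grid)
#   # each entry is one bootstrap's best-estimator record; stack for analysis:
#   df_results = pd.concat(results, axis=1).T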
import sys
from itertools import product

import joblib
from keras import metrics
from keras.optimizers import RMSprop, SGD, Adam, Adagrad

from project.models.autoencoders import SymmetricAutoencoder
from project.helper_functions import set_random_seed, create_hyperparam_list, z_standardize

data_path = sys.argv[1]  # e.g. './data/discovery_data/train_dump_sMRI_socialbrain_sym_r2.5_s5'
save_path = sys.argv[2]  # e.g. './results/save/unsupervised/'

set_random_seed()
vols_raw = joblib.load(data_path)
vols_standard = z_standardize(vols_raw)

# single default configuration; each key holds a list so the grid can be
# expanded into hyperparameter combinations
default_init_params = {
    'units': [[15, 36]],
    'activations': [[None, None]],
    'use_biases': [[False]],
    'reg_l1': [0],
    'reg_l2': [0],
    'reg_cov': [0],
    'tied_weights': [False],
    'input_shape': [(36, )],
    'batch_size': [36],
}
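# `create_hyperparam_list` is a project helper whose source is not shown
# here; a minimal sketch of the expansion it presumably performs, using the
# itertools.product import above (every combination of the per-key lists
# becomes one init-parameter dict for SymmetricAutoencoder):
#
#   def create_hyperparam_list_sketch(param_grid):
#       keys = list(param_grid)
#       return [dict(zip(keys, combo))
#               for combo in product(*(param_grid[k] for k in keys))]
#
# With default_init_params above, every value list has length 1, so this
# expansion yields exactly one configuration.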