##################################
# on small runs
# ytrain[5:8] = 1
# ytest[5:8] = 1
##################################

#### balancing data ####
# Resample the majority class down to the minority class to achieve balance.
# New names are used in order not to interfere with hyperopt.
if not Smote:  # TODO: if undersampling only
    Xtrain_new, ytrain_new = rand_undersample(Xtrain, ytrain, arg='majority',
                                              state=random_state_val, multi=False)
else:  # TODO: if SMOTE + undersampling
    # Using a mix of undersampling and SMOTE
    Xtrain_new, ytrain_new = balanceData(Xtrain, ytrain, ratio=2,
                                         random_state_val=random_state_val)

# plot Normal
NoAug = Xtrain_new[np.where(ytrain_new == 1)]
MeanData['NoAug'][artifact_name][fold] = NoAug
# plotSpectro(NoAug, min_val, max_val, artifact_names[artifact], AugMethod="None")

# plot GAN
GAN_X, GAN_y = useGAN(Xtrain_new, ytrain_new, aug_ratio, GAN_epochs=GAN_epochs,
                      experiment_name="Spectrograms")
GAN_y = GAN_y[len(Xtrain_new):]
GAN_X = GAN_X[len(Xtrain_new):][np.where(GAN_y == 1)]
MeanData['GAN'][artifact_name][fold] = GAN_X
# plotSpectro(GAN_X, min_val, max_val, artifact_names[artifact], AugMethod="GAN")
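
# A minimal sketch of what the rand_undersample helper used above could look like,
# assuming it wraps imblearn's RandomUnderSampler; this is an assumption about the
# helper's internals, not the project's actual implementation. The signature mirrors
# the call sites in this file.
from imblearn.under_sampling import RandomUnderSampler

def rand_undersample_sketch(X, y, arg='majority', state=0, multi=False):
    # 'arg' maps to sampling_strategy; 'majority' downsamples only the majority class.
    # 'multi' is kept for call-site compatibility; imblearn handles multiclass natively.
    sampler = RandomUnderSampler(sampling_strategy=arg, random_state=state)
    return sampler.fit_resample(X, y)
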
def runActivePipeline(self, model, HO_evals, active_ratio, smote_ratios, aug_ratios,
                      experiment, experiment_name, init_amount_percent, n_pr_query_percent,
                      artifact_names=None, GAN_epochs=100, noise_experiment=None,
                      randomSampler=False, DelNan_noiseFiles=False, fast_run=False,
                      K=5, random_state=0, save_y_true=False):
    """
    Parameters:

    :param model (str): Model name. Should follow the naming of the models defined in the
        models.models.py script, e.g. 'LR'.
    :param HO_evals (int): Number of evaluations in the Bayesian optimization run with Hyperopt.
    :param active_ratio (float): Fraction of the training pool that may be labeled before the
        active-learning query loop stops.
    :param smote_ratios (np.array(float)): Array of floats. To create 100% additional samples of
        the minority class, set the value to 1; 50% upsampling corresponds to 0.5. Multiple
        values are supported.
    :param aug_ratios (np.array(float)): Array of floats. To create 100% augmented data relative
        to the original data, set the value to 1; 50% extra augmented data corresponds to 0.5.
        Multiple values are supported.
    :param experiment (str): Name of the wider experiment, e.g. MixUp_experiment_No5.
    :param experiment_name (str): Name of the pickles created. When augmenting, it should end
        with _GAN / _Noise / _MixUp, so that the following check works:
            if experiment_name.split("_")[-1] == 'GAN':
        When running the pipeline on a single artifact, write this manually in experiment_name,
        e.g. experiment + model + 'Null' + aug_method.
    :param init_amount_percent (float): Fraction of the training pool used as the initial
        labeled set.
    :param n_pr_query_percent (float): Fraction of the training pool queried per
        active-learning round.
    :param artifact_names (list(str)): List of the artifacts to investigate. Defaults to
        ['eyem', 'chew', 'shiv', 'elpp', 'musc', 'null'], as the study is carried out on the
        TUH EEG Artifact corpus.
    :param GAN_epochs (int): Number of epochs when the GAN augmentation technique is used.
    :param noise_experiment (str): Directory containing the noise pickles to use. Should be
        None when not experimenting with the Noise Addition augmentation technique.
    :param randomSampler (bool): If True, pool points are queried uniformly at random instead
        of by the expected-model-change (EMC) criterion.
    :param DelNan_noiseFiles (bool): False by default. Set to True when the data contains
        NaN values.
    :param fast_run (bool): True for a quick run-through of the pipeline to detect bugs or
        test new functions.
    :param K (int): Number of folds in the cross-validation (CV). Default is 5.
    :param random_state (int): Random seed for functions such as train_test_split and KFold CV.
    :param save_y_true (bool): True if a pickle of the true y-values in each CV fold should
        be saved.

    Returns:

    :return: Saves pickles of dictionaries of results from an experiment.
        The "results_x_.npy" file is a nested dictionary holding accuracy, F2-score,
        sensitivity and the predicted labels on the test data, across the specified
        augmentation ratios, SMOTE ratios and CV folds for the specified artifact(s).
        The "ho_trials_x_.npy" file is a nested dictionary holding the Hyperopt trials
        across the specified augmentation ratios, SMOTE ratios and CV folds for the
        specified artifact(s).
        The "y_true_x_.npy" file is a nested dictionary holding the true y-labels across
        CV folds for the specified artifact(s).
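
    Example (illustrative only; the argument values below are assumptions, not defaults,
    and `pipeline` stands for an instance of the enclosing pipeline class):

        pipeline.runActivePipeline(model='LR', HO_evals=25, active_ratio=0.5,
                                   smote_ratios=np.array([0.0]), aug_ratios=np.array([0.0]),
                                   experiment='Active_experiment_No1',
                                   experiment_name='_LR_active',
                                   init_amount_percent=0.05, n_pr_query_percent=0.05)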
""" X = LoadNumpyPickles(pickle_path=self.pickle_path, file_name=self.X_file, windowsOS=self.windowsOS) y = LoadNumpyPickles(pickle_path=self.pickle_path, file_name=self.y_file, windowsOS=self.windowsOS) ID_frame = LoadNumpyPickles(pickle_path=self.pickle_path, file_name=self.ID_file, windowsOS=self.windowsOS) # extract a subset for faster running time # X, y, ID_frame = subset(X, y, ID_frame, no_indiv=30) # apply the inclusion principle X, y, ID_frame = binary(X, y, ID_frame) # The KFold will be splitted by individuals = np.unique(ID_frame) # Choose model model_dict = self.full_model_dict[model] HO_evals = HO_evals pickle_path_aug = self.pickle_path + r"\augmentation_pickles" if noise_experiment != None: X_noise, y_noise, ID_frame_noise = prepareNoiseAddition( pickle_path_aug, noise_experiment, self.X_file, self.y_file, self.ID_file, windowsOS=self.windowsOS) if DelNan_noiseFiles: X, y, ID_frame, X_noise, y_noise, ID_frame_noise = DeleteNanNoise( X, y, ID_frame, X_noise, y_noise, ID_frame_noise, save_path=pickle_path_aug + noise_experiment, windowsOS=self.windowsOS) # Dictionary holding keys and values for all functions from the models.py file. Used to "look up" functions in the CV # and hyperoptimization part function_dict = ActiveModels.__dict__ random_state_val = random_state #### define classes #### if artifact_names is None: artifact_names = ['eyem', 'chew', 'shiv', 'elpp', 'musc', 'null'] classes = len(artifact_names) # for hyperopt data to save def unpack(x): if x: return x[0] return np.nan ho_trials = {} # fold, artifact, model, hyperopt iterations results = {} # fold, artifact, model, scores y_true_dict = {} active_results = {} # setting fold details kf = KFold( n_splits=K, random_state=random_state_val, shuffle=True ) # random state + shuffle ensured same repeated experiments for aug_ratio in aug_ratios: if aug_ratio != 0: print("\n####---------------------------------------####") print("Running a", aug_ratio, "ratio of (augmented / real) using", experiment_name.split("_")[-1]) print("####---------------------------------------####") #### Initializing dict for this Augmenation ratio: ho_trials[aug_ratio] = {} results[aug_ratio] = {} active_results[aug_ratio] = {} for ratio in smote_ratios: ratio += 1 if ratio != 1: print("\n####---------------------------------------####") print("Running a", ratio - 1, "ratio of SMOTE") print("####---------------------------------------####") i = 0 # CV fold index cross_val_time_start = time() #### initializing dict for this ratio #### ho_trials[aug_ratio][ratio] = {} # for this fold results[aug_ratio][ratio] = {} active_results[aug_ratio][ratio] = {} for train_idx, test_idx in kf.split(individuals): # single loop # while i < 1: # trainindv, testindv = train_test_split(individuals, test_size=0.20, random_state=random_state_val, shuffle = True) # REMEMBER to # the other below print("\n-----------------------------------------------") print("Running {:d}-fold CV - fold {:d}/{:d}".format( K, i + 1, K)) print("-----------------------------------------------") #### initializing data #### # their IDs trainindv = individuals[train_idx] testindv = individuals[test_idx] # their indexes in train and test train_indices = [ i for i, ID in enumerate(ID_frame) if ID in trainindv ] test_indices = [ i for i, ID in enumerate(ID_frame) if ID in testindv ] testID_list = [ ID for i, ID in enumerate(ID_frame) if ID in testindv ] X_test, y_test = X[test_indices, :], y[test_indices] # Preparing for active learning Xpool_orig, ypool_orig = X[ train_indices, :], 
                init_amount = int(np.round(init_amount_percent * len(Xpool_orig), 0))
                n_pr_query = int(np.round(n_pr_query_percent * len(Xpool_orig), 0))

                order = np.random.permutation(range(len(Xpool_orig)))
                trainset_orig = order[:init_amount]
                X_train_orig = np.take(Xpool_orig, trainset_orig, axis=0)
                y_train_orig = np.take(ypool_orig, trainset_orig, axis=0)
                poolidx_orig = np.arange(len(Xpool_orig), dtype=int)
                poolidx_orig = np.setdiff1d(poolidx_orig, trainset_orig)

                # TODO: These empty lists are positioned wrongly; they should be inside
                # the artifact loop.
                trainData_balance = []
                testacc_al = []
                testF2_al = []
                testSens_al = []

                #### initializing dicts for this fold ####
                ho_trials[aug_ratio][ratio][i] = {}
                results[aug_ratio][ratio][i] = {}
                active_results[aug_ratio][ratio][i] = {}
                y_true_dict[i] = {}

                #### for each artifact ####
                for artifact in range(classes):
                    print("\nTraining on the class: " + artifact_names[artifact] + "\n")

                    trainset = trainset_orig
                    Xpool, ypool = Xpool_orig, ypool_orig
                    X_train, y_train = X_train_orig, y_train_orig
                    poolidx = poolidx_orig
                    query_round = 0

                    while len(X_train) <= active_ratio * len(Xpool_orig):
                        #### initializing hyperopt split ####
                        # TODO: This is redundant; HO_individuals could probably just be
                        # set to trainindv.
                        train_ID_frame = ID_frame[trainset]
                        HO_individuals = np.unique(train_ID_frame)

                        #### initializing data ####
                        # only include the artifact of interest
                        # new names for the arrays restricted to the current artifact
                        Xtrain = X_train  # only to keep naming consistent
                        Xtest = X_test    # only to keep naming consistent
                        ytrain = y_train[:, artifact]
                        ytest = y_test[:, artifact]

                        ##################################
                        # on small runs
                        # ytrain[5:8] = 1
                        # ytest[5:8] = 1
                        ##################################
                        if fast_run:
                            ytrain[:3] = 1
                            ytest[:3] = 1

                        #### balancing data ####
                        # Resample the majority class down to the minority class to achieve
                        # balance. New names are used in order not to interfere with hyperopt.
                        if ratio == 1:  # if no SMOTE
                            Xtrain_new, ytrain_new = rand_undersample(
                                Xtrain, ytrain, arg='majority',
                                state=random_state_val, multi=False)
                        else:
                            # Using a mix of undersampling and SMOTE
                            Xtrain_new, ytrain_new = balanceData(
                                Xtrain, ytrain, ratio, random_state_val=random_state_val)

                        # %% Data augmentation step:
                        if aug_ratio != 0:
                            if experiment_name.split("_")[-1] == 'GAN':
                                Xtrain_new, ytrain_new = useGAN(
                                    Xtrain_new, ytrain_new, aug_ratio, GAN_epochs,
                                    experiment_name)
                            if experiment_name.split("_")[-1] == "MixUp":
                                Xtrain_new, ytrain_new = useMixUp(
                                    Xtrain_new, ytrain_new, aug_ratio)
                            if experiment_name.split("_")[-1] == "Noise":
                                X_noise_new = X_noise[train_indices, :]
                                y_noise_new = y_noise[train_indices, :]
                                y_noise_new = y_noise_new[:, artifact]
                                Xtrain_new, ytrain_new = useNoiseAddition(
                                    X_noise_new, y_noise_new, Xtrain_new, ytrain_new,
                                    aug_ratio, random_state_val)

                        #### creating test environment ####
                        Xtrain_new, ytrain_new = shuffle(
                            Xtrain_new, ytrain_new, random_state=random_state_val)
                        Xtest, ytest, testID_list = shuffle(
                            Xtest, ytest, testID_list, random_state=random_state_val)
                        env = models(Xtrain_new, ytrain_new, Xtest, ytest,
                                     state=random_state_val)

                        #### initializing validation data for hyperopt ####
                        # The split is made on individuals to avoid leaking windows from
                        # the same subject into both hyperopt train and validation sets.
                        trainindv, testindv = train_test_split(
                            HO_individuals, test_size=0.20,
                            random_state=random_state_val, shuffle=True)
                        # indices of these individuals in train_ID_frame
                        HO_train_indices = [i for i, ID in enumerate(train_ID_frame)
                                            if ID in trainindv]
                        HO_test_indices = [i for i, ID in enumerate(train_ID_frame)
                                           if ID in testindv]
                        # constructing sets. We keep the originals and balance the new
                        # sets later.
                        HO_Xtrain, HO_ytrain = Xtrain[HO_train_indices, :], ytrain[HO_train_indices]
                        HO_Xtest, HO_ytest = Xtrain[HO_test_indices, :], ytrain[HO_test_indices]

                        if fast_run:
                            HO_ytrain[:2] = 1
                            HO_ytest[:2] = 1

                        #### balancing validation data for hyperopt ####
                        if ratio == 1:  # if no SMOTE
                            HO_Xtrain_new, HO_ytrain_new = rand_undersample(
                                HO_Xtrain, HO_ytrain, arg='majority',
                                state=random_state_val, multi=False)
                        else:
                            # Using a mix of undersampling and SMOTE
                            HO_Xtrain_new, HO_ytrain_new = balanceData(
                                HO_Xtrain, HO_ytrain, ratio,
                                random_state_val=random_state_val)

                        if aug_ratio != 0:
                            if experiment_name.split("_")[-1] == 'GAN':
                                HO_Xtrain_new, HO_ytrain_new = useGAN(
                                    HO_Xtrain_new, HO_ytrain_new, aug_ratio, GAN_epochs,
                                    experiment_name)
                            if experiment_name.split("_")[-1] == "MixUp":
                                HO_Xtrain_new, HO_ytrain_new = useMixUp(
                                    HO_Xtrain_new, HO_ytrain_new, aug_ratio)
                            if experiment_name.split("_")[-1] == "Noise":
                                X_noise_new = X_noise[HO_train_indices, :]
                                y_noise_new = y_noise[HO_train_indices, :]
                                y_noise_new = y_noise_new[:, artifact]
                                HO_Xtrain_new, HO_ytrain_new = useNoiseAddition(
                                    X_noise_new, y_noise_new, HO_Xtrain_new,
                                    HO_ytrain_new, aug_ratio, random_state_val)

                        # Hyperopt environment
                        # NOTE: this overwrites the HO_Xtest/HO_ytest constructed above
                        # with the (shuffled) outer test set.
                        HO_Xtrain_new, HO_ytrain_new = shuffle(
                            HO_Xtrain_new, HO_ytrain_new, random_state=random_state_val)
                        HO_Xtest, HO_ytest = shuffle(
                            Xtest, ytest, random_state=random_state_val)
                        HO_env = models(HO_Xtrain_new, HO_ytrain_new, HO_Xtest, HO_ytest,
                                        state=random_state_val)

                        #### initializing dicts for this artifact ####
                        ho_trials[aug_ratio][ratio][i][artifact_names[artifact]] = {}
                        results[aug_ratio][ratio][i][artifact_names[artifact]] = {}
                        y_true_dict[i][artifact_names[artifact]] = {}
                        active_results[aug_ratio][ratio][i][artifact_names[artifact]] = {}

                        # https://medium.com/district-data-labs/parameter-tuning-with-hyperopt-faa86acdfdce
                        # https://towardsdatascience.com/hyperparameter-optimization-in-python-part-2-hyperopt-5f661db91324
                        # http://hyperopt.github.io/hyperopt/getting-started/search_spaces/
                        start_time = time()

                        name, space = model_dict[model]

                        #### HyperOpt ####
                        if space is not None:  # if a hyperopt search space is defined
                            #### initializing dict for this model ####
                            ho_trials[aug_ratio][ratio][i][artifact_names[artifact]][model] = {}

                            print('\nHyperOpt on: ', model)  # print model name
                            trials = Trials()

                            def objective(params):
                                # evaluate in the hyperopt environment; fmin minimizes,
                                # so the sensitivity is negated
                                accuracy, f2_s, sensitivity, y_pred, weights = \
                                    function_dict[name](HO_env, **params)
                                return -sensitivity

                            best = fmin(fn=objective, space=space, algo=tpe.suggest,
                                        max_evals=HO_evals, trials=trials)

                            #### saving evaluations ####
                            ho_trials[aug_ratio][ratio][i][artifact_names[artifact]][model] = \
                                pd.DataFrame([pd.Series(t["misc"]["vals"]).apply(unpack)
                                              for t in trials])
                            ho_trials[aug_ratio][ratio][i][artifact_names[artifact]][model]["sensitivity"] = \
                                [-t["result"]["loss"] for t in trials]

                            print('best parameter/s:', best)

                            # evaluate the best found configuration in the test environment
                            f = function_dict[name](env, **best)

                        # without hyperopt
                        else:  # space is None
                            f = function_dict[name](env)

                        end_time = time()
                        took_time = end_time - start_time
                        print(model + ": \t" + str(f[:3])
                              + ". Time: {:f} seconds".format(took_time))

                        """
                        This part, using the modAL library, is currently broken.
                        AL_env = ActiveModels(X_train=Xtrain, y_train=ytrain,
                                              X_test=Xtest, y_test=ytest,
                                              state=random_state_val)
                        f = AL_env.LR(**best)

                        modAL_learner = ActiveLearner(estimator=AL_env.model,
                                                      # query_strategy=expected_error_reduction,
                                                      X_training=AL_env.X_train,
                                                      y_training=AL_env.y_train)
                        query_idx, query_inst = modAL_learner.query(X_pool=Xpool, n_instances=1)
                        modAL_learner.teach(Xpool[query_idx], ypool[query_idx])
                        """

                        acc, F2, sensitivity, y_pred, weights = f

                        if randomSampler:
                            order_sub = np.random.permutation(poolidx)
                            newIdxsPool = order_sub[:n_pr_query]
                            X_train = np.concatenate((X_train, Xpool[newIdxsPool]))
                            y_train = np.concatenate((y_train, ypool[newIdxsPool]))
                            poolidx = np.setdiff1d(poolidx, newIdxsPool)
                            print('Model: LR, %i samples (Random)'
                                  % (init_amount + query_round * n_pr_query))
                        else:
                            # Expected model change: score the pool by the gradient norm
                            # each point would induce, and query the highest-scoring ones.
                            emc = norm_grad_x_LR(weights, Xpool[poolidx])
                            ypool_p_sort_idx = np.argsort(emc)
                            queried = poolidx[ypool_p_sort_idx[-n_pr_query:]]
                            X_train = np.concatenate((X_train, Xpool[queried]))
                            y_train = np.concatenate((y_train, ypool[queried]))
                            # remove the queried pool indices (not their positions) from the pool
                            poolidx = np.setdiff1d(poolidx, queried)
                            print('Model: LR, %i samples (EMC)'
                                  % (init_amount + query_round * n_pr_query))

                        # class balance (positives / negatives) of the current training labels
                        counts = np.unique(ytrain, return_counts=True)[1]
                        trainData_balance.append((len(Xtrain), counts[1] / counts[0]))
                        testacc_al.append((len(Xtrain), acc))
                        testF2_al.append((len(Xtrain), F2))
                        testSens_al.append((len(Xtrain), sensitivity))
                        query_round += 1

                    active_results[aug_ratio][ratio][i][artifact_names[artifact]][model] = {}
                    active_results[aug_ratio][ratio][i][artifact_names[artifact]][model]['accuracy'] = testacc_al
                    active_results[aug_ratio][ratio][i][artifact_names[artifact]][model]['F2'] = testF2_al
                    active_results[aug_ratio][ratio][i][artifact_names[artifact]][model]['sensitivity'] = testSens_al
                    active_results[aug_ratio][ratio][i][artifact_names[artifact]][model]['balance'] = trainData_balance

                    #### initializing dictionary of results for this model ####
                    results[aug_ratio][ratio][i][artifact_names[artifact]][model] = {}

                    #### saving results ####
                    results[aug_ratio][ratio][i][artifact_names[artifact]][model]['y_pred'] = y_pred
                    results[aug_ratio][ratio][i][artifact_names[artifact]][model]['accuracy'] = acc
                    results[aug_ratio][ratio][i][artifact_names[artifact]][model]['weighted_F2'] = F2
                    results[aug_ratio][ratio][i][artifact_names[artifact]][model]['sensitivity'] = sensitivity

                    if aug_ratio == 0:
                        y_true_dict[i][artifact_names[artifact]]["y_true"] = env.y_test
                        y_true_dict[i][artifact_names[artifact]]["ID_list"] = testID_list

                # new fold
                i += 1

            cross_val_time_end = time()
            cross_val_time = cross_val_time_end - cross_val_time_start
            print("The cross-validation for ratio " + str(ratio - 1) + " took "
                  + str(np.round(cross_val_time, 3)) + " seconds = "
                  + str(np.round(cross_val_time / 60, 3)) + " minutes")
            print('\n\n')

            results[aug_ratio][ratio]['time'] = cross_val_time

    #### saving data ####
    # Remember to change the name of the pickle when doing a new first_pilot
    if self.windowsOS:
        os.makedirs(self.pickle_path + r"\results\performance" + "\\" + experiment,
                    exist_ok=True)
        os.makedirs(self.pickle_path + r"\results\hyperopt" + "\\" + experiment,
                    exist_ok=True)
        os.makedirs(self.pickle_path + r"\results\y_true", exist_ok=True)

        SaveNumpyPickles(self.pickle_path + r"\results\performance" + "\\" + experiment,
                         r"\results" + experiment_name, results, self.windowsOS)
        SaveNumpyPickles(self.pickle_path + r"\results\hyperopt" + "\\" + experiment,
                         r"\ho_trials" + experiment_name, ho_trials, self.windowsOS)

        # Active pickles
        os.makedirs(self.pickle_path + r"\results\performance" + r"\active" + experiment,
                    exist_ok=True)
        os.makedirs(self.pickle_path + r"\results\hyperopt" + r"\active" + experiment,
                    exist_ok=True)
        SaveNumpyPickles(self.pickle_path + r"\results\performance" + r"\active" + experiment,
                         r"\resultsActive" + experiment_name, active_results, self.windowsOS)

        if save_y_true:
            SaveNumpyPickles(self.pickle_path + r"\results\y_true",
                             r"\y_true_" + str(K) + "fold_randomstate_" + str(random_state_val),
                             y_true_dict, self.windowsOS)
    else:
        os.makedirs(self.pickle_path + r"/results/performance" + "/" + experiment, exist_ok=True)
        os.makedirs(self.pickle_path + r"/results/hyperopt" + "/" + experiment, exist_ok=True)
        os.makedirs(self.pickle_path + r"/results/y_true", exist_ok=True)

        SaveNumpyPickles(self.pickle_path + r"/results/performance" + "/" + experiment,
                         r"/results" + experiment_name, results, windowsOS=self.windowsOS)
        SaveNumpyPickles(self.pickle_path + r"/results/hyperopt" + "/" + experiment,
                         r"/ho_trials" + experiment_name, ho_trials, windowsOS=self.windowsOS)

        # Active results
        os.makedirs(self.pickle_path + r"/results/performance" + "/active" + experiment,
                    exist_ok=True)
        os.makedirs(self.pickle_path + r"/results/hyperopt" + "/active" + experiment,
                    exist_ok=True)
        SaveNumpyPickles(self.pickle_path + r"/results/performance" + "/active" + experiment,
                         r"/resultsActive" + experiment_name, active_results, self.windowsOS)

        if save_y_true:
            # y_true_dict is passed as the data argument; it must not be concatenated
            # into the file name.
            SaveNumpyPickles(self.pickle_path + r"/results/y_true",
                             r"/y_true_" + str(K) + "fold_randomstate_" + str(random_state_val),
                             y_true_dict, windowsOS=self.windowsOS)
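
# A possible reading of the norm_grad_x_LR acquisition function used in the EMC branch
# above, which is not defined in this file. This is a sketch under the assumption that
# it scores each pool point by the expected norm of the per-sample logistic-regression
# gradient, with the expectation taken over the predicted label; the project's actual
# implementation may differ.
import numpy as np

def norm_grad_x_LR_sketch(weights, X_pool):
    """Expected gradient length for logistic regression; one score per pool row."""
    w = np.asarray(weights).ravel()[:X_pool.shape[1]]  # assumes weights align with the features
    p = 1.0 / (1.0 + np.exp(-X_pool @ w))              # predicted P(y = 1 | x)
    x_norm = np.linalg.norm(X_pool, axis=1)
    # E_y ||(sigma(w.x) - y) x|| = p(1-p)||x|| + (1-p)p||x|| = 2 p (1-p) ||x||
    return 2.0 * p * (1.0 - p) * x_norm
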
X_under, y_under = rand_undersample(X, y_arti, arg='majority', state=random_state_val, multi=False)

# extracting samples
X_under_test, y_under_test, indices = subsample(X_under, y_under, bal_n)
X_under = np.delete(X_under, indices, axis=0)
y_under = np.delete(y_under, indices, axis=0)

# running PCA
pca_under, Xstd_under = runPCA(X_under, y_under)

# finding standardizing values
Xmean_under, Xerr_under = np.mean(X_under, axis=0), np.std(X_under, axis=0)

# standardizing sample
X_under_test = (X_under_test - Xmean_under) / Xerr_under

# extracting samples
X_under_sample, y_under_sample, indices = subsample(X_under_test, y_under_test, subsample_n)

# SMOTE for this artifact
X_smote, y_smote = balanceData(X, y_arti, ratio=aug_smote_ratio, random_state_val=random_state_val)

# extracting samples
X_smote_test, y_smote_test, indices = subsample(X_smote, y_smote, bal_n)
X_smote = np.delete(X_smote, indices, axis=0)
y_smote = np.delete(y_smote, indices, axis=0)

# running PCA
pca_smote, Xstd_smote = runPCA(X_smote, y_smote)

# finding standardizing values
Xmean_smote, Xerr_smote = np.mean(X_smote, axis=0), np.std(X_smote, axis=0)

# standardizing sample
X_smote_test = (X_smote_test - Xmean_smote) / Xerr_smote

# extracting samples
X_smote_sample, y_smote_sample, indices = subsample(X_smote_test, y_smote_test, subsample_n)

### Explained variance ###
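
# A minimal sketch of the explained-variance inspection this header presumably leads
# into, assuming runPCA returns a fitted sklearn.decomposition.PCA object as its first
# return value (an assumption; only the call sites above are shown).
import matplotlib.pyplot as plt

for label, pca in [('undersampled', pca_under), ('SMOTE', pca_smote)]:
    cum_var = np.cumsum(pca.explained_variance_ratio_)  # cumulative variance per component
    plt.plot(cum_var, label=label)
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance ratio')
plt.legend()
plt.show()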