def doubling():
    k, t = ranks[0], times[0]
    counter, self.best = 0, 0
    while time.time() - start < self.runtime_limit - t:
        if verbose:
            print('Fitting with k={}, t={}'.format(k, t))
        t0 = time.time()
        if self.build_ensemble:
            self.ensemble = Ensemble(self.p_type, self.ensemble_method,
                                     self.stacking_hyperparams)
        else:
            self.ensemble = Model_collection(self.p_type)
        self._fit(x_tr, y_tr, rank=k, runtime_limit=t)
        if self.build_ensemble:
            loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)

            # TEMPORARY: Record intermediate results
            e_hat.append(np.copy(self.new_row))
            actual_times.append(time.time() - start)
            sampled.append(self.sampled_indices)
            ensembles.append(self.ensemble)

            losses.append(loss)
            if loss == min(losses):
                ranks.append(k + 1)
                self.best = counter
            else:
                ranks.append(k)
        times.append(2 * t)
        k = ranks[-1]
        t = times[-1]
        counter += 1
def build_ensemble(model_dims,
                   paths=["models/model-20.pkl",
                          "models_2009/model-20.pkl",
                          "models_2010/model-20.pkl",
                          "models_2011/model-20.pkl"]):
    # build the model ensemble given the model architecture and trained models
    models = []
    for path in paths:
        net = RetailModel(model_dims)
        net.load_state_dict(torch.load(path))
        models.append(net)
    ensemble = Ensemble(models)
    return ensemble
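# The Ensemble class imported by build_ensemble() is not shown in this snippet.
# A minimal sketch of a prediction-averaging wrapper that is consistent with how
# it is constructed (Ensemble(models)) and then called like a single model might
# look as follows; the class name and the plain output averaging are assumptions
# for illustration only, not the original implementation.
import torch
import torch.nn as nn


class AveragingEnsemble(nn.Module):
    """Hypothetical stand-in for Ensemble: run every member and average outputs."""

    def __init__(self, models):
        super().__init__()
        self.models = nn.ModuleList(models)

    def forward(self, x):
        outputs = [model(x) for model in self.models]
        return torch.stack(outputs).mean(dim=0)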
def train(CONFIG):
    # create result folder and save config as txt file
    t = time.strftime('%Y_%m_%d_%H_%M_%S')
    results_dir = os.path.join(CONFIG['SAVE_PATH'], t)
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)
    with open(os.path.join(results_dir, 'Settings.txt'), 'w') as file:
        file.write(json.dumps(CONFIG))

    # create train dataset
    train_dataset = generate_stem_dataset(CONFIG['DATA_PATH'],
                                          CONFIG['INPUT_SIZE'],
                                          CONFIG['DATA_AUGMENTATION'])

    # split train dataset for 5-fold stratified cross validation
    kf = model_selection.KFold(n_splits=5, shuffle=True)
    for fold_num, (train_index, test_index) in enumerate(kf.split(train_dataset)):
        # create sub_train & sub_test datasets
        train_subset = torch.utils.data.Subset(train_dataset, train_index)
        test_subset = torch.utils.data.Subset(train_dataset, test_index)

        # define dynamic weighted resampler
        train_targets = [item[1] for item in train_subset]
        weighted_sampler = ScheduledWeightedSampler(len(train_subset), train_targets, True)

        # create dataloaders
        train_loader = DataLoader(train_subset,
                                  batch_size=CONFIG['BATCH_SIZE'],
                                  sampler=weighted_sampler,
                                  num_workers=CONFIG['NUM_WORKERS'],
                                  drop_last=False)
        test_loader = DataLoader(test_subset,
                                 batch_size=CONFIG['BATCH_SIZE'],
                                 num_workers=CONFIG['NUM_WORKERS'],
                                 shuffle=False)

        # define model
        m1 = AlexNetDR()
        m2 = GoogleNetDR()
        m1 = load_pretrain_param_alexnet(m1)
        m2 = load_pretrain_param_googlenet(m2)
        model = Ensemble(m1, m2)
        model = model.cuda(CONFIG['NUM_GPU'])

        # load pretrained weights
        if CONFIG['PRETRAINED_PATH']:
            checkpoint = torch.load(CONFIG['PRETRAINED_PATH'])
            model.load_state_dict(checkpoint)

        # define loss and optimizer
        if CONFIG['LOSS_FUNC'] == 'CrossEntropyLoss':
            criterion = nn.CrossEntropyLoss()
        elif CONFIG['LOSS_FUNC'] == 'MSELoss':
            criterion = nn.MSELoss()
        else:
            raise NotImplementedError
        if CONFIG['OPTIMIZER'] == 'SGD':
            optimizer = torch.optim.SGD(model.parameters(),
                                        lr=CONFIG['LEARNING_RATE'],
                                        momentum=CONFIG['MOMENTUM'],
                                        nesterov=True,
                                        weight_decay=CONFIG['WEIGHT_DECAY'])
        elif CONFIG['OPTIMIZER'] == 'ADAM':
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=CONFIG['LEARNING_RATE'],
                                         betas=CONFIG['BETAS'],
                                         eps=CONFIG['EPS'],
                                         weight_decay=CONFIG['WEIGHT_DECAY'])
        else:
            raise NotImplementedError

        # learning rate decay
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=CONFIG['MILESTONES'], gamma=CONFIG['GAMMA'])

        # train
        max_kappa = 0
        record_epochs, accs, losses, kappa_per_fold = [], [], [], []
        for epoch in range(1, CONFIG['EPOCHS'] + 1):
            # resampling weight update
            if weighted_sampler:
                weighted_sampler.step()

            # learning rate update
            if lr_scheduler:
                lr_scheduler.step()
                if epoch in lr_scheduler.milestones:
                    print_msg('Learning rate decayed to {}'.format(lr_scheduler.get_lr()[0]))

            epoch_loss = 0
            correct = 0
            total = 0
            progress = tqdm(enumerate(train_loader))
            for step, train_data in progress:
                X, y = train_data
                # X.dtype is torch.float32, y.dtype is torch.int64
                X, y = X.cuda(CONFIG['NUM_GPU']), y.float().cuda(CONFIG['NUM_GPU'])

                # forward
                y_pred = model(X)
                y_one_hot = torch.zeros(y.shape[0], 5).cuda(CONFIG['NUM_GPU'])
                y_one_hot[range(y_one_hot.shape[0]), y.to(dtype=torch.int64)] = 1
                loss = criterion(y_pred, y_one_hot)

                # backward
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # metrics
                epoch_loss += loss.item()
                total += y.size(0)
                correct += accuracy(torch.argmax(y_pred, dim=1), y,
                                    method='5_class_vec_output') * y.size(0)
                avg_loss = epoch_loss / (step + 1)
                avg_acc = correct / total
                progress.set_description(
                    'Fold {} Epoch: {}/{}, loss: {:.6f}, acc: {:.4f}'.format(
                        fold_num + 1, epoch, CONFIG['EPOCHS'], avg_loss, avg_acc))

            # save model and kappa score & confusion matrix
            acc, c_matrix, kappa, all_pred = _eval(model, test_loader, CONFIG)
            print('validation accuracy: {}, kappa: {}'.format(acc, kappa))
            if kappa > max_kappa:
                torch.save(model.state_dict(),
                           results_dir + '/fold' + str(fold_num + 1) + '_best_kappa.pth')
                max_kappa = kappa
                print_msg('Fold {} of 5. Best kappa model saved at {}'.format(
                    fold_num + 1, results_dir))
                print_msg('Fold ' + str(fold_num + 1) +
                          ' of 5. Confusion matrix with best kappa is:\n', c_matrix)
                # ks_dataframe = pd.DataFrame({'file_name': [sampler[0] for sampler in test_dataset.samples],
                #                              'truth': [sampler[1] for sampler in test_dataset.samples],
                #                              'prediction': list(all_pred),
                #                              'kappa_score': ''})
                # ks_dataframe.at[0, 'kappa_score'] = kappa
                # ks_dataframe.to_csv(os.path.join(results_dir, 'test_kappa_score.csv'), index=False, sep=',')
                np.savetxt(os.path.join(results_dir,
                                        'fold' + str(fold_num + 1) + '_confusion_matrix.csv'),
                           np.array(c_matrix), delimiter=',')
                with open(os.path.join(results_dir,
                                       'fold' + str(fold_num + 1) + '_kappa_score.txt'), 'w') as f:
                    f.write('Best kappa: {}'.format(kappa))

            # record
            record_epochs.append(epoch)
            accs.append(acc)
            losses.append(avg_loss)

        kappa_per_fold.append(max_kappa)

    print('\nBest validation kappa score for fold 1 to 5:\n {}'.format(kappa_per_fold))
    return record_epochs, accs, losses
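# train() treats Ensemble(m1, m2) as a single 5-class classifier built from an
# AlexNet-based and a GoogLeNet-based backbone. Below is a minimal sketch of such
# a two-model ensemble module; the fusion rule (averaging the two score vectors)
# is an assumption, since the real Ensemble implementation is not shown here.
import torch.nn as nn


class TwoModelEnsemble(nn.Module):
    """Hypothetical sketch of Ensemble(m1, m2): average the two model outputs."""

    def __init__(self, m1, m2):
        super().__init__()
        self.m1 = m1
        self.m2 = m2

    def forward(self, x):
        return (self.m1(x) + self.m2(x)) / 2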
def __init__(self, p_type='classification', algorithms=None, hyperparameters=None, verbose=False, n_cores=mp.cpu_count(), runtime_limit=512, dataset_ratio_threshold=100, selection_method='min_variance', scalarization='D', error_matrix=None, runtime_matrix=None, new_row=None, build_ensemble=True, ensemble_method='greedy', runtime_predictor='KNeighborsRegressor', solver='scipy', **stacking_hyperparams): # TODO: check if arguments to constructor are valid; set to defaults if not specified assert selection_method in {'qr', 'min_variance', 'random'}, "The method to select entries to sample must be " \ "either qr (QR decomposition), min_variance (minimize variance with time constraints), or random (time-constrained random selection, for testing purpose)." with open(os.path.join(DEFAULTS, p_type + '.json')) as file: defaults = json.load(file) # attributes of ML problem self.p_type = p_type.lower() self.algorithms = algorithms or defaults['algorithms'] self.hyperparameters = hyperparameters or defaults['hyperparameters'] self.verbose = verbose # computational considerations self.n_cores = n_cores self.runtime_limit = runtime_limit # sample column selection self.selection_method = selection_method self.scalarization = scalarization # error matrix attributes # TODO: determine whether to generate new error matrix or use default/subset of default self.error_matrix = util.extract_columns( ERROR_MATRIX, self.algorithms, self.hyperparameters) if error_matrix is None else error_matrix self.runtime_matrix = util.extract_columns( RUNTIME_MATRIX, self.algorithms, self.hyperparameters) if runtime_matrix is None else runtime_matrix assert util.check_dataframes(self.error_matrix, self.runtime_matrix) self.column_headings = np.array( [eval(heading) for heading in list(self.error_matrix)]) self.X, self.Y, _ = linalg.pca(self.error_matrix.values, rank=min(self.error_matrix.shape) - 1) # sampled & fitted models self.new_row = new_row or np.zeros((1, self.error_matrix.shape[1])) self.sampled_indices = set() self.sampled_models = [None] * self.error_matrix.shape[1] self.fitted_indices = set() self.fitted_models = [None] * self.error_matrix.shape[1] # ensemble attributes self.build_ensemble = build_ensemble self.ensemble_method = ensemble_method self.stacking_hyperparams = stacking_hyperparams if self.build_ensemble: self.ensemble = Ensemble(self.p_type, self.ensemble_method, self.stacking_hyperparams) else: self.ensemble = Model_collection(self.p_type) # runtime predictor self.runtime_predictor = runtime_predictor self.dataset_ratio_threshold = dataset_ratio_threshold
class AutoLearner: """An object representing an automatically tuned machine learning model. Attributes: p_type (str): Problem type. One of {'classification', 'regression'}. algorithms (list): A list of algorithm types to be considered, in strings. (e.g. ['KNN', 'lSVM', 'kSVM']). hyperparameters (dict): A nested dict of hyperparameters to be considered; see above for example. n_cores (int): Maximum number of cores over which to parallelize (None means no limit). verbose (bool): Whether or not to generate print statements when a model finishes fitting. stacking_alg (str): Algorithm type to use for stacked learner. **stacking_hyperparams (dict): Hyperparameter settings of stacked learner. """ def __init__(self, p_type, algorithms=None, hyperparameters=None, n_cores=None, verbose=False, stacking_alg='Logit', **stacking_hyperparams): # check if arguments to constructor are valid; set to defaults if not specified default, new = util.check_arguments(p_type, algorithms, hyperparameters) self.p_type = p_type.lower() self.algorithms = algorithms self.hyperparameters = hyperparameters self.n_cores = n_cores self.verbose = verbose if len(new) > 0: # if selected hyperparameters contain model configurations not included in default proceed = input( "Your selected hyperparameters contain some not included in the default error matrix. \n" "Do you want to generate your own error matrix? [yes/no]") if proceed == 'yes': subprocess.call(['./generate_matrix.sh']) # TODO: load newly generated error matrix file else: return else: # use default error matrix (or subset of) path = pkg_resources.resource_filename( __name__, 'defaults/error_matrix.csv') default_error_matrix = pd.read_csv(path, index_col=0) column_headings = np.array( [eval(heading) for heading in list(default_error_matrix)]) selected_indices = np.array( [heading in column_headings for heading in default]) self.error_matrix = default_error_matrix.values[:, selected_indices] self.column_headings = sorted(default, key=lambda d: d['algorithm']) self.ensemble = Ensemble(self.p_type, stacking_alg, **stacking_hyperparams) self.optimized_settings = [] self.new_row = None def fit(self, x_train, y_train): """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the new dataset, predict performance on the rest, then perform Bayesian optimization and construct an optimal ensemble model. Args: x_train (np.ndarray): Features of the training dataset. y_train (np.ndarray): Labels of the training dataset. """ print('Data={}'.format(x_train.shape)) self.new_row = np.zeros((1, self.error_matrix.shape[1])) known_indices = linalg.pivot_columns(self.error_matrix) print('Sampling {} entries of new row...'.format(len(known_indices))) pool1 = mp.Pool(self.n_cores) sample_models = [ Model(self.p_type, self.column_headings[i]['algorithm'], self.column_headings[i]['hyperparameters'], verbose=self.verbose) for i in known_indices ] sample_model_errors = [ pool1.apply_async(Model.kfold_fit_validate, args=[m, x_train, y_train, 5]) for m in sample_models ] pool1.close() pool1.join() for i, error in enumerate(sample_model_errors): self.new_row[:, known_indices[i]] = error.get()[0].mean() # TODO: add predictions to second layer matrix? self.new_row = linalg.impute(self.error_matrix, self.new_row, known_indices) # Add new row to error matrix at the end (might be incorrect?) # self.error_matrix = np.vstack((self.error_matrix, self.new_row)) # TODO: Fit ensemble candidates (?) 
if self.verbose: print('\nConducting Bayesian optimization...') n_models = 3 pool2 = Pool(self.n_cores) bayesian_opt_models = [ Model(self.p_type, self.column_headings[i]['algorithm'], self.column_headings[i]['hyperparameters'], verbose=self.verbose) for i in np.argsort(self.new_row.flatten())[:n_models] ] optimized_hyperparams = pool2.map(Model.bayesian_optimize, bayesian_opt_models) pool2.close() pool2.join() for i, params in enumerate(optimized_hyperparams): bayesian_opt_models[i].hyperparameters = params self.ensemble.add_base_learner(bayesian_opt_models[i]) self.optimized_settings.append({ 'algorithm': bayesian_opt_models[i].algorithm, 'hyperparameters': bayesian_opt_models[i].hyperparameters }) if self.verbose: print('\nFitting optimized ensemble...') self.ensemble.fit(x_train, y_train) self.ensemble.fitted = True if self.verbose: print('\nAutoLearner fitting complete.') def refit(self, x_train, y_train): """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal. Args: x_train (np.ndarray): Features of the training dataset. y_train (np.ndarray): Labels of the training dataset. """ assert self.ensemble.fitted, "Cannot refit unless model has been fit." self.ensemble.fit(x_train, y_train) def predict(self, x_test): """Generate predictions on test data. Args: x_test (np.ndarray): Features of the test dataset. """ return self.ensemble.predict(x_test)
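# A typical call sequence for the AutoLearner defined above, using its public
# fit/predict interface. The scikit-learn dataset and the chosen defaults are
# illustrative only; they are not part of the original code.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

learner = AutoLearner('classification', verbose=True)
learner.fit(x_train, y_train)      # sample the error-matrix row, impute, optimize, build ensemble
y_pred = learner.predict(x_test)
print('test error:', np.mean(y_pred != y_test))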
def predict(args):
    import data_loader
    from model import DenseNet103, Ensemble
    import time

    os.makedirs(args.output_path, exist_ok=True)

    # Get image names
    img_files = sorted(glob.glob(os.path.join(args.input_image_dir, args.image_glob)))
    # basenames w/o extensions
    img_names = [osp.splitext(osp.basename(x))[0] for x in img_files]

    # Load models and build ensemble
    models = []
    for m in [args.model_weights]:
        mdl = DenseNet103(n_classes=args.n_classes)
        mdl.load_state_dict(torch.load(m, map_location='cpu'))
        if torch.cuda.is_available():
            mdl = mdl.cuda()
            print('Moved model to CUDA compute device.')
        else:
            print('No CUDA device available. Using CPU.')
        mdl.train(False)
        models.append(mdl)
    ens = Ensemble(models)
    print('Models loaded.')

    if args.transform.lower() == 'crop512raw':
        transform_train = data_loader.crop512raw
        transform_test_pre = None
        transform_test_post = data_loader.predcrop512
    elif args.transform.lower() == 'crop512':
        transform_train = data_loader.crop512
        transform_test_pre = data_loader.predcrop512_pre
        transform_test_post = data_loader.predcrop512
    elif args.transform.lower() == 'custom':
        transform_train = build_transform(resize=args.transform_resize,
                                          crop_sz=args.transform_crop_sz,
                                          crop_type='random',
                                          flips=True,
                                          method='scale_center',
                                          to_tensor=True)
        transform_test_pre = build_transform(resize=args.transform_resize,
                                             crop_sz=None,
                                             crop_type=None,
                                             flips=False,
                                             method='scale_center',
                                             to_tensor=False)
        transform_test_post = build_transform(resize=None,
                                              crop_sz=None,
                                              crop_type=None,
                                              method='scale_center',
                                              flips=False,
                                              to_tensor=True)
    else:
        raise ValueError('`transform` argument `%s` is invalid.' % args.transform)

    pl = data_loader.PredCropLoader(args.input_image_dir,
                                    transform_pre=transform_test_pre,
                                    transform_post=transform_test_post,
                                    dtype='uint16',
                                    img_regex=args.image_glob,
                                    n_windows=4)
    sm = torch.nn.Softmax2d()

    # Segment images
    print('Segmenting...')
    times = []
    for i in range(len(pl)):
        start = time.time()
        samples = pl[i]
        print('Number of samples %d' % len(samples))
        outs = []  # np.ndarrays of softmax probs
        with torch.no_grad():
            for d in samples:
                input_panel = d['image'].unsqueeze(0)  # add empty batch
                print('Input panel size ', input_panel.size())
                if torch.cuda.is_available():
                    input_panel = input_panel.cuda()
                input_panel.requires_grad = False
                output = ens(input_panel)
                probs = sm(output)
                probs = probs.cpu().data.numpy()  # unpack to numpy array
                outs.append(probs)

        # call a mask for each set of probs
        mask_panels = []
        for sm_panel in outs:
            mask = post_process(sm_panel,
                                min_sm_prob=1. / args.n_classes,
                                sz_min=200,
                                erod=None,
                                dil=None)
            mask_panels.append(mask)

        # reconstruct total mask from masks
        mask_sz = mask_panels[0].shape
        n_per_side = int(np.sqrt(len(mask_panels)))
        total_mask = np.zeros((mask_sz[0] * n_per_side, mask_sz[1] * n_per_side))
        for j in range(len(mask_panels)):
            total_mask[(j // n_per_side) * mask_sz[0]:((j // n_per_side) + 1) * mask_sz[0],
                       (j % n_per_side) * mask_sz[1]:((j % n_per_side) + 1) * mask_sz[1]] = mask_panels[j]

        print('Upsampling size: ', tuple(args.upsamp_sz))
        maskR = imresize(total_mask.astype('uint8'), tuple(args.upsamp_sz), interp='nearest')  # upsample
        maskR = maskR.astype('bool').astype('uint8')

        # save upsampled mask
        imsave(os.path.join(args.output_path, img_names[i] + args.mask_suffix + '.png'),
               maskR * 255)
        print('Processed ', img_names[i])
        end = time.time()
        times.append(end - start)

    print('Average image processing time : ', np.mean(times))
    return
class AutoLearner: """An object representing an automatically tuned machine learning model. Attributes: p_type (str): Problem type. One of {'classification', 'regression'}. algorithms (list): A list of algorithm types to be considered, in strings. (e.g. ['KNN', 'lSVM']). hyperparameters (dict): A nested dict of hyperparameters to be considered; see above for example. verbose (bool): Whether or not to generate print statements when a model finishes fitting. n_cores (int): Maximum number of cores over which to parallelize (None means no limit). runtime_limit(int): Maximum training time for AutoLearner (powers of 2 preferred). selection_method (str): Method of selecting entries of new row to sample. scalarization (str): Scalarization of objective for min-variance selection. Either 'D' or 'A'. error_matrix (DataFrame): Error matrix to use for imputation; includes index and headers. runtime_matrix (DataFrame): Runtime matrix to use for runtime prediction; includes index and headers. column_headings (list): Column headings of error/runtime matrices; list of dicts. X, Y (np.ndarray): PCA decomposition of error matrix. new_row (np.ndarray): Predicted row of error matrix. sampled_indices (set): Indices of new row that have been sampled. sampled_models (list): List of models that have been sampled (i.e. k-fold fitted). fitted_indices (set): Indices of new row that have been fitted (i.e. included in ensemble) fitted_models (list): List of models that have been fitted. stacking_alg (str): Algorithm type to use for stacked learner. """ def __init__(self, p_type, algorithms=None, hyperparameters=None, verbose=False, n_cores=mp.cpu_count(), runtime_limit=512, selection_method='min_variance', scalarization='D', error_matrix=None, runtime_matrix=None, stacking_alg='greedy', **stacking_hyperparams): # TODO: check if arguments to constructor are valid; set to defaults if not specified assert selection_method in {'qr', 'min_variance'}, "The method to select entries to sample must be " \ "either qr (QR decomposition) or min_variance (minimize variance with time constraints)." 
with open(os.path.join(DEFAULTS, p_type + '.json')) as file: defaults = json.load(file) # attributes of ML problem self.p_type = p_type.lower() self.algorithms = algorithms or defaults['algorithms'] self.hyperparameters = hyperparameters or defaults['hyperparameters'] self.verbose = verbose # computational considerations self.n_cores = n_cores self.runtime_limit = runtime_limit # sample column selection self.selection_method = selection_method self.scalarization = scalarization # error matrix attributes # TODO: determine whether to generate new error matrix or use default/subset of default self.error_matrix = ERROR_MATRIX if error_matrix is None else error_matrix self.runtime_matrix = RUNTIME_MATRIX if runtime_matrix is None else runtime_matrix assert util.check_dataframes(self.error_matrix, self.runtime_matrix) self.column_headings = np.array( [eval(heading) for heading in list(self.error_matrix)]) self.X, self.Y, _ = linalg.pca(self.error_matrix.values, rank=min(self.error_matrix.shape) - 1) # sampled & fitted models self.new_row = np.zeros((1, self.error_matrix.shape[1])) self.sampled_indices = set() self.sampled_models = [None] * self.error_matrix.shape[1] self.fitted_indices = set() self.fitted_models = [None] * self.error_matrix.shape[1] # ensemble attributes self.stacking_alg = stacking_alg self.stacking_hyperparams = stacking_hyperparams self.ensemble = Ensemble(self.p_type, self.stacking_alg, self.stacking_hyperparams) def fit(self, x_train, y_train, rank=None, runtime_limit=None): """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the new dataset, predict performance on the rest, then construct an optimal ensemble model. Args: x_train (np.ndarray): Features of the training dataset. y_train (np.ndarray): Labels of the training dataset. rank (int): Rank of error matrix factorization. runtime_limit (float): Maximum time to allocate to AutoLearner fitting. """ # set to defaults if not provided rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01) runtime_limit = runtime_limit or self.runtime_limit if self.verbose: print('Fitting AutoLearner with max. runtime {}s'.format( runtime_limit)) t_predicted = convex_opt.predict_runtime( x_train.shape, runtime_matrix=self.runtime_matrix) t0 = time.time() while time.time() - t0 < runtime_limit / 2: # set of algorithms that are predicted to run in given budget options = np.where(t_predicted <= runtime_limit / 2 - (time.time() - t0))[0] # remove algorithms that have been sampled already options = list(set(options) - self.sampled_indices) if len(options) == 0: if len(self.ensemble.candidate_learners) == 0: to_sample = np.argmin(t_predicted) else: break else: to_sample = np.random.choice(options) self.sampled_indices.add(to_sample) self.sampled_models[to_sample] = Model( self.p_type, self.column_headings[to_sample]['algorithm'], self.column_headings[to_sample]['hyperparameters'], self.verbose, to_sample) self.sampled_models[to_sample].kfold_fit_validate( x_train, y_train, 5) self.ensemble.candidate_learners.append( self.sampled_models[to_sample]) if self.verbose: print('\nFitting ensemble of max. 
size {}...'.format( len(self.ensemble.candidate_learners))) remaining = runtime_limit - (time.time() - t0) self.ensemble.fit(x_train, y_train, remaining, self.fitted_models) for model in self.ensemble.base_learners: assert model.index is not None self.fitted_indices.add(model.index) self.fitted_models[model.index] = model self.ensemble.fitted = True if self.verbose: print('\nAutoLearner fitting complete.') def fit_doubling(self, x_train, y_train, verbose=False): """Fit an AutoLearner object, iteratively doubling allowed runtime.""" t_predicted = convex_opt.predict_runtime(x_train.shape) # split data into training and validation sets try: x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.15, stratify=y_train, random_state=0) except ValueError: x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.15, random_state=0) ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)] t_init = 2**np.floor( np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum())) t_init = max(1, t_init) times = [t_init] losses = [1.0] e_hat, actual_times, sampled, ensembles = [], [], [], [] k, t = ranks[0], times[0] start = time.time() counter, best = 0, 0 while time.time() - start < self.runtime_limit - t: if verbose: print('Fitting with k={}, t={}'.format(k, t)) t0 = time.time() self.ensemble = Ensemble(self.p_type, self.stacking_alg, self.stacking_hyperparams) self.fit(x_tr, y_tr, rank=k, runtime_limit=t) loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type) # TEMPORARY: Record intermediate results e_hat.append(np.copy(self.new_row)) actual_times.append(time.time() - start) sampled.append(self.sampled_indices) ensembles.append(self.ensemble) losses.append(loss) if loss == min(losses): ranks.append(k + 1) best = counter else: ranks.append(k) times.append(2 * t) k = ranks[-1] t = times[-1] counter += 1 # after all iterations, restore best model self.new_row = e_hat[best] self.ensemble = ensembles[best] return { 'ranks': ranks[:-1], 'runtime_limits': times[:-1], 'validation_loss': losses, 'predicted_new_row': e_hat, 'actual_runtimes': actual_times, 'sampled_indices': sampled, 'models': ensembles } def refit(self, x_train, y_train): """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal. Args: x_train (np.ndarray): Features of the training dataset. y_train (np.ndarray): Labels of the training dataset. """ assert self.ensemble.fitted, "Cannot refit unless model has been fit." self.ensemble.fit(x_train, y_train) def predict(self, x_test): """Generate predictions on test data. Args: x_test (np.ndarray): Features of the test dataset. Returns: np.ndarray: Predicted labels. """ return self.ensemble.predict(x_test)
def multiple_models(data_hhb, data_hbo2, data_na, data_ca, Ytrain, Xtest):
    print('Fit the models for ensemble')
    model1 = Ensemble()
    model2 = Ensemble()
    model3 = Ensemble()
    model4 = Ensemble()
    model1.fit(data_hhb, Ytrain.hhb)
    model2.fit(data_hbo2, Ytrain.hbo2)
    model3.fit(data_na, Ytrain.na)
    model4.fit(data_ca, Ytrain.ca)

    print('Predict the target values')
    tdata = DataSet(Xtest)
    tdata_hhb, tdata_hbo2, tdata_na, tdata_ca, _ = tdata.data_proccessing(False)
    hhb = model1.predict(tdata_hhb)
    hbo2 = model2.predict(tdata_hbo2)
    na = model3.predict(tdata_na)
    ca = model4.predict(tdata_ca)
    # undo the target transform on na: exp(na - 1.5) - 2 is the inverse of log(x + 2) + 1.5
    na = np.exp(na - 1.5) - 2
    return np.concatenate([hhb.reshape((-1, 1)),
                           hbo2.reshape((-1, 1)),
                           ca.reshape((-1, 1)),
                           na.reshape((-1, 1))], axis=1)
Classifier(processors2[0], head2), Classifier(processors2[1], frozenhead2), Classifier(processors2[2], frozenhead2), Classifier(processors2[3], frozenhead2) ] # testing sets loader = torch.utils.data.DataLoader(test_set, batch_size=2**31, shuffle=False) for test_images1, test_labels1, _ in load(loader, digits1): pass loader = torch.utils.data.DataLoader(test_set, batch_size=2**31, shuffle=False) for test_images2, test_labels2, _ in load(loader, digits2): pass # those two ensembles are utilized for predictions, not training ensemble1 = Ensemble(processors1, frozenhead1) ensemble2 = Ensemble(processors2, frozenhead2) # training loss_func = nn.BCEWithLogitsLoss() rounds = [] for digits, classifiers in [(digits1, classifiers1), (digits2, classifiers2)]: optimizers = [ torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) for model in classifiers ] for epoch in tqdm(range(3), desc="digits %s" % digits, total=3): loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True) for images, _, y_true in load(loader, digits): for optimizer, model in zip(optimizers, classifiers): y_pred = model(images)
def main(args: argparse.Namespace): if args.seed is not None: random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True cudnn.benchmark = True # Data loading code normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_transform = transforms.Compose([ ResizeImage(256), transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize ]) val_tranform = transforms.Compose([ ResizeImage(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize ]) a, b, c = args.n_share, args.n_source_private, args.n_total common_classes = [i for i in range(a)] source_private_classes = [i + a for i in range(b)] target_private_classes = [i + a + b for i in range(c - a - b)] source_classes = common_classes + source_private_classes target_classes = common_classes + target_private_classes dataset = datasets.Office31 train_source_dataset = dataset(root=args.root, data_list_file=args.source, filter_class=source_classes, transform=train_transform) train_source_loader = DataLoader(train_source_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, drop_last=True) train_target_dataset = dataset(root=args.root, data_list_file=args.target, filter_class=target_classes, transform=train_transform) train_target_loader = DataLoader(train_target_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, drop_last=True) val_dataset = dataset(root=args.root, data_list_file=args.target, filter_class=target_classes, transform=val_tranform) val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers) test_loader = val_loader train_source_iter = ForeverDataIterator(train_source_loader) train_target_iter = ForeverDataIterator(train_target_loader) esem_iter1, esem_iter2, esem_iter3, esem_iter4, esem_iter5 = esem_dataloader( args, source_classes) # create model backbone = resnet50(pretrained=True) classifier = ImageClassifier(backbone, train_source_dataset.num_classes).to(device) domain_discri = DomainDiscriminator(in_feature=classifier.features_dim, hidden_size=1024).to(device) esem = Ensemble(classifier.features_dim, train_source_dataset.num_classes).to(device) # define optimizer and lr scheduler optimizer = SGD(classifier.get_parameters() + domain_discri.get_parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True) lr_scheduler = StepwiseLR(optimizer, init_lr=args.lr, gamma=0.001, decay_rate=0.75) optimizer_esem = SGD(esem.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True) lr_scheduler1 = StepwiseLR(optimizer_esem, init_lr=args.lr, gamma=0.001, decay_rate=0.75) lr_scheduler2 = StepwiseLR(optimizer_esem, init_lr=args.lr, gamma=0.001, decay_rate=0.75) lr_scheduler3 = StepwiseLR(optimizer_esem, init_lr=args.lr, gamma=0.001, decay_rate=0.75) lr_scheduler4 = StepwiseLR(optimizer_esem, init_lr=args.lr, gamma=0.001, decay_rate=0.75) lr_scheduler5 = StepwiseLR(optimizer_esem, init_lr=args.lr, gamma=0.001, decay_rate=0.75) optimizer_pre = SGD(esem.get_parameters() + classifier.get_parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True) # define loss function domain_adv = DomainAdversarialLoss(domain_discri, reduction='none').to(device) pretrain(esem_iter1, esem_iter2, esem_iter3, esem_iter4, esem_iter5, classifier, esem, optimizer_pre, args) # start training best_acc1 = 0. 
for epoch in range(args.epochs): # train for one epoch train_esem(esem_iter1, classifier, esem, optimizer_esem, lr_scheduler1, epoch, args, index=1) train_esem(esem_iter2, classifier, esem, optimizer_esem, lr_scheduler2, epoch, args, index=2) train_esem(esem_iter3, classifier, esem, optimizer_esem, lr_scheduler3, epoch, args, index=3) train_esem(esem_iter4, classifier, esem, optimizer_esem, lr_scheduler4, epoch, args, index=4) train_esem(esem_iter5, classifier, esem, optimizer_esem, lr_scheduler5, epoch, args, index=5) source_class_weight = evaluate_source_common(val_loader, classifier, esem, source_classes, args) train(train_source_iter, train_target_iter, classifier, domain_adv, esem, optimizer, lr_scheduler, epoch, source_class_weight, args) # evaluate on validation set acc1 = validate(val_loader, classifier, esem, source_classes, args) # remember best acc@1 and save checkpoint if acc1 > best_acc1: best_model = copy.deepcopy(classifier.state_dict()) best_acc1 = max(acc1, best_acc1) print("best_acc1 = {:3.3f}".format(best_acc1)) # evaluate on test set classifier.load_state_dict(best_model) acc1 = validate(test_loader, classifier, esem, source_classes, args) print("test_acc1 = {:3.3f}".format(acc1))
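# Ensemble(classifier.features_dim, num_classes) above is used as a set of
# auxiliary classifier heads over the shared feature space (one head per
# esem_iter split). The sketch below only illustrates that interface; the real
# module, its head count, and its fusion rule are assumptions and may differ.
import torch
import torch.nn as nn


class EnsembleHeads(nn.Module):
    """Hypothetical sketch: several linear heads over the same features."""

    def __init__(self, features_dim, num_classes, n_heads=5):
        super().__init__()
        self.heads = nn.ModuleList(
            [nn.Linear(features_dim, num_classes) for _ in range(n_heads)])

    def forward(self, features, index=None):
        if index is not None:
            # train or evaluate a single head, selected by 1-based index
            return self.heads[index - 1](features)
        # otherwise average the per-head class probabilities
        probs = [head(features).softmax(dim=1) for head in self.heads]
        return torch.stack(probs).mean(dim=0)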
class AutoLearner: """An object representing an automatically tuned machine learning model. Attributes: p_type (str): Problem type. One of {'classification', 'regression'}. algorithms (list): A list of algorithm types to be considered, in strings. (e.g. ['KNN', 'lSVM']). hyperparameters (dict): A nested dict of hyperparameters to be considered; see above for example. verbose (bool): Whether or not to generate print statements when a model finishes fitting. n_cores (int): Maximum number of cores over which to parallelize (None means no limit). runtime_limit(int): Maximum training time for AutoLearner (powers of 2 preferred). selection_method (str): Method of selecting entries of new row to sample. scalarization (str): Scalarization of objective for min-variance selection. Either 'D' or 'A'. error_matrix (DataFrame): Error matrix to use for imputation; includes index and headers. runtime_matrix (DataFrame): Runtime matrix to use for runtime prediction; includes index and headers. column_headings (list): Column headings of error/runtime matrices; list of dicts. X, Y (np.ndarray): PCA decomposition of error matrix. new_row (np.ndarray): Predicted row of error matrix. sampled_indices (set): Indices of new row that have been sampled. sampled_models (list): List of models that have been sampled (i.e. k-fold fitted). fitted_indices (set): Indices of new row that have been fitted (i.e. included in ensemble) fitted_models (list): List of models that have been fitted. stacking_alg (str): Algorithm type to use for stacked learner. """ def __init__(self, p_type, algorithms=None, hyperparameters=None, verbose=False, n_cores=mp.cpu_count(), runtime_limit=512, selection_method='min_variance', scalarization='D', error_matrix=None, runtime_matrix=None, stacking_alg='greedy', **stacking_hyperparams): # TODO: check if arguments to constructor are valid; set to defaults if not specified assert selection_method in {'qr', 'min_variance'}, "The method to select entries to sample must be " \ "either qr (QR decomposition) or min_variance (minimize variance with time constraints)." 
with open(os.path.join(DEFAULTS, p_type + '.json')) as file: defaults = json.load(file) # attributes of ML problem self.p_type = p_type.lower() self.algorithms = algorithms or defaults['algorithms'] self.hyperparameters = hyperparameters or defaults['hyperparameters'] self.verbose = verbose # computational considerations self.n_cores = n_cores self.runtime_limit = runtime_limit # sample column selection self.selection_method = selection_method self.scalarization = scalarization # error matrix attributes # TODO: determine whether to generate new error matrix or use default/subset of default self.error_matrix = ERROR_MATRIX if error_matrix is None else error_matrix self.runtime_matrix = RUNTIME_MATRIX if runtime_matrix is None else runtime_matrix assert util.check_dataframes(self.error_matrix, self.runtime_matrix) self.column_headings = np.array( [eval(heading) for heading in list(self.error_matrix)]) self.X, self.Y, _ = linalg.pca(self.error_matrix.values, rank=min(self.error_matrix.shape) - 1) # sampled & fitted models self.new_row = np.zeros((1, self.error_matrix.shape[1])) self.sampled_indices = set() self.sampled_models = [None] * self.error_matrix.shape[1] self.fitted_indices = set() self.fitted_models = [None] * self.error_matrix.shape[1] # ensemble attributes self.stacking_alg = stacking_alg self.stacking_hyperparams = stacking_hyperparams self.ensemble = Ensemble(self.p_type, self.stacking_alg, self.stacking_hyperparams) def fit(self, x_train, y_train, rank=None, runtime_limit=None): """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the new dataset, predict performance on the rest, then construct an optimal ensemble model. Args: x_train (np.ndarray): Features of the training dataset. y_train (np.ndarray): Labels of the training dataset. rank (int): Rank of error matrix factorization. runtime_limit (float): Maximum time to allocate to AutoLearner fitting. """ # set to defaults if not provided rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01) runtime_limit = runtime_limit or self.runtime_limit if self.verbose: print('Fitting AutoLearner with max. runtime {}s'.format( runtime_limit)) t_predicted = convex_opt.predict_runtime( x_train.shape, runtime_matrix=self.runtime_matrix) if self.selection_method == 'qr': to_sample = linalg.pivot_columns(self.error_matrix) elif self.selection_method == 'min_variance': # select algorithms to sample only from subset of algorithms that will run in allocated time valid = np.where( t_predicted <= self.n_cores * runtime_limit / 2)[0] Y = self.Y[:rank, valid] # TODO: check if Y is rank-deficient, i.e. will ED problem fail? 
v_opt = convex_opt.solve(t_predicted[valid], runtime_limit / 4, self.n_cores, Y, self.scalarization) to_sample = valid[np.where(v_opt > 0.9)[0]] if np.isnan(to_sample).any(): to_sample = np.argsort(t_predicted)[:rank] else: to_sample = np.arange(0, self.new_row.shape[1]) if len(to_sample) == 0 and len(self.sampled_indices) == 0: # if no columns are selected in first iteration (log det instability), sample n fastest columns n = len( np.where(np.cumsum(np.sort(t_predicted)) <= runtime_limit)[0]) to_sample = np.argsort(t_predicted)[:n] # only need to compute column entry if it has not been computed already to_sample = list(set(to_sample) - self.sampled_indices) if self.verbose: print('Sampling {} entries of new row...'.format(len(to_sample))) start = time.time() p1 = mp.Pool(self.n_cores) sample_models = [ Model(self.p_type, self.column_headings[i]['algorithm'], self.column_headings[i]['hyperparameters'], self.verbose, i) for i in to_sample ] sample_model_errors = [ p1.apply_async(Model.kfold_fit_validate, args=[m, x_train, y_train, 5]) for m in sample_models ] p1.close() p1.join() # update sampled indices self.sampled_indices = self.sampled_indices.union(set(to_sample)) for i, error in enumerate(sample_model_errors): cv_error, cv_predictions = error.get() sample_models[i].cv_error, sample_models[ i].cv_predictions = cv_error.mean(), cv_predictions sample_models[i].sampled = True self.new_row[:, to_sample[i]] = cv_error.mean() self.sampled_models[to_sample[i]] = sample_models[i] imputed = linalg.impute(self.error_matrix, self.new_row, list(self.sampled_indices), rank=rank) # impute ALL entries # unknown = sorted(list(set(range(self.new_row.shape[1])) - self.sampled_indices)) # self.new_row[:, unknown] = imputed[:, unknown] self.new_row = imputed # k-fold fit candidate learners of ensemble remaining = (runtime_limit - (time.time() - start)) * self.n_cores # add best sampled model to list of candidate learners to avoid empty lists best_sampled_idx = list(self.sampled_indices)[int( np.argmin(self.new_row[:, list(self.sampled_indices)]))] assert self.sampled_models[best_sampled_idx] is not None candidate_indices = [best_sampled_idx] self.ensemble.candidate_learners.append( self.sampled_models[best_sampled_idx]) for i in np.argsort(self.new_row[0]): if t_predicted[i] + t_predicted[candidate_indices].sum( ) <= remaining: last = candidate_indices.pop() assert last == best_sampled_idx candidate_indices.append(i) candidate_indices.append(last) # if model has already been k-fold fitted, immediately add to candidate learners if i in self.sampled_indices: assert self.sampled_models[i] is not None self.ensemble.candidate_learners.append( self.sampled_models[i]) # candidate learners that need to be k-fold fitted to_fit = list(set(candidate_indices) - self.sampled_indices) p2 = mp.Pool(self.n_cores) candidate_models = [ Model(self.p_type, self.column_headings[i]['algorithm'], self.column_headings[i]['hyperparameters'], self.verbose, i) for i in to_fit ] candidate_model_errors = [ p2.apply_async(Model.kfold_fit_validate, args=[m, x_train, y_train, 5]) for m in candidate_models ] p2.close() p2.join() # update sampled indices self.sampled_indices = self.sampled_indices.union(set(to_fit)) for i, error in enumerate(candidate_model_errors): cv_error, cv_predictions = error.get() candidate_models[i].cv_error, candidate_models[ i].cv_predictions = cv_error.mean(), cv_predictions candidate_models[i].sampled = True self.new_row[:, to_fit[i]] = cv_error.mean() self.sampled_models[to_fit[i]] = candidate_models[i] 
self.ensemble.candidate_learners.append(candidate_models[i]) # self.new_row = linalg.impute(self.error_matrix, self.new_row, list(self.sampled_indices), rank=rank) if self.verbose: print('\nFitting ensemble of max. size {}...'.format( len(self.ensemble.candidate_learners))) self.ensemble.fit(x_train, y_train, remaining, self.fitted_models) for model in self.ensemble.base_learners: assert model.index is not None self.fitted_indices.add(model.index) self.fitted_models[model.index] = model self.ensemble.fitted = True if self.verbose: print('\nAutoLearner fitting complete.') def fit_doubling(self, x_train, y_train, verbose=False): """Fit an AutoLearner object, iteratively doubling allowed runtime.""" t_predicted = convex_opt.predict_runtime(x_train.shape) # split data into training and validation sets try: x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.15, stratify=y_train, random_state=0) except ValueError: x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.15, random_state=0) ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)] t_init = 2**np.floor( np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum())) t_init = max(1, t_init) times = [t_init] losses = [1.0] e_hat, actual_times, sampled, ensembles = [], [], [], [] k, t = ranks[0], times[0] start = time.time() counter, best = 0, 0 while time.time() - start < self.runtime_limit - t: if verbose: print('Fitting with k={}, t={}'.format(k, t)) t0 = time.time() self.ensemble = Ensemble(self.p_type, self.stacking_alg, self.stacking_hyperparams) self.fit(x_tr, y_tr, rank=k, runtime_limit=t) loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type) # TEMPORARY: Record intermediate results e_hat.append(np.copy(self.new_row)) actual_times.append(time.time() - start) sampled.append(self.sampled_indices) ensembles.append(self.ensemble) losses.append(loss) if loss == min(losses): ranks.append(k + 1) best = counter else: ranks.append(k) times.append(2 * t) k = ranks[-1] t = times[-1] counter += 1 # after all iterations, restore best model self.new_row = e_hat[best] self.ensemble = ensembles[best] return { 'ranks': ranks[:-1], 'runtime_limits': times[:-1], 'validation_loss': losses, 'predicted_new_row': e_hat, 'actual_runtimes': actual_times, 'sampled_indices': sampled, 'models': ensembles } def refit(self, x_train, y_train): """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal. Args: x_train (np.ndarray): Features of the training dataset. y_train (np.ndarray): Labels of the training dataset. """ assert self.ensemble.fitted, "Cannot refit unless model has been fit." self.ensemble.fit(x_train, y_train) def predict(self, x_test): """Generate predictions on test data. Args: x_test (np.ndarray): Features of the test dataset. Returns: np.ndarray: Predicted labels. """ return self.ensemble.predict(x_test)
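# Sketch of how the history dictionary returned by fit_doubling() above can be
# inspected; x_train, y_train, and x_test are placeholders for the caller's data.
learner = AutoLearner('classification', runtime_limit=256, verbose=True)
history = learner.fit_doubling(x_train, y_train, verbose=True)

# one entry per doubling round: the rank and runtime budget used, plus the
# validation loss obtained (validation_loss additionally starts with 1.0)
for k, t, loss in zip(history['ranks'], history['runtime_limits'],
                      history['validation_loss'][1:]):
    print('rank={}, budget={}s, validation loss={:.4f}'.format(k, t, loss))

y_pred = learner.predict(x_test)   # predictions from the best ensemble found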