def fit(self, hyperparameter_config, pipeline_config, train_loader,
        valid_loader, network, optimizer, optimize_metric,
        additional_metrics, log_functions, budget, loss_function,
        training_techniques, fit_start_time, refit):
    """Train ``network`` for the given ``budget`` and report loss/info to BOHB.

    Builds a ``Trainer`` around the network, runs the epoch loop until a
    training technique (via the trainer callbacks) requests a stop, and
    finally delegates scoring to ``self.wrap_up_training``.

    Returns:
        dict -- ``{'loss': <float to minimize>, 'info': <final epoch log>}``
    """
    # Scope the sampled hyperparameters to this pipeline node's namespace.
    hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)
    logger = logging.getLogger('autonet')
    logger.debug("Start train. Budget: " + str(budget))
    if pipeline_config["torch_num_threads"] > 0:
        torch.set_num_threads(pipeline_config["torch_num_threads"])
    trainer = Trainer(
        model=network,
        loss_computation=self.batch_loss_computation_techniques[
            hyperparameter_config["batch_loss_computation_technique"]](),
        metrics=[optimize_metric] + additional_metrics,
        log_functions=log_functions,
        criterion=loss_function,
        budget=budget,
        optimizer=optimizer,
        training_techniques=training_techniques,
        device=Trainer.get_device(pipeline_config),
        logger=logger,
        full_eval_each_epoch=pipeline_config["full_eval_each_epoch"])
    trainer.prepare(pipeline_config, hyperparameter_config, fit_start_time)

    # Resume bookkeeping from the model's recorded state (supports refit).
    logs = trainer.model.logs
    epoch = trainer.model.epochs_trained
    training_start_time = time.time()
    while True:
        # prepare epoch
        log = dict()
        trainer.on_epoch_start(log=log, epoch=epoch)

        # training
        optimize_metric_results, train_loss, stop_training = trainer.train(
            epoch + 1, train_loader)
        if valid_loader is not None and trainer.eval_valid_each_epoch:
            valid_metric_results = trainer.evaluate(valid_loader)

        # evaluate: record loss and per-metric train/validation results
        log['loss'] = train_loss
        for i, metric in enumerate(trainer.metrics):
            log['train_' + metric.name] = optimize_metric_results[i]
            if valid_loader is not None and trainer.eval_valid_each_epoch:
                log['val_' + metric.name] = valid_metric_results[i]
        if trainer.eval_additional_logs_each_epoch:
            for additional_log in trainer.log_functions:
                log[additional_log.name] = additional_log(
                    trainer.model, epoch)

        # wrap up epoch: techniques may also request a stop here
        stop_training = trainer.on_epoch_end(log=log, epoch=epoch) or stop_training

        # handle logs
        # NOTE: the dict appended to `logs` keeps ndarray entries; the
        # rebound, filtered copy is used only for debug/tensorboard output.
        logs.append(log)
        log = {
            key: value
            for key, value in log.items() if not isinstance(value, np.ndarray)
        }
        logger.debug("Epoch: " + str(epoch) + " : " + str(log))
        if 'use_tensorboard_logger' in pipeline_config and pipeline_config[
                'use_tensorboard_logger']:
            self.tensorboard_log(budget=budget, epoch=epoch, log=log)
        if stop_training:
            break
        epoch += 1
        # Free cached GPU memory between epochs (no-op on CPU-only runs).
        torch.cuda.empty_cache()

    # wrap up: select/evaluate the final log and compute the reported loss
    loss, final_log = self.wrap_up_training(
        trainer=trainer,
        logs=logs,
        epoch=epoch,
        train_loader=train_loader,
        valid_loader=valid_loader,
        budget=budget,
        training_start_time=training_start_time,
        fit_start_time=fit_start_time,
        best_over_epochs=pipeline_config['best_over_epochs'],
        refit=refit,
        logger=logger)
    return {'loss': loss, 'info': final_log}
def fit(self, hyperparameter_config, pipeline_config, train_loader,
        valid_loader, network, optimizer, lr_scheduler, optimize_metric,
        additional_metrics, log_functions, budget, loss_function, budget_type,
        config_id, working_directory, train_indices, valid_indices):
    """Train an image network for the given budget, with checkpointing and
    optional tensorboard logging, and report loss/info to BOHB.

    The best epoch (by the optimize metric, on validation data when
    available) is reported rather than the last one.

    Returns:
        dict -- ``{'loss': <float to minimize>, 'info': <best epoch log>}``
    """
    # Budgets that are effectively zero cannot train; report the worst loss.
    if budget < 1e-5:
        return {
            'loss': float('inf') if pipeline_config["minimize"] else -float('inf'),
            'info': dict()
        }

    training_start_time = time.time()
    # prepare
    if not torch.cuda.is_available():
        pipeline_config["cuda"] = False

    device = torch.device('cuda' if pipeline_config['cuda'] else 'cpu')

    # Restore a previous checkpoint for this config/budget, if one exists.
    checkpoint_path = get_checkpoint_dir(working_directory)
    checkpoint = None
    if pipeline_config['save_checkpoints']:
        checkpoint = load_checkpoint(checkpoint_path, config_id, budget)

    network = load_model(network, checkpoint)

    tensorboard_logging = 'use_tensorboard_logger' in pipeline_config and pipeline_config[
        'use_tensorboard_logger']

    # NOTE(review): optimizer/scheduler state is *not* restored from the
    # checkpoint (calls below are commented out) — only model weights are.
    # from torch.optim import SGD
    # optimizer = SGD(network.parameters(), lr=0.3)

    # optimizer = load_optimizer(optimizer, checkpoint, device)
    # lr_scheduler = load_scheduler(lr_scheduler, checkpoint)

    hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)

    # Fall back to the first configured technique when the sampled config
    # does not carry one.
    batch_loss_name = hyperparameter_config[
        "batch_loss_computation_technique"] if "batch_loss_computation_technique" in hyperparameter_config else pipeline_config[
            "batch_loss_computation_techniques"][0]

    batch_loss_computation_technique = self.batch_loss_computation_techniques[
        batch_loss_name]()
    batch_loss_computation_technique.set_up(
        pipeline_config, ConfigWrapper(batch_loss_name, hyperparameter_config),
        self.logger)

    # Training loop
    logs = []
    epoch = 0

    # Decide which metrics are evaluated during training vs. validation.
    optimize_metrics = []
    val_metrics = [optimize_metric] + additional_metrics
    if pipeline_config['evaluate_on_train_data']:
        optimize_metrics = val_metrics
    elif valid_loader is None:
        self.logger.warning(
            'No valid data specified and train process should not evaluate on train data! Will ignore \"evaluate_on_train_data\" and evaluate on train data!'
        )
        optimize_metrics = val_metrics

    trainer = Trainer(model=network,
                      loss_computation=batch_loss_computation_technique,
                      criterion=loss_function,
                      budget=budget,
                      optimizer=optimizer,
                      scheduler=lr_scheduler,
                      budget_type=budget_type,
                      device=device,
                      config_id=config_id,
                      checkpoint_path=checkpoint_path
                      if pipeline_config['save_checkpoints'] else None,
                      # bool * int: 0 images when tensorboard is disabled
                      images_to_plot=tensorboard_logging *
                      pipeline_config['tensorboard_images_count'])

    model_params = self.count_parameters(network)

    # Timing accumulators for the summary log at the end.
    start_up = time.time() - training_start_time
    epoch_train_time = 0
    val_time = 0
    log_time = 0

    # tmp = time.time()
    # for _ in range(100):
    #     for _ in train_loader:
    #         pass
    # time_used = time.time() - tmp
    # self.logger.debug("Test time: " + str(time_used) + "s : \n" + str(pprint.pformat(train_loader.dataset.get_times('train_'))))

    self.logger.debug("Start train. Budget: " + str(budget))

    last_log_time = time.time()
    while True:
        # prepare epoch
        log = dict()

        # train
        tmp = time.time()
        optimize_metric_results, train_loss, stop_training = trainer.train(
            epoch + 1, train_loader, optimize_metrics)
        log['train_loss'] = train_loss
        for i, metric in enumerate(optimize_metrics):
            log['train_' + metric.name] = optimize_metric_results[i]
        epoch_train_time += time.time() - tmp

        # evaluate
        tmp = time.time()
        if valid_loader is not None:
            valid_metric_results = trainer.evaluate(valid_loader,
                                                    val_metrics,
                                                    epoch=epoch + 1)
            for i, metric in enumerate(val_metrics):
                log['val_' + metric.name] = valid_metric_results[i]
        val_time += time.time() - tmp

        # additional logs - e.g. test evaluation
        tmp = time.time()
        for func in log_functions:
            log[func.name] = func(network, epoch + 1)
        log_time += time.time() - tmp

        log['epochs'] = epoch + 1
        log['model_parameters'] = model_params
        log['learning_rate'] = optimizer.param_groups[0]['lr']

        # log.update(train_loader.dataset.get_times('train_'))
        # log.update(valid_loader.dataset.get_times('val_'))

        logs.append(log)

        epoch += 1

        self.logger.debug("Epoch: " + str(epoch) + " : " + str(log))

        # NOTE(review): since `epoch` was already incremented, this stops
        # after budget-1 epochs for integer budgets — confirm intended.
        if budget_type == 'epochs' and epoch + 1 >= budget:
            break

        if stop_training:
            break

        # Rate-limited tensorboard logging.
        if tensorboard_logging and time.time(
        ) - last_log_time >= pipeline_config['tensorboard_min_log_interval']:
            import tensorboard_logger as tl
            worker_path = 'Train/'
            tl.log_value(worker_path + 'budget', float(budget), epoch)
            for name, value in log.items():
                tl.log_value(worker_path + name, float(value), epoch)
            last_log_time = time.time()

    # wrap up
    wrap_up_start_time = time.time()

    self.logger.debug("Finished Training")

    # Score on validation when available, otherwise on train results.
    opt_metric_name = 'train_' + optimize_metric.name
    if valid_loader is not None:
        opt_metric_name = 'val_' + optimize_metric.name

    # Pick the best epoch over the whole run.
    if pipeline_config["minimize"]:
        final_log = min(logs, key=lambda x: x[opt_metric_name])
    else:
        final_log = max(logs, key=lambda x: x[opt_metric_name])

    if tensorboard_logging:
        import tensorboard_logger as tl
        worker_path = 'Train/'
        tl.log_value(worker_path + 'budget', float(budget), epoch)
        for name, value in final_log.items():
            tl.log_value(worker_path + name, float(value), epoch)

    # Prefer the checkpoint the trainer already wrote; otherwise save now.
    if trainer.latest_checkpoint:
        final_log['checkpoint'] = trainer.latest_checkpoint
    elif pipeline_config['save_checkpoints']:
        path = save_checkpoint(checkpoint_path, config_id, budget, network,
                               optimizer, lr_scheduler)
        final_log['checkpoint'] = path

    final_log['train_datapoints'] = len(train_indices)
    if valid_loader is not None:
        final_log['val_datapoints'] = len(valid_indices)

    # BOHB minimizes, so negate maximization metrics.
    loss = final_log[opt_metric_name] * (1 if pipeline_config["minimize"] else -1)

    self.logger.info(
        "Finished train with budget " + str(budget) + "s, Training took " +
        str(int(wrap_up_start_time - training_start_time)) +
        "s, Wrap up took " + str(int(time.time() - wrap_up_start_time)) +
        "s, Init took " + str(int(start_up)) + "s, Train took " +
        str(int(epoch_train_time)) + "s, Validation took " +
        str(int(val_time)) + "s, Log functions took " + str(int(log_time)) +
        "s, Cumulative time " + str(int(trainer.cumulative_time)) +
        "s.\nTotal time consumption in s: " +
        str(int(time.time() - training_start_time)))

    return {'loss': loss, 'info': final_log}
def fit(self, hyperparameter_config, pipeline_config, X_train, Y_train,
        X_valid, Y_valid, network, optimizer, train_metric,
        additional_metrics, log_functions, budget, loss_function,
        training_techniques, fit_start_time):
    """Run the epoch-based training loop on in-memory data.

    Instantiates this node's training techniques, prepares all training
    components via ``prepare_training``, trains epoch by epoch until any
    technique requests a stop, then wraps up and scores the run.

    Returns:
        dict -- ``{'loss': <reported loss>, 'info': <final log>}``
    """
    # Fall back to CPU when no CUDA device is present.
    if not torch.cuda.is_available():
        pipeline_config["cuda"] = False

    hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)

    # Node-level techniques are instantiated and placed before the supplied ones.
    training_techniques = [t() for t in self.training_techniques.values()] + training_techniques

    training_components, train_data, X_train, Y_train, X_valid, Y_valid, eval_specifics = prepare_training(
        pipeline_config=pipeline_config,
        hyperparameter_config=hyperparameter_config,
        training_techniques=training_techniques,
        batch_loss_computation_technique=self.batch_loss_computation_techniques[
            hyperparameter_config["batch_loss_computation_technique"]](),
        X_train=X_train, Y_train=Y_train, X_valid=X_valid, Y_valid=Y_valid,
        batch_size=hyperparameter_config["batch_size"],
        network=network,
        optimizer=optimizer,
        loss_function=loss_function,
        train_metric=train_metric,
        additional_metrics=additional_metrics,
        log_functions=log_functions,
        budget=budget,
        logger=self.logger,
        fit_start_time=fit_start_time)
    self.logger.debug("Start train. Budget: " + str(budget))

    # Training loop — resume bookkeeping from the network's recorded state.
    logs = network.logs
    epoch = network.epochs_trained
    keep_training = True
    training_start_time = time.time()

    while keep_training:
        # prepare epoch
        epoch_log = dict()
        for technique in training_techniques:
            technique.before_train_batches(training_components, epoch_log, epoch)

        # train and eval
        epoch_log['loss'] = _train_batches(train_data, training_components,
                                           training_techniques)
        _eval_metrics(eval_specifics=eval_specifics["after_epoch"],
                      hyperparameter_config=hyperparameter_config,
                      pipeline_config=pipeline_config,
                      training_components=training_components,
                      X_train=X_train, Y_train=Y_train,
                      X_valid=X_valid, Y_valid=Y_valid,
                      log=epoch_log, epoch=epoch, budget=budget)

        # Every technique gets its after-epoch callback — the list is built
        # eagerly so any() cannot short-circuit past a callback. Training
        # stops as soon as one technique requests it.
        stop_requests = [
            technique.after_train_batches(training_components, epoch_log, epoch)
            for technique in training_techniques
        ]
        keep_training = not any(stop_requests)

        # handle logs
        logs.append(epoch_log)
        self.logger.debug("Epoch: " + str(epoch) + " : " + str(epoch_log))
        epoch += 1

    # wrap up: persist training state on the network, then compute the score.
    wrap_up_start_time = time.time()
    network.epochs_trained = epoch
    network.logs = logs
    final_log, loss_value = wrap_up_training(
        pipeline_config=pipeline_config,
        hyperparameter_config=hyperparameter_config,
        eval_specifics=eval_specifics["after_training"],
        training_techniques=training_techniques,
        training_components=training_components,
        logs=logs,
        X_train=X_train, Y_train=Y_train, X_valid=X_valid, Y_valid=Y_valid,
        epoch=epoch, budget=budget)
    self.logger.debug("Finished train! Loss: " + str(loss_value) + " : " + str(final_log))
    self.logger.info("Finished train with budget " + str(budget) +
                     ": Preprocessing took " + str(int(training_start_time - fit_start_time)) +
                     "s, Training took " + str(int(wrap_up_start_time - training_start_time)) +
                     "s, Wrap up took " + str(int(time.time() - wrap_up_start_time)) +
                     "s. Total time consumption in s: " + str(int(time.time() - fit_start_time)))
    return {'loss': loss_value, 'info': final_log}
def fit(self, hyperparameter_config, pipeline_config, train_loader,
        valid_loader, network, optimizer, optimize_metric,
        additional_metrics, log_functions, budget, loss_function,
        training_techniques, fit_start_time, refit,
        hyperparameter_config_id):
    """Train the network.

    Arguments:
        hyperparameter_config {dict} -- The sampled hyperparameter config.
        pipeline_config {dict} -- The user specified configuration of the pipeline
        train_loader {DataLoader} -- Data for training.
        valid_loader {DataLoader} -- Data for validation.
        network {BaseNet} -- The neural network to be trained.
        optimizer {AutoNetOptimizerBase} -- The selected optimizer.
        optimize_metric {AutoNetMetric} -- The selected metric to optimize
        additional_metrics {list} -- List of metrics, that should be logged
        log_functions {list} -- List of AutoNetLogFunctions that can log additional stuff like test performance
        budget {float} -- The budget for training
        loss_function {_Loss} -- The selected PyTorch loss module
        training_techniques {list} -- List of objects inheriting from BaseTrainingTechnique.
        fit_start_time {float} -- Start time of fit
        refit {bool} -- Whether training for refit or not.

    Returns:
        dict -- loss and info reported to bohb
    """
    # Stash run identifiers for later use (e.g. tensorboard logging).
    self.hyperparameter_config_id = hyperparameter_config_id
    self.pipeline_config = pipeline_config
    self.budget = budget
    hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)
    logger = logging.getLogger('autonet')
    logger.debug("Start train. Budget: " + str(budget))
    if pipeline_config["torch_num_threads"] > 0:
        torch.set_num_threads(pipeline_config["torch_num_threads"])
    trainer = Trainer(
        model=network,
        loss_computation=self.batch_loss_computation_techniques[hyperparameter_config["batch_loss_computation_technique"]](),
        metrics=[optimize_metric] + additional_metrics,
        log_functions=log_functions,
        criterion=loss_function,
        budget=budget,
        optimizer=optimizer,
        training_techniques=training_techniques,
        device=Trainer.get_device(pipeline_config),
        logger=logger,
        full_eval_each_epoch=pipeline_config["full_eval_each_epoch"])
    trainer.prepare(pipeline_config, hyperparameter_config, fit_start_time)

    model_params = self.count_parameters(network)

    # Resume bookkeeping from the model's recorded state (supports refit).
    logs = trainer.model.logs
    epoch = trainer.model.epochs_trained
    training_start_time = time.time()
    while True:
        # prepare epoch
        log = dict()
        trainer.on_epoch_start(log=log, epoch=epoch)

        # training
        optimize_metric_results, train_loss, stop_training = trainer.train(epoch + 1, train_loader)
        if valid_loader is not None and trainer.eval_valid_each_epoch:
            valid_metric_results = trainer.evaluate(valid_loader)

        # evaluate: record loss, model size and per-metric results
        log['loss'] = train_loss
        log['model_parameters'] = model_params
        for i, metric in enumerate(trainer.metrics):
            log['train_' + metric.name] = optimize_metric_results[i]
            if valid_loader is not None and trainer.eval_valid_each_epoch:
                log['val_' + metric.name] = valid_metric_results[i]
        if trainer.eval_additional_logs_each_epoch:
            for additional_log in trainer.log_functions:
                log[additional_log.name] = additional_log(trainer.model, epoch)

        # wrap up epoch: techniques may also request a stop here
        stop_training = trainer.on_epoch_end(log=log, epoch=epoch) or stop_training

        # handle logs
        # NOTE: the dict appended to `logs` keeps ndarray entries; the
        # rebound, filtered copy is used only for debug/tensorboard output.
        logs.append(log)
        log = {key: value for key, value in log.items() if not isinstance(value, np.ndarray)}
        logger.debug("Epoch: " + str(epoch) + " : " + str(log))
        if 'use_tensorboard_logger' in pipeline_config and pipeline_config['use_tensorboard_logger']:
            self.tensorboard_log(budget=budget, epoch=epoch, log=log,
                                 logdir=pipeline_config["result_logger_dir"])
        if stop_training:
            break
        epoch += 1
        # Free cached GPU memory between epochs (no-op on CPU-only runs).
        torch.cuda.empty_cache()

    # wrap up: select/evaluate the final log and compute the reported loss
    loss, final_log = self.wrap_up_training(trainer=trainer,
                                            logs=logs,
                                            epoch=epoch,
                                            train_loader=train_loader,
                                            valid_loader=valid_loader,
                                            budget=budget,
                                            training_start_time=training_start_time,
                                            fit_start_time=fit_start_time,
                                            best_over_epochs=pipeline_config['best_over_epochs'],
                                            refit=refit,
                                            logger=logger)
    return {'loss': loss, 'info': final_log}
def fit(self, pipeline_config, hyperparameter_config, X, Y, train_indices,
        valid_indices, train_transform, valid_transform, dataset_info):
    """Build the train (and optionally validation) image data loaders.

    The dataset wrapper is chosen by data source: a known default dataset
    (downloaded on demand), an in-memory array (``XYDataset``), or a list of
    image file paths (``ImageFilelist``). The resulting datasets are wrapped
    in ``DataLoader``s sampling only the given index sets.
    """
    torch.manual_seed(pipeline_config["random_seed"])
    hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)
    target_cast = transform_int64
    batch_size = int(hyperparameter_config['batch_size'])
    worker_count = pipeline_config['dataloader_worker']

    if dataset_info.default_dataset:
        # Known dataset class: download into the configured directory.
        download_dir = pipeline_config['default_dataset_download_dir']
        train_dataset = dataset_info.default_dataset(
            root=download_dir, train=True, download=True,
            transform=train_transform)
        if valid_indices is not None:
            valid_dataset = dataset_info.default_dataset(
                root=download_dir, train=True, download=True,
                transform=valid_transform)
    elif len(X.shape) > 1:
        # Data is already held in memory as an array.
        train_dataset = XYDataset(X, Y, transform=train_transform,
                                  target_transform=target_cast)
        valid_dataset = XYDataset(X, Y, transform=valid_transform,
                                  target_transform=target_cast)
    else:
        # X holds image file paths; the train set caches decoded images and
        # the validation set shares that cache instead of building its own.
        train_dataset = ImageFilelist(
            X, Y, transform=train_transform, target_transform=target_cast,
            cache_size=pipeline_config['dataloader_cache_size_mb'] * 1000,
            image_size=dataset_info.x_shape[2:])
        if valid_indices is not None:
            valid_dataset = ImageFilelist(
                X, Y, transform=valid_transform, target_transform=target_cast,
                cache_size=0, image_size=dataset_info.x_shape[2:])
            valid_dataset.cache = train_dataset.cache

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              sampler=SubsetRandomSampler(train_indices),
                              drop_last=True,
                              pin_memory=True,
                              num_workers=worker_count)

    valid_loader = None
    if valid_indices is not None:
        valid_loader = DataLoader(dataset=valid_dataset,
                                  batch_size=batch_size,
                                  sampler=SubsetRandomSampler(valid_indices),
                                  drop_last=False,
                                  pin_memory=True,
                                  num_workers=worker_count)

    return {
        'train_loader': train_loader,
        'valid_loader': valid_loader,
        'batch_size': hyperparameter_config['batch_size']
    }
def fit(self, pipeline_config, hyperparameter_config, dataset_info, X, Y,
        train_indices, valid_indices):
    """Assemble the train and validation image transform pipelines.

    The train pipeline optionally applies augmentation (FastAutoAugment
    and/or AutoAugment, or a default color/affine/crop/flip augmentation),
    then normalizes with the mean/std computed on the training split.
    The validation pipeline only resizes, center-crops and normalizes.

    Returns:
        dict -- train_transform, valid_transform, mean and std.
    """
    mean, std = self.compute_mean_std(
        pipeline_config, hyperparameter_config, X, Y, train_indices,
        dataset_info)  #dataset_info.mean, dataset_info.std

    hyperparameter_config = ConfigWrapper(self.get_name(),
                                          hyperparameter_config)

    transform_list = []
    # Square crop size: the smaller of the two spatial dimensions.
    image_size = min(dataset_info.x_shape[-2], dataset_info.x_shape[-1])

    # In-memory data arrives as arrays/tensors and must become PIL images first.
    if len(X.shape) > 1:
        transform_list.append(transforms.ToPILImage())

    if hyperparameter_config['augment']:
        if hyperparameter_config[
                'fastautoaugment'] and hyperparameter_config['autoaugment']:
            # fast autoaugment and autoaugment
            transform_list.extend([
                FastAutoAugment(),
                AutoAugment(),
                transforms.Resize(image_size),
                transforms.RandomCrop(image_size, padding=4),
                transforms.RandomHorizontalFlip()
            ])
        elif hyperparameter_config['fastautoaugment']:
            # fast autoaugment
            transform_list.extend([
                FastAutoAugment(),
                transforms.Resize(image_size),
                transforms.RandomCrop(image_size, padding=4),
                transforms.RandomHorizontalFlip()
            ])
        elif hyperparameter_config['autoaugment']:
            # autoaugment
            transform_list.extend([
                AutoAugment(),
                transforms.Resize(image_size),
                transforms.RandomCrop(image_size, padding=4),
                transforms.RandomHorizontalFlip()
            ])
        else:
            # default augment color, rotation, size
            transform_list.extend([
                transforms.ColorJitter(brightness=0.196,
                                       saturation=0.196,
                                       hue=0.141),
                transforms.RandomAffine(degrees=10, shear=0.1, fillcolor=127),
                transforms.RandomResizedCrop(image_size,
                                             scale=(0.533, 1),
                                             ratio=(0.75, 1.25)),
                transforms.RandomHorizontalFlip()
            ])
    else:
        transform_list.extend([
            transforms.Resize(image_size),
            transforms.CenterCrop(image_size),
        ])

    # grayscale if only one channel
    if dataset_info.x_shape[1] == 1:
        transform_list.append(transforms.Grayscale(1))

    # normalize
    transform_list.append(transforms.ToTensor())
    transform_list.append(transforms.Normalize(mean, std))

    # cutout
    if hyperparameter_config['cutout']:
        n_holes = hyperparameter_config['cutout_holes']
        # FIX: n_holes was read from the config but a hard-coded 1 was passed
        # to Cutout, silently ignoring the 'cutout_holes' hyperparameter.
        transform_list.append(
            Cutout(n_holes=n_holes,
                   length=hyperparameter_config['length'],
                   probability=0.5))

    train_transform = transforms.Compose(transform_list)

    # Validation: deterministic resize/crop/normalize only (no augmentation).
    transform_list = []
    if len(X.shape) > 1:
        transform_list.append(transforms.ToPILImage())

    transform_list.extend([
        transforms.Resize(image_size),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    valid_transform = transforms.Compose(
        [transforms.Grayscale(1)] + transform_list
        if dataset_info.x_shape[1] == 1 else transform_list)

    return {
        'train_transform': train_transform,
        'valid_transform': valid_transform,
        'mean': mean,
        'std': std
    }
def prepare_training(pipeline_config, hyperparameter_config,
                     training_techniques, batch_loss_computation_technique,
                     X_train, Y_train, X_valid, Y_valid, batch_size, network,
                     optimizer, loss_function, train_metric,
                     additional_metrics, log_functions, budget, logger,
                     fit_start_time):
    """Prepare the data and components for training.

    Seeds torch, selects the device, sets up the batch-loss technique and all
    training techniques, moves the data to tensors/loaders, and decides which
    datasets get evaluated after each epoch vs. only after training.

    Returns:
        tuple -- (training_components dict, train DataLoader, X_train,
        Y_train, X_valid, Y_valid, eval_specifics dict)
    """
    torch.manual_seed(pipeline_config["random_seed"])
    device = torch.device('cuda:0' if pipeline_config['cuda'] else 'cpu')
    if pipeline_config['cuda']:
        logger.debug('Running on the GPU using CUDA.')
    else:
        logger.debug(
            'Not running on GPU as CUDA is either disabled or not available. Running on CPU instead.'
        )

    # initialize training techniques and training components
    batch_loss_computation_technique.set_up(
        pipeline_config,
        ConfigWrapper(
            hyperparameter_config["batch_loss_computation_technique"],
            hyperparameter_config), logger)
    training_components = {
        "network": network.to(device),
        "optimizer": optimizer,
        "loss_function": loss_function.to(device),
        "metrics": [train_metric] + additional_metrics,
        "train_metric_name": train_metric.__name__,
        "log_functions": log_functions,
        "device": device,
        "initial_budget": network.budget_trained,
        "budget": budget,
        "batch_loss_computation_technique": batch_loss_computation_technique,
        "fit_start_time": fit_start_time
    }
    # Merge in each technique's own components, then let the techniques set
    # themselves up. (Was a side-effect list comprehension; a plain loop is
    # the idiomatic form and avoids building a throwaway list of Nones.)
    for technique in training_techniques:
        training_components.update(technique.training_components)
    for technique in training_techniques:
        technique.set_up(training_components, pipeline_config, logger)

    # prepare data: densify, convert to tensors, move validation to device
    X_train, Y_train, X_valid, Y_valid = to_dense(X_train), to_dense(
        Y_train), to_dense(X_valid), to_dense(Y_valid)
    X_train, Y_train = torch.from_numpy(X_train).float(), torch.from_numpy(
        Y_train)
    train_data = DataLoader(TensorDataset(X_train, Y_train), batch_size, True)
    X_valid = torch.from_numpy(X_valid).float().to(
        device) if X_valid is not None else None
    Y_valid = torch.from_numpy(Y_valid).to(
        device) if Y_valid is not None else None

    # eval specifics. decide which datasets should be evaluated when.
    after_epoch_eval_specifics = {
        "train":
        any(t.needs_eval_on_train_each_epoch() for t in training_techniques)
        or (pipeline_config["full_eval_each_epoch"]
            and pipeline_config["eval_on_training"]),
        "valid":
        any(t.needs_eval_on_valid_each_epoch() for t in training_techniques)
        or pipeline_config["full_eval_each_epoch"],
        "logs":
        pipeline_config["full_eval_each_epoch"]
    }
    # Whatever was not evaluated every epoch is evaluated once after training.
    after_training_eval_specifics = {
        "train":
        not after_epoch_eval_specifics["train"] and
        (pipeline_config["eval_on_training"] or X_valid is None
         or Y_valid is None),
        "valid":
        not after_epoch_eval_specifics["valid"],
        "logs":
        not after_epoch_eval_specifics["logs"]
    }
    eval_specifics = {
        "after_epoch": after_epoch_eval_specifics,
        "after_training": after_training_eval_specifics
    }
    return training_components, train_data, X_train, Y_train, X_valid, Y_valid, eval_specifics