import sys
import traceback
from contextlib import redirect_stdout
from os.path import join
from shutil import rmtree

import pytorch_lightning as pl
import torch
from mlflow.tracking import MlflowClient
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import MLFlowLogger


def main(
    args,
    args_file_path: str,
    tmp_results_dir: str,
    train_log_file_path: str,
) -> None:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # get_dataset, get_dataloader, and get_model are project-local helpers.
    train_data_dict = get_dataset(args).train_data_dict
    train_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['train_sampler'],
    )
    validation_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['validation_sampler'],
    )

    model = get_model(
        args=args,
        device=device,
        hparams={
            'learning rate': args.TRAIN.LR,
            'batch size': args.TRAIN.BATCH_SIZE,
        },
    ).to(device)

    mlflow_logger = MLFlowLogger(experiment_name=args.MLFLOW.EXPERIMENT_NAME)
    checkpoint_callback = ModelCheckpoint(monitor='validation_accuracy')
    trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        # distributed_backend and replace_sampler_ddp follow the older
        # pytorch-lightning 1.x API this script targets. The samplers are
        # built manually above, so Lightning must not replace them.
        distributed_backend=args.TRAIN.DISTRIBUTED_BACKEND,
        gpus=args.TRAIN.GPUS,
        logger=mlflow_logger,
        max_epochs=args.TRAIN.MAX_EPOCHS,
        replace_sampler_ddp=False,
    )

    try:
        exist_error = False
        trainer.fit(model, train_dataloader, validation_dataloader)
    except Exception:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            error_file_path = join(tmp_results_dir, 'error_log.txt')
            with open(error_file_path, 'w') as f:
                traceback.print_exc(file=f)
            exist_error = True
        print()
        print('Failed to train. See error_log.txt on mlflow.')
        print(f'Experiment name: {args.MLFLOW.EXPERIMENT_NAME}')
        print(f'Run id: {run_id}')
        sys.exit(1)
    finally:
        # Always upload the dumped config and the training log to mlflow,
        # plus the traceback if training failed, then clean up.
        run_id = mlflow_logger.run_id
        if run_id is not None:
            with open(args_file_path, 'w') as f:
                with redirect_stdout(f):
                    print(args.dump())
            mlflow_client = MlflowClient()
            mlflow_client.log_artifact(run_id, args_file_path)
            mlflow_client.log_artifact(run_id, train_log_file_path)
            if exist_error:
                mlflow_client.log_artifact(run_id, error_file_path)
        rmtree(tmp_results_dir, ignore_errors=True)
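# A minimal driver sketch for main(). Everything here except main()'s
# signature is an assumption for illustration: the --config flag, the
# temporary-directory layout, and load_config() are hypothetical. The yacs
# CfgNode is inferred from the args.dump() call in main().
import argparse
import tempfile

from yacs.config import CfgNode


def load_config(path: str) -> CfgNode:
    # Hypothetical loader; a real project would merge the file into its
    # default config instead of an empty node.
    cfg = CfgNode(new_allowed=True)
    cfg.merge_from_file(path)
    cfg.freeze()
    return cfg


def run_from_cli() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', required=True, help='path to a YAML config')
    cli_args = parser.parse_args()

    args = load_config(cli_args.config)
    tmp_results_dir = tempfile.mkdtemp(prefix='train_results_')
    main(
        args=args,
        args_file_path=join(tmp_results_dir, 'args.yaml'),
        tmp_results_dir=tmp_results_dir,
        train_log_file_path=join(tmp_results_dir, 'log.txt'),
    )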
# In addition to the imports above, the Optuna objective needs:
from datetime import datetime
from os import makedirs


def objective(trial, args, tmp_results_dir: str) -> float:
    # Each trial writes into its own timestamped working directory.
    timestamp = datetime.today().strftime('%Y-%m-%d-%H:%M:%S')
    cur_tmp_results_dir = join(tmp_results_dir, timestamp)
    makedirs(cur_tmp_results_dir, exist_ok=True)
    args_file_path = join(cur_tmp_results_dir, 'args.yaml')
    train_log_file_path = join(cur_tmp_results_dir, 'log.txt')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    train_data_dict = get_dataset(args).train_data_dict
    train_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['train_sampler'],
    )
    validation_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['validation_sampler'],
    )

    model = get_model(
        args=args,
        device=device,
        trial=trial,
        # Hyperparameters decided by the trial do not need to be listed
        # here. This code picks the learning rate with the trial and
        # records it in the model's configure_optimizers method.
        hparams={
            'batch size': args.TRAIN.BATCH_SIZE,
        },
    ).to(device)

    mlflow_logger = MLFlowLogger(experiment_name=args.MLFLOW.EXPERIMENT_NAME)
    checkpoint_callback = ModelCheckpoint(monitor='validation_accuracy')
    trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        distributed_backend=args.TRAIN.DISTRIBUTED_BACKEND,
        gpus=args.TRAIN.GPUS,
        logger=mlflow_logger,
        max_epochs=args.TRAIN.MAX_EPOCHS,
        replace_sampler_ddp=False,
    )

    try:
        exist_error = False
        print(f'To see training logs, you can check {train_log_file_path}')
        with open(train_log_file_path, 'w') as f:
            with redirect_stdout(f):
                trainer.fit(model, train_dataloader, validation_dataloader)
    except Exception:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            error_file_path = join(cur_tmp_results_dir, 'error_log.txt')
            with open(error_file_path, 'w') as f:
                traceback.print_exc(file=f)
            exist_error = True
        print()
        print('Failed to train. See error_log.txt on mlflow.')
        print(f'Experiment name: {args.MLFLOW.EXPERIMENT_NAME}')
        print(f'Run id: {run_id}')
        sys.exit(1)
    finally:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            with open(args_file_path, 'w') as f:
                with redirect_stdout(f):
                    print(args.dump())
            mlflow_client = MlflowClient()
            mlflow_client.log_artifact(run_id, args_file_path)
            mlflow_client.log_artifact(run_id, train_log_file_path)
            if exist_error:
                mlflow_client.log_artifact(run_id, error_file_path)
        rmtree(cur_tmp_results_dir, ignore_errors=True)

    return checkpoint_callback.best_model_score
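# How objective() is typically driven. Only objective's signature and its
# return value (the best monitored score) come from the code above; the
# study settings and the functools.partial wiring are illustrative
# assumptions. Note that objective() calls sys.exit(1) on failure, which
# ends the whole study rather than just the failed trial.
from functools import partial

import optuna


def tune(args, tmp_results_dir: str, n_trials: int = 20) -> None:
    # validation_accuracy is the monitored metric, so higher is better.
    study = optuna.create_study(direction='maximize')
    study.optimize(
        partial(objective, args=args, tmp_results_dir=tmp_results_dir),
        n_trials=n_trials,
    )
    print(f'Best value: {study.best_value}')
    print(f'Best hyperparameters: {study.best_params}')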
# The standalone MLflow writer below additionally needs:
import mlflow
import numpy as np
from omegaconf import DictConfig, ListConfig


class MLflowWriter:
    """Thin wrapper around MlflowClient that buffers metric values and
    periodically logs their running mean."""

    def __init__(self, exp_name, save_dir, log_every, **mlflow_cfg):
        mlflow.set_tracking_uri(save_dir)
        self.client = MlflowClient(**mlflow_cfg)
        mlflow.set_experiment(exp_name)
        self.experiment_id = self.client.get_experiment_by_name(
            exp_name).experiment_id
        self.run_id = self.client.create_run(self.experiment_id).info.run_id
        self.log_every = log_every
        self.clear()

    def log_params_from_omegaconf(self, params):
        self._explore_recursive('', params)

    def _explore_recursive(self, parent_name, element):
        # Walk nested DictConfig/ListConfig nodes and log every leaf value
        # under a dotted key such as 'train.optimizer.lr'.
        if isinstance(element, DictConfig):
            iterator = element.items()
        elif isinstance(element, ListConfig):
            iterator = enumerate(element)
        else:
            return  # a leaf was passed in directly; nothing to recurse into
        for k, v in iterator:
            if isinstance(v, (DictConfig, ListConfig)):
                self._explore_recursive(f'{parent_name}{k}.', v)
            else:
                self.client.log_param(self.run_id, f'{parent_name}{k}', v)

    def log_torch_model(self, model, epoch):
        with mlflow.start_run(self.run_id):
            mlflow.pytorch.log_model(model, 'model_%04d' % epoch)

    def log_metric(self, key, value, is_training):
        if isinstance(value, torch.Tensor):
            value = float(value.detach().cpu().numpy())
        metric_name = 'train/' if is_training else 'valid/'
        metric_name += str(key)
        if metric_name in self.metrics:
            self.metrics[metric_name].append(value)
        else:
            self.metrics[metric_name] = [value]

    def next_iteration(self):
        self.iterations += 1
        if self.iterations % self.log_every == 0:
            self.toMlflow(nb_data=self.log_every)

    def toMlflow(self, nb_data=0, step=0):
        # Log the mean of the last nb_data buffered values of each metric
        # (nb_data=0 averages the whole buffer).
        for key, value in self.metrics.items():
            self.client.log_metric(
                self.run_id, key, np.mean(value[-nb_data:]), step=step)

    def get_mean(self, key, is_training):
        metric_name = 'train/' if is_training else 'valid/'
        metric_name += str(key)
        return np.mean(self.metrics[metric_name])

    def clear(self):
        self.metrics = {}
        self.iterations = 0

    def log_artifact(self, path):
        self.client.log_artifact(self.run_id, local_path=path)

    def terminate(self):
        self.client.set_terminated(self.run_id)
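# A usage sketch for MLflowWriter around a plain training loop. The model,
# dataloaders, config dict, and training_step/validation_step helpers are
# placeholders; the writer calls themselves mirror the class API above.
from omegaconf import OmegaConf


def train(cfg: dict, model, train_loader, valid_loader, epochs: int) -> None:
    writer = MLflowWriter(
        exp_name='my_experiment',   # assumed experiment name
        save_dir='./mlruns',        # local tracking URI (assumption)
        log_every=50,               # flush running means every 50 iterations
    )
    writer.log_params_from_omegaconf(OmegaConf.create(cfg))
    try:
        for epoch in range(epochs):
            for batch in train_loader:
                loss = training_step(model, batch)  # hypothetical helper
                writer.log_metric('loss', loss, is_training=True)
                writer.next_iteration()
            for batch in valid_loader:
                loss = validation_step(model, batch)  # hypothetical helper
                writer.log_metric('loss', loss, is_training=False)
            mean_valid = writer.get_mean('loss', is_training=False)
            print(f'epoch {epoch}: valid loss {mean_valid:.4f}')
            writer.toMlflow(step=epoch)      # flush everything buffered so far
            writer.log_torch_model(model, epoch)
            writer.clear()                   # reset buffers for the next epoch
    finally:
        writer.terminate()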