Code example #1
File: train.py  Project: ryuji0123/tmp
def main(
    args,
    args_file_path: str,
    tmp_results_dir: str,
    train_log_file_path: str,
) -> None:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    train_data_dict = get_dataset(args).train_data_dict

    train_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['train_sampler'],
    )

    validation_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['validation_sampler'],
    )

    model = get_model(
        args=args,
        device=device,
        hparams={
            'learning rate': args.TRAIN.LR,
            'batch size': args.TRAIN.BATCH_SIZE,
        },
    ).to(device)

    mlflow_logger = MLFlowLogger(experiment_name=args.MLFLOW.EXPERIMENT_NAME)

    # 'validation_accuracy' should be maximized, so set mode explicitly.
    checkpoint_callback = ModelCheckpoint(monitor='validation_accuracy', mode='max')

    trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        distributed_backend=args.TRAIN.DISTRIBUTED_BACKEND,
        gpus=args.TRAIN.GPUS,
        logger=mlflow_logger,
        max_epochs=args.TRAIN.MAX_EPOCHS,
        replace_sampler_ddp=False,
    )

    try:
        exist_error = False
        trainer.fit(model, train_dataloader, validation_dataloader)
    except Exception:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            error_file_path = join(tmp_results_dir, 'error_log.txt')
            with open(error_file_path, 'w') as f:
                traceback.print_exc(file=f)
            exist_error = True
            print()
            print('Failed to train. See error_log.txt on mlflow.')
            print(f'Experiment name: {args.MLFLOW.EXPERIMENT_NAME}')
            print(f'Run id: {run_id}')
            sys.exit(1)
    finally:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            with open(args_file_path, 'w') as f:
                with redirect_stdout(f):
                    print(args.dump())

            mlflow_client = MlflowClient()
            mlflow_client.log_artifact(run_id, args_file_path)
            mlflow_client.log_artifact(run_id, train_log_file_path)
            if exist_error:
                mlflow_client.log_artifact(run_id, error_file_path)
            rmtree(tmp_results_dir, ignore_errors=True)
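
A minimal sketch of how this entry point might be invoked. It mirrors the stdout redirection used in the Optuna variant below; get_args() is a hypothetical helper (not part of the project), and the config is assumed to be a yacs-style CfgNode since main calls args.dump().

from contextlib import redirect_stdout
from os.path import join
from tempfile import mkdtemp

if __name__ == '__main__':
    args = get_args()  # hypothetical helper returning a yacs-style CfgNode
    tmp_results_dir = mkdtemp(prefix='train_results_')  # scratch dir, removed by main()
    args_file_path = join(tmp_results_dir, 'args.yaml')
    train_log_file_path = join(tmp_results_dir, 'log.txt')
    # Capture training output in the log file so main() can upload it to MLflow.
    with open(train_log_file_path, 'w') as f, redirect_stdout(f):
        main(args, args_file_path, tmp_results_dir, train_log_file_path)
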
Code example #2
File: train_with_optuna.py  Project: ryuji0123/tmp
def objective(trial, args, tmp_results_dir: str) -> float:
    timestamp = datetime.today().strftime('%Y-%m-%d-%H:%M:%S')
    cur_tmp_results_dir = join(tmp_results_dir, timestamp)
    makedirs(cur_tmp_results_dir, exist_ok=True)

    args_file_path = join(cur_tmp_results_dir, 'args.yaml')
    train_log_file_path = join(cur_tmp_results_dir, 'log.txt')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    train_data_dict = get_dataset(args).train_data_dict

    train_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['train_sampler'],
    )

    validation_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['validation_sampler'],
    )

    model = get_model(
        args=args,
        device=device,
        trial=trial,
        # If you want a hyperparameter to be decided by the trial, you don't need to set its value here.
        # This code decides the learning rate with the trial and records it in the model's configure_optimizers method.
        hparams={
            'batch size': args.TRAIN.BATCH_SIZE,
        },
    ).to(device)

    mlflow_logger = MLFlowLogger(experiment_name=args.MLFLOW.EXPERIMENT_NAME)

    # 'validation_accuracy' should be maximized, so set mode explicitly.
    checkpoint_callback = ModelCheckpoint(monitor='validation_accuracy', mode='max')

    trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        distributed_backend=args.TRAIN.DISTRIBUTED_BACKEND,
        gpus=args.TRAIN.GPUS,
        logger=mlflow_logger,
        max_epochs=args.TRAIN.MAX_EPOCHS,
        replace_sampler_ddp=False,
    )

    try:
        exist_error = False
        print(f'To see training logs, you can check {train_log_file_path}')
        with open(train_log_file_path, 'w') as f:
            with redirect_stdout(f):
                trainer.fit(model, train_dataloader, validation_dataloader)
    except Exception:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            error_file_path = join(tmp_results_dir, 'error_log.txt')
            with open(error_file_path, 'w') as f:
                traceback.print_exc(file=f)
            exist_error = True
            print()
            print('Failed to train. See error_log.txt on mlflow.')
            print(f'Experiment name: {args.MLFLOW.EXPERIMENT_NAME}')
            print(f'Run id: {run_id}')
            sys.exit(1)
    finally:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            with open(args_file_path, 'w') as f:
                with redirect_stdout(f):
                    print(args.dump())
            mlflow_client = MlflowClient()
            mlflow_client.log_artifact(run_id, args_file_path)
            mlflow_client.log_artifact(run_id, train_log_file_path)
            if exist_error:
                mlflow_client.log_artifact(run_id, error_file_path)
            rmtree(cur_tmp_results_dir, ignore_errors=True)

    return checkpoint_callback.best_model_score
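
For context, an objective like this is normally handed to an Optuna study. The sketch below is an assumption about how the surrounding script wires it up: args and tmp_results_dir are prepared by the caller as in the previous example, the trial budget is a placeholder, and direction='maximize' assumes the monitored validation_accuracy should be maximized.

import optuna

study = optuna.create_study(direction='maximize')
study.optimize(
    lambda trial: objective(trial, args, tmp_results_dir),
    n_trials=20,  # placeholder trial budget
)
print(f'Best value: {study.best_value}')
print(f'Best params: {study.best_params}')
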
Code example #3
File: writer.py  Project: kskkwn/optuna_hydra_mlflow
class MLflowWriter(object):
    def __init__(self, exp_name, save_dir, log_every, **mlflow_cfg):
        mlflow.set_tracking_uri(save_dir)
        self.client = MlflowClient(**mlflow_cfg)
        mlflow.set_experiment(exp_name)
        self.experiment_id = self.client.get_experiment_by_name(
            exp_name).experiment_id
        self.run_id = self.client.create_run(self.experiment_id).info.run_id

        self.log_every = log_every
        self.clear()

    def log_params_from_omegaconf(self, params):
        self._explore_recursive("", params)

    def _explore_recursive(self, parent_name, element):
        if isinstance(element, DictConfig):
            iterator = element.items()
        elif isinstance(element, ListConfig):
            iterator = enumerate(element)
        else:
            # Plain values are logged by the caller; return early so `iterator` is never undefined.
            return

        for k, v in iterator:
            if isinstance(v, DictConfig) or isinstance(v, ListConfig):
                self._explore_recursive(f"{parent_name}{k}.", v)
            else:
                self.client.log_param(
                    self.run_id, f"{parent_name}{k}", v)

    def log_torch_model(self, model, epoch):
        with mlflow.start_run(self.run_id):
            mlflow.pytorch.log_model(model, "model_%04d" % epoch)

    def log_metric(self, key, value, is_training):
        if isinstance(value, torch.Tensor):
            value = float(value.detach().cpu().numpy())

        metric_name = "train/" if is_training else "valid/"
        metric_name += str(key)

        # Look up the prefixed name, not the raw key, so values accumulate across calls.
        if metric_name in self.metrics:
            self.metrics[metric_name].append(value)
        else:
            self.metrics[metric_name] = [value]

    def next_iteration(self):
        self.iterations += 1
        if self.iterations % self.log_every == 0:
            self.toMlflow(nb_data=self.log_every)

    def toMlflow(self, nb_data=0, step=0):
        for key, value in self.metrics.items():
            self.client.log_metric(
                self.run_id, key,
                np.mean(value[-nb_data:]), step=step)

    def get_mean(self, key, is_training):
        metric_name = "train/" if is_training else "valid/"
        metric_name += str(key)

        return np.mean(self.metrics[metric_name])

    def clear(self):
        self.metrics = {}
        self.iterations = 0

    def log_artifact(self, path):
        self.client.log_artifact(self.run_id, local_path=path)

    def terminate(self):
        self.client.set_terminated(self.run_id)
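
A minimal usage sketch for the writer above, assuming a local ./mlruns tracking directory; the experiment name, config keys, and the metric values are placeholders.

import torch
from omegaconf import OmegaConf

cfg = OmegaConf.create({'optimizer': {'lr': 1e-3}, 'model': {'layers': [64, 32]}})  # placeholder config

writer = MLflowWriter(exp_name='demo', save_dir='./mlruns', log_every=10)
writer.log_params_from_omegaconf(cfg)  # logs flattened keys such as optimizer.lr

for step in range(100):
    loss = torch.rand(())  # stands in for a real training loss tensor
    writer.log_metric('loss', loss, is_training=True)
    writer.next_iteration()  # pushes the running mean to MLflow every `log_every` iterations

writer.terminate()
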