Code Example #1
def test_H5Dataset():
    """Read HDF5 in parallel

    There exist some issues of hdf5 handlers. It could be solved by loading hdf5 on-the-fly.
    However, the drawback is that it will load multiple copies into memory for multiple processes.

    """
    set_random_seed(0)
    size = 10

    with tempfile.TemporaryDirectory() as tmpdirname:
        filename = tmpdirname + '/data.h5'
        h5_file = h5py.File(filename, mode='w')
        h5_file.create_dataset('data', data=np.arange(size))
        h5_file.close()
        dataset = H5Dataset(filename, size)

        dataloader = DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            collate_fn=lambda x: x,
            num_workers=2,
        )

        print('-' * 8)
        for x in dataloader:
            print(x)
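
H5Dataset itself is not shown in this snippet. A minimal sketch of the lazy-opening pattern the test relies on might look like the following; the class name and constructor signature are taken from the call above, while the body is an assumption:

import h5py
import numpy as np
from torch.utils.data import Dataset


class H5Dataset(Dataset):
    """Hypothetical sketch: open the HDF5 file lazily in each worker."""

    def __init__(self, filename, size):
        self.filename = filename
        self.size = size
        self.h5_file = None  # opened on first access, after workers are started

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        # Opening here (rather than in __init__) avoids sharing a single
        # handle across forked worker processes.
        if self.h5_file is None:
            self.h5_file = h5py.File(self.filename, mode='r')
        return np.asarray(self.h5_file['data'][index])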
Code Example #2
def test_dataloader():
    set_random_seed(0)
    dataset = RandomDataset()

    # ---------------------------------------------------------------------------- #
    # Without worker_init_fn, every two consecutive batches are expected to
    # contain the same numpy random results (one per worker), and the next
    # round through the loader repeats the same results.
    # ---------------------------------------------------------------------------- #
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=True,
        collate_fn=lambda x: x,
        num_workers=2,
        # worker_init_fn=worker_init_fn,
    )

    print('Without worker_init_fn')
    for _ in range(2):
        print('-' * 8)
        for x in dataloader:
            print(x)

    # ---------------------------------------------------------------------------- #
    # Re-seeding each worker with worker_init_fn fixes this issue.
    # ---------------------------------------------------------------------------- #
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=True,
        collate_fn=lambda x: x,
        num_workers=2,
        worker_init_fn=worker_init_fn,
    )

    print('With worker_init_fn')
    for _ in range(2):
        print('-' * 8)
        for x in dataloader:
            print(x)
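
worker_init_fn and RandomDataset are also defined outside this snippet. A minimal sketch consistent with the test, assuming the dataset draws from numpy's global RNG and the fix is to derive a per-worker numpy seed, could be:

import numpy as np
import torch
from torch.utils.data import Dataset


class RandomDataset(Dataset):
    """Hypothetical sketch: each item comes from numpy's global RNG."""

    def __len__(self):
        return 4

    def __getitem__(self, index):
        return np.random.randint(0, 100)


def worker_init_fn(worker_id):
    # Workers created by fork inherit the same numpy random state from the
    # parent process. torch.initial_seed() already differs per worker, so it
    # can be used to give each worker its own numpy seed.
    np.random.seed(torch.initial_seed() % (2 ** 32))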
Code Example #3
def main():
    # ---------------------------------------------------------------------------- #
    # Setup the experiment
    # ---------------------------------------------------------------------------- #
    args = parse_args()

    # load the configuration
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    # run name
    timestamp = time.strftime('%m-%d_%H-%M-%S')
    hostname = socket.gethostname()
    run_name = '{:s}.{:s}'.format(timestamp, hostname)

    output_dir = cfg.OUTPUT_DIR
    # replace '@' with config path
    if output_dir:
        config_path = osp.splitext(args.config_file)[0]
        output_dir = output_dir.replace(
            '@', config_path.replace('configs', 'outputs'))
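        # e.g. OUTPUT_DIR='@/baseline' with configs/foo.yaml resolves to
        # 'outputs/foo/baseline' (illustrative values, not from the config)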
        if args.dev:
            output_dir = osp.join(output_dir, run_name)
            warnings.warn('Dev mode enabled.')
        if osp.isdir(output_dir):
            warnings.warn('Output directory exists.')
        os.makedirs(output_dir, exist_ok=True)

    logger = setup_logger('train',
                          output_dir,
                          filename='log.train.{:s}.txt'.format(run_name))
    logger.info('{:d} GPUs available'.format(torch.cuda.device_count()))
    logger.info(args)

    from common.utils.collect_env import collect_env_info
    logger.info('Collecting env info (might take some time)\n' +
                collect_env_info())

    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # ---------------------------------------------------------------------------- #
    # Build models, optimizer, scheduler, checkpointer, etc.
    # ---------------------------------------------------------------------------- #
    # build model
    set_random_seed(cfg.RNG_SEED)
    model = build_model(cfg)
    logger.info('Build model:\n{}'.format(str(model)))

    # Currently only single-GPU mode is supported.
    model = model.cuda()

    # build optimizer
    optimizer = build_optimizer(cfg, model)

    # build lr scheduler
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    # build checkpointer
    # Note that the checkpointer will load the state_dicts of the model, optimizer and scheduler.
    checkpointer = CheckpointerV2(model,
                                  optimizer=optimizer,
                                  scheduler=lr_scheduler,
                                  save_dir=output_dir,
                                  logger=logger,
                                  max_to_keep=cfg.TRAIN.MAX_TO_KEEP)
    checkpoint_data = checkpointer.load(cfg.RESUME_PATH,
                                        resume=cfg.AUTO_RESUME,
                                        resume_states=cfg.RESUME_STATES,
                                        strict=cfg.RESUME_STRICT)
    ckpt_period = cfg.TRAIN.CHECKPOINT_PERIOD
    start_iter = checkpoint_data.get('iteration', 0)

    # build data loader
    # Reset the random seed again in case the initialization of models changes the random state.
    set_random_seed(cfg.RNG_SEED)
    train_dataloader = build_gnn_dataloader(cfg, True, start_iter)
    logger.info(train_dataloader.dataset)

    # build metrics
    train_meters = MetricLogger(delimiter='  ')

    def setup_train():
        model.train()
        train_meters.reset()

    # Build tensorboard logger
    summary_writer = None
    if output_dir:
        tb_dir = output_dir
        summary_writer = SummaryWriter(tb_dir, max_queue=64, flush_secs=30)

    # ---------------------------------------------------------------------------- #
    # Setup validation
    # ---------------------------------------------------------------------------- #
    val_period = cfg.VAL.PERIOD
    do_validation = val_period > 0
    if do_validation:
        val_dataloader = build_gnn_dataloader(cfg, training=False)
        logger.info(val_dataloader.dataset)
        val_meters = MetricLogger(delimiter='  ')

        best_metric_name = 'best_{}'.format(cfg.VAL.METRIC)
        best_metric = checkpoint_data.get(best_metric_name, None)

        def setup_validate():
            model.eval()
            val_meters.reset()

    # ---------------------------------------------------------------------------- #
    # Training begins.
    # ---------------------------------------------------------------------------- #
    setup_train()
    max_iter = cfg.TRAIN.MAX_ITER
    logger.info('Start training from iteration {}'.format(start_iter))
    tic = time.time()

    for iteration, data_batch in enumerate(train_dataloader, start_iter):
        cur_iter = iteration + 1
        data_time = time.time() - tic

        # copy data from cpu to gpu
        data_batch = data_batch.to('cuda')

        # forward
        pd_dict = model(data_batch)

        # update losses
        loss_dict = model.compute_losses(
            pd_dict,
            data_batch,
        )
        total_loss = sum(loss_dict.values())

        # It is slightly faster to update metrics and meters before backward
        with torch.no_grad():
            train_meters.update(total_loss=total_loss, **loss_dict)
            model.update_metrics(pd_dict, data_batch, train_meters.metrics)

        # backward
        optimizer.zero_grad()
        total_loss.backward()
        if cfg.OPTIMIZER.MAX_GRAD_NORM > 0:
            # CAUTION: built-in clip_grad_norm_ clips the total norm.
            total_norm = clip_grad_norm_(model.parameters(),
                                         max_norm=cfg.OPTIMIZER.MAX_GRAD_NORM)
        else:
            total_norm = None
        optimizer.step()

        batch_time = time.time() - tic
        train_meters.update(time=batch_time, data=data_time)

        # log
        log_period = cfg.TRAIN.LOG_PERIOD
        if log_period > 0 and (cur_iter % log_period == 0 or cur_iter == 1):
            logger.info(
                train_meters.delimiter.join([
                    'iter: {iter:4d}',
                    '{meters}',
                    'lr: {lr:.2e}',
                    'max mem: {memory:.0f}',
                ]).format(
                    iter=cur_iter,
                    meters=str(train_meters),
                    lr=optimizer.param_groups[0]['lr'],
                    memory=torch.cuda.max_memory_allocated() / (1024.0**2),
                ))

        # summary
        summary_period = cfg.TRAIN.SUMMARY_PERIOD
        if summary_writer is not None and (summary_period > 0
                                           and cur_iter % summary_period == 0):
            keywords = (
                'loss',
                'acc',
            )
            for name, metric in train_meters.metrics.items():
                if all(k not in name for k in keywords):
                    continue
                summary_writer.add_scalar('train/' + name,
                                          metric.result,
                                          global_step=cur_iter)

            # summarize gradient norm
            if total_norm is not None:
                summary_writer.add_scalar('grad_norm',
                                          total_norm,
                                          global_step=cur_iter)

        # ---------------------------------------------------------------------------- #
        # validate for one epoch
        # ---------------------------------------------------------------------------- #
        if do_validation and (cur_iter % val_period == 0
                              or cur_iter == max_iter):
            setup_validate()
            logger.info('Validation begins at iteration {}.'.format(cur_iter))

            start_time_val = time.time()
            tic = time.time()
            for iteration_val, data_batch in enumerate(val_dataloader):
                data_time = time.time() - tic

                # copy data from cpu to gpu
                data_batch = data_batch.to('cuda')

                # forward
                with torch.no_grad():
                    pd_dict = model(data_batch)

                # update losses and metrics
                loss_dict = model.compute_losses(pd_dict, data_batch)
                total_loss = sum(loss_dict.values())

                # update metrics and meters
                val_meters.update(loss=total_loss, **loss_dict)
                model.update_metrics(pd_dict, data_batch, val_meters.metrics)

                batch_time = time.time() - tic
                val_meters.update(time=batch_time, data=data_time)
                tic = time.time()

                if cfg.VAL.LOG_PERIOD > 0 and iteration_val % cfg.VAL.LOG_PERIOD == 0:
                    logger.info(
                        val_meters.delimiter.join([
                            'iter: {iter:4d}',
                            '{meters}',
                            'max mem: {memory:.0f}',
                        ]).format(
                            iter=iteration_val,
                            meters=str(val_meters),
                            memory=torch.cuda.max_memory_allocated() /
                            (1024.0**2),
                        ))

            # END: validation loop
            epoch_time_val = time.time() - start_time_val
            logger.info('Iteration[{}]-Val {}  total_time: {:.2f}s'.format(
                cur_iter, val_meters.summary_str, epoch_time_val))

            # summary
            if summary_writer is not None:
                keywords = ('loss', 'acc', 'ap', 'recall')
                for name, metric in val_meters.metrics.items():
                    if all(k not in name for k in keywords):
                        continue
                    summary_writer.add_scalar('val/' + name,
                                              metric.result,
                                              global_step=cur_iter)

            # best validation
            if cfg.VAL.METRIC in val_meters.metrics:
                cur_metric = val_meters.metrics[cfg.VAL.METRIC].result
                if best_metric is None \
                        or (cfg.VAL.METRIC_ASCEND and cur_metric > best_metric) \
                        or (not cfg.VAL.METRIC_ASCEND and cur_metric < best_metric):
                    best_metric = cur_metric
                    checkpoint_data['iteration'] = cur_iter
                    checkpoint_data[best_metric_name] = best_metric
                    checkpointer.save('model_best',
                                      tag=False,
                                      **checkpoint_data)

            # restore training
            setup_train()

        # ---------------------------------------------------------------------------- #
        # After validation
        # ---------------------------------------------------------------------------- #
        # checkpoint
        if (ckpt_period > 0
                and cur_iter % ckpt_period == 0) or cur_iter == max_iter:
            checkpoint_data['iteration'] = cur_iter
            if do_validation and best_metric is not None:
                checkpoint_data[best_metric_name] = best_metric
            checkpointer.save('model_{:06d}'.format(cur_iter),
                              **checkpoint_data)

        # ---------------------------------------------------------------------------- #
        # Finalize one step
        # ---------------------------------------------------------------------------- #
        # Since PyTorch v1.1.0, lr_scheduler.step() should be called after optimizer.step().
        if lr_scheduler is not None:
            lr_scheduler.step()
        tic = time.time()

    # END: training loop
    if do_validation and cfg.VAL.METRIC:
        logger.info('Best val-{} = {}'.format(cfg.VAL.METRIC, best_metric))
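
set_random_seed is used in all three snippets but not defined here. A plausible sketch that seeds the Python, NumPy, and PyTorch RNGs (an assumption, not necessarily the repository's actual implementation):

import random

import numpy as np
import torch


def set_random_seed(seed):
    # Hypothetical sketch: seed every RNG the snippets above may rely on.
    if seed < 0:
        return
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)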