def test_H5Dataset():
    """Read HDF5 in parallel.

    There are known issues with sharing HDF5 file handles across DataLoader
    worker processes. They can be avoided by opening the HDF5 file on the fly
    inside each worker; the drawback is that multiple worker processes will
    each load their own copy into memory.
    """
    set_random_seed(0)
    size = 10
    with tempfile.TemporaryDirectory() as tmpdirname:
        filename = tmpdirname + '/data.h5'
        h5_file = h5py.File(filename, mode='w')
        h5_file.create_dataset('data', data=np.arange(size))
        h5_file.close()

        dataset = H5Dataset(filename, size)
        dataloader = DataLoader(
            dataset,
            batch_size=1,
            shuffle=False,
            collate_fn=lambda x: x,
            num_workers=2,
        )
        print('-' * 8)
        for x in dataloader:
            print(x)

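# A minimal sketch of the "open HDF5 on the fly" workaround described in the
# docstring above, assuming the standard torch.utils.data.Dataset interface.
# The class name and attributes below are illustrative only; the actual
# H5Dataset used by test_H5Dataset may be implemented differently.
from torch.utils.data import Dataset  # assumed available (DataLoader comes from the same module)


class _LazyH5DatasetSketch(Dataset):
    def __init__(self, filename, size):
        self.filename = filename
        self.size = size
        self._data = None  # opened lazily, once per worker process

    def __getitem__(self, index):
        if self._data is None:
            # Open the file only after the worker process has started, so no
            # HDF5 handle created in the parent process is shared across workers.
            self._data = h5py.File(self.filename, mode='r')['data']
        return self._data[index]

    def __len__(self):
        return self.size
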
def test_dataloader():
    set_random_seed(0)
    dataset = RandomDataset()

    # ---------------------------------------------------------------------------- #
    # Without worker_init_fn, every two batches (one per worker) are expected to
    # contain the same numpy random results, and the next epoch repeats the same
    # results again.
    # ---------------------------------------------------------------------------- #
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=True,
        collate_fn=lambda x: x,
        num_workers=2,
        # worker_init_fn=worker_init_fn,
    )
    print('Without worker_init_fn')
    for _ in range(2):
        print('-' * 8)
        for x in dataloader:
            print(x)

    # ---------------------------------------------------------------------------- #
    # Initializing each worker with worker_init_fn solves this issue.
    # ---------------------------------------------------------------------------- #
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=True,
        collate_fn=lambda x: x,
        num_workers=2,
        worker_init_fn=worker_init_fn,
    )
    print('With worker_init_fn')
    for _ in range(2):
        print('-' * 8)
        for x in dataloader:
            print(x)

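# A minimal sketch of the kind of worker_init_fn referenced above, assuming the
# failure mode described in the comments: each DataLoader worker inherits the
# parent's numpy RNG state, so numpy must be re-seeded per worker. The actual
# worker_init_fn used by test_dataloader may differ; the name below is
# illustrative only.
def _worker_init_fn_sketch(worker_id):
    # torch.initial_seed() already differs per worker and per epoch; fold it
    # into numpy's RNG so numpy-based randomness differs across workers too.
    np.random.seed(torch.initial_seed() % (2 ** 32))
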
def main():
    # ---------------------------------------------------------------------------- #
    # Set up the experiment
    # ---------------------------------------------------------------------------- #
    args = parse_args()

    # Load the configuration
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    # Run name
    timestamp = time.strftime('%m-%d_%H-%M-%S')
    hostname = socket.gethostname()
    run_name = '{:s}.{:s}'.format(timestamp, hostname)

    output_dir = cfg.OUTPUT_DIR
    # Replace '@' with the config path
    if output_dir:
        config_path = osp.splitext(args.config_file)[0]
        output_dir = output_dir.replace('@', config_path.replace('configs', 'outputs'))
        if args.dev:
            output_dir = osp.join(output_dir, run_name)
            warnings.warn('Dev mode enabled.')
        if osp.isdir(output_dir):
            warnings.warn('Output directory exists.')
        os.makedirs(output_dir, exist_ok=True)

    logger = setup_logger('train', output_dir, filename='log.train.{:s}.txt'.format(run_name))
    logger.info('{:d} GPUs available'.format(torch.cuda.device_count()))
    logger.info(args)

    from common.utils.collect_env import collect_env_info
    logger.info('Collecting env info (might take some time)\n' + collect_env_info())
    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # ---------------------------------------------------------------------------- #
    # Build models, optimizer, scheduler, checkpointer, etc.
    # ---------------------------------------------------------------------------- #
    # Build the model
    set_random_seed(cfg.RNG_SEED)
    model = build_model(cfg)
    logger.info('Build model:\n{}'.format(str(model)))
    # Currently only single-GPU mode is supported.
    model = model.cuda()

    # Build the optimizer
    optimizer = build_optimizer(cfg, model)

    # Build the LR scheduler
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    # Build the checkpointer.
    # Note that the checkpointer loads the state_dict of the model, optimizer and scheduler.
    checkpointer = CheckpointerV2(model,
                                  optimizer=optimizer,
                                  scheduler=lr_scheduler,
                                  save_dir=output_dir,
                                  logger=logger,
                                  max_to_keep=cfg.TRAIN.MAX_TO_KEEP)
    checkpoint_data = checkpointer.load(cfg.RESUME_PATH,
                                        resume=cfg.AUTO_RESUME,
                                        resume_states=cfg.RESUME_STATES,
                                        strict=cfg.RESUME_STRICT)
    ckpt_period = cfg.TRAIN.CHECKPOINT_PERIOD
    start_iter = checkpoint_data.get('iteration', 0)

    # Build the data loader.
    # Reset the random seed again in case the initialization of the model changed the random state.
    set_random_seed(cfg.RNG_SEED)
    train_dataloader = build_gnn_dataloader(cfg, True, start_iter)
    logger.info(train_dataloader.dataset)

    # Build metrics
    train_meters = MetricLogger(delimiter=' ')

    def setup_train():
        model.train()
        train_meters.reset()

    # Build the tensorboard logger
    summary_writer = None
    if output_dir:
        tb_dir = output_dir
        summary_writer = SummaryWriter(tb_dir, max_queue=64, flush_secs=30)

    # ---------------------------------------------------------------------------- #
    # Set up validation
    # ---------------------------------------------------------------------------- #
    val_period = cfg.VAL.PERIOD
    do_validation = val_period > 0
    if do_validation:
        val_dataloader = build_gnn_dataloader(cfg, training=False)
        logger.info(val_dataloader.dataset)
        val_meters = MetricLogger(delimiter=' ')
        best_metric_name = 'best_{}'.format(cfg.VAL.METRIC)
        best_metric = checkpoint_data.get(best_metric_name, None)

        def setup_validate():
            model.eval()
            val_meters.reset()

    # ---------------------------------------------------------------------------- #
    # Training begins.
    # ---------------------------------------------------------------------------- #
    setup_train()
    max_iter = cfg.TRAIN.MAX_ITER
    logger.info('Start training from iteration {}'.format(start_iter))
    tic = time.time()
    for iteration, data_batch in enumerate(train_dataloader, start_iter):
        cur_iter = iteration + 1
        data_time = time.time() - tic

        # Copy data from CPU to GPU
        data_batch = data_batch.to('cuda')

        # Forward
        pd_dict = model(data_batch)

        # Compute losses
        loss_dict = model.compute_losses(pd_dict, data_batch)
        total_loss = sum(loss_dict.values())

        # It is slightly faster to update metrics and meters before backward.
        with torch.no_grad():
            train_meters.update(total_loss=total_loss, **loss_dict)
            model.update_metrics(pd_dict, data_batch, train_meters.metrics)

        # Backward
        optimizer.zero_grad()
        total_loss.backward()
        if cfg.OPTIMIZER.MAX_GRAD_NORM > 0:
            # CAUTION: the built-in clip_grad_norm_ clips the total norm.
            total_norm = clip_grad_norm_(model.parameters(), max_norm=cfg.OPTIMIZER.MAX_GRAD_NORM)
        else:
            total_norm = None
        optimizer.step()

        batch_time = time.time() - tic
        train_meters.update(time=batch_time, data=data_time)

        # Logging
        log_period = cfg.TRAIN.LOG_PERIOD
        if log_period > 0 and (cur_iter % log_period == 0 or cur_iter == 1):
            logger.info(
                train_meters.delimiter.join([
                    'iter: {iter:4d}',
                    '{meters}',
                    'lr: {lr:.2e}',
                    'max mem: {memory:.0f}',
                ]).format(
                    iter=cur_iter,
                    meters=str(train_meters),
                    lr=optimizer.param_groups[0]['lr'],
                    memory=torch.cuda.max_memory_allocated() / (1024.0 ** 2),
                ))

        # Summary
        summary_period = cfg.TRAIN.SUMMARY_PERIOD
        if summary_writer is not None and (summary_period > 0 and cur_iter % summary_period == 0):
            keywords = ('loss', 'acc')
            for name, metric in train_meters.metrics.items():
                if all(k not in name for k in keywords):
                    continue
                summary_writer.add_scalar('train/' + name, metric.result, global_step=cur_iter)
            # Summarize the gradient norm
            if total_norm is not None:
                summary_writer.add_scalar('grad_norm', total_norm, global_step=cur_iter)

        # ---------------------------------------------------------------------------- #
        # Validate for one epoch
        # ---------------------------------------------------------------------------- #
        if do_validation and (cur_iter % val_period == 0 or cur_iter == max_iter):
            setup_validate()
            logger.info('Validation begins at iteration {}.'.format(cur_iter))

            start_time_val = time.time()
            tic = time.time()
            for iteration_val, data_batch in enumerate(val_dataloader):
                data_time = time.time() - tic

                # Copy data from CPU to GPU
                data_batch = data_batch.to('cuda')

                # Forward
                with torch.no_grad():
                    pd_dict = model(data_batch)

                    # Compute losses
                    loss_dict = model.compute_losses(pd_dict, data_batch)
                    total_loss = sum(loss_dict.values())

                    # Update metrics and meters
                    val_meters.update(loss=total_loss, **loss_dict)
                    model.update_metrics(pd_dict, data_batch, val_meters.metrics)

                batch_time = time.time() - tic
                val_meters.update(time=batch_time, data=data_time)
                tic = time.time()

                if cfg.VAL.LOG_PERIOD > 0 and iteration_val % cfg.VAL.LOG_PERIOD == 0:
                    logger.info(
                        val_meters.delimiter.join([
                            'iter: {iter:4d}',
                            '{meters}',
                            'max mem: {memory:.0f}',
                        ]).format(
                            iter=iteration_val,
                            meters=str(val_meters),
                            memory=torch.cuda.max_memory_allocated() / (1024.0 ** 2),
                        ))
            # END: validation loop

            epoch_time_val = time.time() - start_time_val
            logger.info('Iteration[{}]-Val {} total_time: {:.2f}s'.format(
                cur_iter, val_meters.summary_str, epoch_time_val))

            # Summary
            if summary_writer is not None:
                keywords = ('loss', 'acc', 'ap', 'recall')
                for name, metric in val_meters.metrics.items():
                    if all(k not in name for k in keywords):
                        continue
                    summary_writer.add_scalar('val/' + name, metric.result, global_step=cur_iter)

            # Track the best validation metric
            if cfg.VAL.METRIC in val_meters.metrics:
                cur_metric = val_meters.metrics[cfg.VAL.METRIC].result
                if best_metric is None \
                        or (cfg.VAL.METRIC_ASCEND and cur_metric > best_metric) \
                        or (not cfg.VAL.METRIC_ASCEND and cur_metric < best_metric):
                    best_metric = cur_metric
                    checkpoint_data['iteration'] = cur_iter
                    checkpoint_data[best_metric_name] = best_metric
                    checkpointer.save('model_best', tag=False, **checkpoint_data)

            # Restore the training state
            setup_train()

        # ---------------------------------------------------------------------------- #
        # After validation
        # ---------------------------------------------------------------------------- #
        # Checkpoint
        if (ckpt_period > 0 and cur_iter % ckpt_period == 0) or cur_iter == max_iter:
            checkpoint_data['iteration'] = cur_iter
            if do_validation and best_metric is not None:
                checkpoint_data[best_metric_name] = best_metric
            checkpointer.save('model_{:06d}'.format(cur_iter), **checkpoint_data)

        # ---------------------------------------------------------------------------- #
        # Finalize one step
        # ---------------------------------------------------------------------------- #
        # Since PyTorch v1.1.0, lr_scheduler.step() is called after the optimizer step.
        if lr_scheduler is not None:
            lr_scheduler.step()
        tic = time.time()
    # END: training loop

    if do_validation and cfg.VAL.METRIC:
        logger.info('Best val-{} = {}'.format(cfg.VAL.METRIC, best_metric))