def main():
    """Entry point for mvpnet_3d testing: load config, prepare output dir/logger, run test()."""
    args = parse_args()

    # Load the configuration.
    # Import on-the-fly to avoid overwriting cfg from sibling config modules.
    from common.config import purge_cfg
    from mvpnet.config.mvpnet_3d import cfg
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    # Replace '@' with the config path (configs/... -> outputs/...).
    if output_dir:
        config_path = osp.splitext(args.config_file)[0]
        output_dir = output_dir.replace('@', config_path.replace('configs', 'outputs'))
        if not osp.isdir(output_dir):
            warnings.warn('Make a new directory: {}'.format(output_dir))
            # exist_ok=True avoids a crash if the directory is created between
            # the isdir() check and this call (e.g. by a concurrent run).
            os.makedirs(output_dir, exist_ok=True)

    # Run name: timestamp + hostname, used to tag the log file.
    timestamp = time.strftime('%m-%d_%H-%M-%S')
    hostname = socket.gethostname()
    run_name = '{:s}.{:s}'.format(timestamp, hostname)

    logger = setup_logger('mvpnet', output_dir, comment='test.{:s}'.format(run_name))
    logger.info('{:d} GPUs available'.format(torch.cuda.device_count()))
    logger.info(args)

    from common.utils.misc import collect_env_info
    logger.info('Collecting env info (might take some time)\n' + collect_env_info())
    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # This script only supports the mvpnet_3d task.
    assert cfg.TASK == 'mvpnet_3d'
    test(cfg, args, output_dir, run_name)
def main():
    """Entry point for sem_seg_2d training: load config, prepare output dir/logger, run train()."""
    args = parse_args()

    # Load the configuration.
    # Import on-the-fly to avoid overwriting cfg from sibling config modules.
    from common.config import purge_cfg
    from mvpnet.config.sem_seg_2d import cfg
    # BUG FIX: the config file was hard-coded to an absolute path on one
    # developer's machine ('/home/dchangyu/...'); use the CLI argument instead,
    # consistent with the other entry points in this file.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    # Replace '@' with the config path (configs/... -> outputs/...).
    if output_dir:
        # BUG FIX: strip the '.yaml' extension before deriving the output dir,
        # otherwise the output directory name ends in '.yaml'.
        config_path = osp.splitext(args.config_file)[0]
        output_dir = output_dir.replace('@', config_path.replace('configs', 'outputs'))
        if osp.isdir(output_dir):
            warnings.warn('Output directory exists.')
        os.makedirs(output_dir, exist_ok=True)

    # Run name: timestamp + hostname, used to tag the log file.
    timestamp = time.strftime('%m-%d_%H-%M-%S')
    hostname = socket.gethostname()
    run_name = '{:s}.{:s}'.format(timestamp, hostname)

    logger = setup_logger('mvpnet', output_dir, comment='train.{:s}'.format(run_name))
    logger.info('{:d} GPUs available'.format(torch.cuda.device_count()))
    logger.info(args)

    from common.utils.misc import collect_env_info
    logger.info('Collecting env info (might take some time)\n' + collect_env_info())
    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # This script only supports the sem_seg_2d task.
    assert cfg.TASK == 'sem_seg_2d'
    train(cfg, output_dir, run_name)
def main():
    """Train a GNN model: setup, training loop with periodic logging,
    validation, checkpointing and LR scheduling."""
    # ---------------------------------------------------------------------------- #
    # Setup the experiment
    # ---------------------------------------------------------------------------- #
    args = parse_args()

    # Load the configuration.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    # Run name: timestamp + hostname, used to tag the log file and dev output dir.
    timestamp = time.strftime('%m-%d_%H-%M-%S')
    hostname = socket.gethostname()
    run_name = '{:s}.{:s}'.format(timestamp, hostname)

    output_dir = cfg.OUTPUT_DIR
    # Replace '@' with the config path (configs/... -> outputs/...).
    if output_dir:
        config_path = osp.splitext(args.config_file)[0]
        output_dir = output_dir.replace('@', config_path.replace('configs', 'outputs'))
        if args.dev:
            # Dev mode: isolate each run in its own timestamped subdirectory.
            output_dir = osp.join(output_dir, run_name)
            warnings.warn('Dev mode enabled.')
        if osp.isdir(output_dir):
            warnings.warn('Output directory exists.')
        os.makedirs(output_dir, exist_ok=True)

    logger = setup_logger('train', output_dir, filename='log.train.{:s}.txt'.format(run_name))
    logger.info('{:d} GPUs available'.format(torch.cuda.device_count()))
    logger.info(args)

    from common.utils.collect_env import collect_env_info
    logger.info('Collecting env info (might take some time)\n' + collect_env_info())
    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # ---------------------------------------------------------------------------- #
    # Build models, optimizer, scheduler, checkpointer, etc.
    # ---------------------------------------------------------------------------- #
    # Build model (seed first so model init is reproducible).
    set_random_seed(cfg.RNG_SEED)
    model = build_model(cfg)
    logger.info('Build model:\n{}'.format(str(model)))
    # Currently only support single-gpu mode.
    model = model.cuda()

    # Build optimizer and LR scheduler.
    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)

    # Build checkpointer.
    # Note that checkpointer will load state_dict of model, optimizer and scheduler.
    checkpointer = CheckpointerV2(model,
                                  optimizer=optimizer,
                                  scheduler=lr_scheduler,
                                  save_dir=output_dir,
                                  logger=logger,
                                  max_to_keep=cfg.TRAIN.MAX_TO_KEEP)
    checkpoint_data = checkpointer.load(cfg.RESUME_PATH,
                                        resume=cfg.AUTO_RESUME,
                                        resume_states=cfg.RESUME_STATES,
                                        strict=cfg.RESUME_STRICT)
    ckpt_period = cfg.TRAIN.CHECKPOINT_PERIOD
    start_iter = checkpoint_data.get('iteration', 0)

    # Build data loader.
    # Reset the random seed again in case the initialization of models changes the random state.
    set_random_seed(cfg.RNG_SEED)
    train_dataloader = build_gnn_dataloader(cfg, True, start_iter)
    logger.info(train_dataloader.dataset)

    # Build metrics.
    train_meters = MetricLogger(delimiter=' ')

    def setup_train():
        # Put the model in training mode and clear accumulated metrics.
        model.train()
        train_meters.reset()

    # Build tensorboard logger (only when there is an output directory).
    summary_writer = None
    if output_dir:
        tb_dir = output_dir
        summary_writer = SummaryWriter(tb_dir, max_queue=64, flush_secs=30)

    # ---------------------------------------------------------------------------- #
    # Setup validation
    # ---------------------------------------------------------------------------- #
    val_period = cfg.VAL.PERIOD
    do_validation = val_period > 0
    if do_validation:
        val_dataloader = build_gnn_dataloader(cfg, training=False)
        logger.info(val_dataloader.dataset)
        val_meters = MetricLogger(delimiter=' ')
        best_metric_name = 'best_{}'.format(cfg.VAL.METRIC)
        best_metric = checkpoint_data.get(best_metric_name, None)

        def setup_validate():
            # Put the model in eval mode and clear validation metrics.
            model.eval()
            val_meters.reset()

    # ---------------------------------------------------------------------------- #
    # Training begins.
    # ---------------------------------------------------------------------------- #
    setup_train()
    max_iter = cfg.TRAIN.MAX_ITER
    logger.info('Start training from iteration {}'.format(start_iter))
    tic = time.time()
    for iteration, data_batch in enumerate(train_dataloader, start_iter):
        cur_iter = iteration + 1
        data_time = time.time() - tic

        # Copy data from cpu to gpu.
        data_batch = data_batch.to('cuda')

        # Forward.
        pd_dict = model(data_batch)

        # Update losses.
        loss_dict = model.compute_losses(pd_dict, data_batch)
        total_loss = sum(loss_dict.values())

        # It is slightly faster to update metrics and meters before backward.
        with torch.no_grad():
            train_meters.update(total_loss=total_loss, **loss_dict)
            model.update_metrics(pd_dict, data_batch, train_meters.metrics)

        # Backward.
        optimizer.zero_grad()
        total_loss.backward()
        if cfg.OPTIMIZER.MAX_GRAD_NORM > 0:
            # CAUTION: built-in clip_grad_norm_ clips the total norm.
            total_norm = clip_grad_norm_(model.parameters(), max_norm=cfg.OPTIMIZER.MAX_GRAD_NORM)
        else:
            total_norm = None
        optimizer.step()

        batch_time = time.time() - tic
        train_meters.update(time=batch_time, data=data_time)

        # Log.
        log_period = cfg.TRAIN.LOG_PERIOD
        if log_period > 0 and (cur_iter % log_period == 0 or cur_iter == 1):
            logger.info(
                train_meters.delimiter.join([
                    'iter: {iter:4d}',
                    '{meters}',
                    'lr: {lr:.2e}',
                    'max mem: {memory:.0f}',
                ]).format(
                    iter=cur_iter,
                    meters=str(train_meters),
                    lr=optimizer.param_groups[0]['lr'],
                    memory=torch.cuda.max_memory_allocated() / (1024.0 ** 2),
                ))

        # Summary.
        summary_period = cfg.TRAIN.SUMMARY_PERIOD
        if summary_writer is not None and (summary_period > 0 and cur_iter % summary_period == 0):
            keywords = ('loss', 'acc',)
            for name, metric in train_meters.metrics.items():
                if all(k not in name for k in keywords):
                    continue
                summary_writer.add_scalar('train/' + name, metric.result, global_step=cur_iter)
            # Summarize gradient norm.
            if total_norm is not None:
                summary_writer.add_scalar('grad_norm', total_norm, global_step=cur_iter)

        # ---------------------------------------------------------------------------- #
        # Validate for one epoch
        # ---------------------------------------------------------------------------- #
        if do_validation and (cur_iter % val_period == 0 or cur_iter == max_iter):
            setup_validate()
            logger.info('Validation begins at iteration {}.'.format(cur_iter))
            start_time_val = time.time()
            tic = time.time()
            for iteration_val, data_batch in enumerate(val_dataloader):
                data_time = time.time() - tic

                # Copy data from cpu to gpu.
                data_batch = data_batch.to('cuda')

                # Forward; no gradients needed during validation.
                with torch.no_grad():
                    pd_dict = model(data_batch)
                    # Update losses and metrics.
                    loss_dict = model.compute_losses(pd_dict, data_batch)
                    total_loss = sum(loss_dict.values())
                    # Update metrics and meters.
                    val_meters.update(loss=total_loss, **loss_dict)
                    model.update_metrics(pd_dict, data_batch, val_meters.metrics)

                batch_time = time.time() - tic
                val_meters.update(time=batch_time, data=data_time)
                tic = time.time()

                if cfg.VAL.LOG_PERIOD > 0 and iteration_val % cfg.VAL.LOG_PERIOD == 0:
                    logger.info(
                        val_meters.delimiter.join([
                            'iter: {iter:4d}',
                            '{meters}',
                            'max mem: {memory:.0f}',
                        ]).format(
                            # BUG FIX: was `iter=iteration` (the frozen outer
                            # training counter), which printed the same value on
                            # every validation log line.
                            iter=iteration_val,
                            meters=str(val_meters),
                            memory=torch.cuda.max_memory_allocated() / (1024.0 ** 2),
                        ))
            # END: validation loop
            epoch_time_val = time.time() - start_time_val
            logger.info('Iteration[{}]-Val {} total_time: {:.2f}s'.format(
                cur_iter, val_meters.summary_str, epoch_time_val))

            # Summary.
            if summary_writer is not None:
                keywords = ('loss', 'acc', 'ap', 'recall')
                for name, metric in val_meters.metrics.items():
                    if all(k not in name for k in keywords):
                        continue
                    summary_writer.add_scalar('val/' + name, metric.result, global_step=cur_iter)

            # Best validation: save a 'model_best' checkpoint when the tracked
            # metric improves (direction controlled by cfg.VAL.METRIC_ASCEND).
            if cfg.VAL.METRIC in val_meters.metrics:
                cur_metric = val_meters.metrics[cfg.VAL.METRIC].result
                if best_metric is None \
                        or (cfg.VAL.METRIC_ASCEND and cur_metric > best_metric) \
                        or (not cfg.VAL.METRIC_ASCEND and cur_metric < best_metric):
                    best_metric = cur_metric
                    checkpoint_data['iteration'] = cur_iter
                    checkpoint_data[best_metric_name] = best_metric
                    checkpointer.save('model_best', tag=False, **checkpoint_data)

            # Restore training mode after validation.
            setup_train()

        # ---------------------------------------------------------------------------- #
        # After validation
        # ---------------------------------------------------------------------------- #
        # Checkpoint.
        if (ckpt_period > 0 and cur_iter % ckpt_period == 0) or cur_iter == max_iter:
            checkpoint_data['iteration'] = cur_iter
            if do_validation and best_metric is not None:
                checkpoint_data[best_metric_name] = best_metric
            checkpointer.save('model_{:06d}'.format(cur_iter), **checkpoint_data)

        # ---------------------------------------------------------------------------- #
        # Finalize one step
        # ---------------------------------------------------------------------------- #
        # Since pytorch v1.1.0, lr_scheduler is called after optimization.
        if lr_scheduler is not None:
            lr_scheduler.step()
        tic = time.time()
    # END: training loop

    if do_validation and cfg.VAL.METRIC:
        logger.info('Best val-{} = {}'.format(cfg.VAL.METRIC, best_metric))
def main():
    """Run inference with a trained model and dump per-image box proposals
    (optionally visualizing the first N images)."""
    # ---------------------------------------------------------------------------- #
    # Setup the experiment
    # ---------------------------------------------------------------------------- #
    args = parse_args()

    # Load the configuration.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    purge_cfg(cfg)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    # Replace '@' with the config path (configs/... -> outputs/...).
    if output_dir:
        config_path = osp.splitext(args.config_file)[0]
        output_dir = output_dir.replace('@', config_path.replace('configs', 'outputs'))
        if not osp.isdir(output_dir):
            warnings.warn('Make a new directory: {}'.format(output_dir))
            # exist_ok=True avoids a crash if the directory is created between
            # the isdir() check and this call (e.g. by a concurrent run).
            os.makedirs(output_dir, exist_ok=True)

    # Run name: timestamp + hostname, used to tag the log file.
    timestamp = time.strftime('%m-%d_%H-%M-%S')
    hostname = socket.gethostname()
    run_name = '{:s}.{:s}'.format(timestamp, hostname)

    # NOTE(review): the log directory is '' (current working directory), not
    # output_dir — looks unintentional; confirm before changing.
    logger = setup_logger('predict', '', filename=f'log.predict.{run_name}.txt')
    logger.info('{:d} GPUs available'.format(torch.cuda.device_count()))
    logger.info(args)

    from common.utils.collect_env import collect_env_info
    logger.info('Collecting env info (might take some time)\n' + collect_env_info())
    logger.info('Loaded configuration file {:s}'.format(args.config_file))
    logger.info('Running with config:\n{}'.format(cfg))

    # ---------------------------------------------------------------------------- #
    # Setup the model and the dataset
    # ---------------------------------------------------------------------------- #
    batch_size = args.batch_size
    det_thresh = args.det_thresh
    vis_first_n = args.vis_first_n
    vis_thresh = args.vis_thresh

    # Build model.
    model = build_model(cfg)
    logger.info('Build model:\n{}'.format(str(model)))
    model = model.cuda()
    model.eval()

    # Build checkpointer.
    checkpointer = CheckpointerV2(model, save_dir=output_dir, logger=logger)
    if args.ckpt_path:
        # Load weight if specified ('@' expands to the output directory).
        weight_path = args.ckpt_path.replace('@', output_dir)
        checkpointer.load(weight_path, resume=False)
    else:
        # Load last checkpoint.
        checkpointer.load(None, resume=True)

    # Build dataset. 'start'/'end' are dropped so proposals are generated for
    # the whole dataset.
    dataset_kwargs = dict(cfg.DATASET.VAL)
    dataset_kwargs.pop('start', None)
    dataset_kwargs.pop('end', None)
    if cfg.DATASET.NAME == 'FallingDigit':
        from space.datasets.falling_digit import FallingDigit
        if args.data_path is not None:
            dataset_kwargs['path'] = args.data_path
        dataset = FallingDigit(to_tensor=True, **dataset_kwargs)
    else:
        raise ValueError('Unsupported dataset: {}.'.format(cfg.DATASET.NAME))

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=1,
    )

    predictions = []
    if vis_first_n is not None:
        vis_dir = osp.join(output_dir, 'vis')
        os.makedirs(vis_dir, exist_ok=True)
    else:
        vis_dir = None

    # ---------------------------------------------------------------------------- #
    # Inference
    # ---------------------------------------------------------------------------- #
    for batch_idx, data_batch in enumerate(dataloader):
        # Copy data from cpu to gpu.
        data_batch = {
            k: v.cuda(non_blocking=True)
            for k, v in data_batch.items()
        }
        # Forward (no gradients needed at inference).
        with torch.no_grad():
            preds = model(data_batch, fast=True)
            boxes = preds['boxes'].cpu().numpy()  # (b, A * h1 * w1, 4)
            z_pres_p = preds['z_pres_p'].cpu().numpy()  # (b, A * h1 * w1)

        for sample_idx, boxes_per_image in enumerate(boxes):
            boxes_per_image = boxes_per_image.reshape(-1, 4)
            scores_per_image = z_pres_p[sample_idx]
            predictions_per_image = {}
            # Keep only detections above the threshold (or everything when
            # no threshold is given).
            if det_thresh is not None:
                valid_mask = scores_per_image >= det_thresh
            else:
                valid_mask = np.ones_like(scores_per_image, dtype=bool)
            predictions_per_image['boxes'] = boxes_per_image[valid_mask]
            predictions_per_image['scores'] = scores_per_image[valid_mask]
            predictions.append(predictions_per_image)

            # Visualize the first N images (vis_first_n == -1 means all).
            data_index = batch_idx * batch_size + sample_idx
            if vis_first_n is not None and (vis_first_n == -1 or data_index < vis_first_n):
                data = dataset.data[data_index]
                if 'image' in data:
                    image = data['image']
                else:
                    image = data['original_image']
                vis_path = osp.join(vis_dir, '{:06d}.png'.format(data_index))
                # vis_path = None
                vis_mask = scores_per_image >= vis_thresh
                plot_results(image,
                             boxes_per_image[vis_mask],
                             labels=[
                                 '{:.2f}'.format(x) for x in scores_per_image[vis_mask]
                             ],
                             save_path=vis_path)

        if args.log_period > 0 and batch_idx % args.log_period == 0:
            print(batch_idx, '/', len(dataloader))

    # Save proposals.
    output_filename = args.output_filename
    if output_filename is None:
        output_filename = 'proposals_' + osp.basename(dataset.path)
    # NOTE(review): saves into args.output_dir, while checkpoints/vis use the
    # cfg-derived output_dir — confirm this split is intentional.
    with open(osp.join(args.output_dir, output_filename), 'wb') as f:
        pickle.dump(predictions, f, protocol=pickle.HIGHEST_PROTOCOL)