def single_gpu_test(model, dataset, show=False):
    """Run inference over ``dataset`` on a single GPU and evaluate the results.

    Args:
        model: Detector; called as ``model(data_batch, training=False)`` and
            expected to return a dict with 'bboxes', 'labels' and 'scores'.
        dataset: Dataset the loader is built from; also passed to ``evaluate``.
        show (bool): Unused here; kept for interface compatibility.

    Returns:
        list: One entry per image, as produced by ``transforms.bbox2result``.
    """
    # Create a single-process, batch-size-1 loader for this evaluation run.
    tf_dataset, num_examples = build_dataloader(dataset, 1, 1, num_gpus=1,
                                                dist=False)
    results = []
    start = time.time()
    for i, data_batch in enumerate(tf_dataset):
        # The loader repeats/pads; stop after exactly one pass over the set.
        if i >= num_examples:
            break
        _, img_meta = data_batch
        outputs = model(data_batch, training=False)
        # Map predicted boxes back to the original (pre-resize) image scale.
        bboxes = transforms.bbox_mapping_back(outputs['bboxes'], img_meta)
        labels = outputs['labels']
        scores = outputs['scores']
        # 81 = 80 COCO classes + background. NOTE(review): consider deriving
        # this from the dataset instead of hard-coding — confirm with callers.
        result = transforms.bbox2result(bboxes, labels, scores, num_classes=81)
        results.append(result)
    print("Forward pass through test set took {}s".format(time.time() - start))
    evaluate(dataset, results)
    return results
def main(cfg):
    """Build data, model and runner from ``cfg``, register hooks, and train.

    Expects a config exposing data/model sections, batch sizing, hook
    intervals, output paths and ``training_epochs``. Runs under Horovod
    (``hvd.size()`` sets the data-parallel world size).
    """
    ######################################################################################
    # Create Training Data
    ######################################################################################
    datasets = build_dataset(cfg.data.train)
    tf_datasets = [
        build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)
    ]
    ######################################################################################
    # Build Model
    ######################################################################################
    model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
    # Pass example through so tensor shapes are defined
    model.CLASSES = datasets.CLASSES
    _ = model(next(iter(tf_datasets[0][0])))
    # Load backbone weights only (first nested layer of the model).
    # NOTE(review): assumes model.layers[0].layers[0] is the backbone — confirm.
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)
    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model,
                                     batch_processor,
                                     name=cfg.model_name,
                                     optimizer=cfg.optimizer,
                                     work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level),
                                     amp_enabled=cfg.fp16,
                                     loss_weights=cfg.loss_weights)
    # NOTE(review): `time()` called as a function here, while other blocks use
    # `time.time()` — verify which form the file's imports actually provide.
    runner.timestamp = int(time())
    ######################################################################################
    # Setup Training Hooks
    ######################################################################################
    runner.register_hook(
        checkpoint.CheckpointHook(interval=cfg.checkpoint_interval,
                                  out_dir=cfg.outputs_path,
                                  s3_dir=None))
    runner.register_hook(
        CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval))
    runner.register_hook(iter_timer.IterTimerHook())
    runner.register_hook(text.TextLoggerHook())
    runner.register_hook(
        visualizer.Visualizer(cfg.data.val, interval=100, top_k=10))
    runner.register_hook(
        tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path,
                                          interval=10,
                                          image_interval=100,
                                          s3_dir=None))
    ######################################################################################
    # Run Model
    ######################################################################################
    runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)
def after_train_epoch(self, runner):
    """Distributed evaluation hook: every ``self.interval`` epochs, run
    inference over the validation set on each local rank and hand the
    rank-strided results to ``self._accumulate_results``.

    Each rank processes the same loader; result slot ``i * local_size +
    local_rank`` keeps per-rank outputs disjoint. Non-zero ranks persist
    their results to a temp pickle; a Horovod barrier synchronises before
    accumulation.
    """
    if not self.every_n_epochs(runner, self.interval):
        return
    # create a loader for this runner
    tf_dataset, num_examples = build_dataloader(self.dataset,
                                                1,
                                                1,
                                                num_gpus=runner.local_size,
                                                dist=True)
    # num_examples=8
    results = [None for _ in range(num_examples * runner.local_size)
               ]  # REVISIT - may require a lot of memory
    #if runner.model.mask:
    if self.dataset.mask:
        # NOTE(review): `masks` is never written below — per-example masks are
        # stored in `results` tuples instead; looks like dead code. Confirm.
        masks = [None for _ in range(num_examples * runner.local_size)]
    if runner.rank == 0:
        prog_bar = ProgressBar(num_examples)
    for i, data_batch in enumerate(tf_dataset):
        if i >= num_examples:
            break
        _, img_meta = data_batch
        outputs = runner.model(data_batch, training=False)
        assert isinstance(outputs, dict)
        bboxes = outputs['bboxes']
        # map boxes back to original scale
        bboxes = transforms.bbox_mapping_back(bboxes, img_meta)
        labels = outputs['labels']
        scores = outputs['scores']
        # NOTE(review): here `self.dataset.CLASSES` is used as an integer count.
        result = transforms.bbox2result(
            bboxes, labels, scores,
            num_classes=self.dataset.CLASSES + 1)  # add background class
        #if runner.model.mask:
        if self.dataset.mask:
            mask = mask2result(outputs['masks'], labels, img_meta[0])
            results[i * runner.local_size + runner.local_rank] = (result, mask)
        else:
            results[i * runner.local_size + runner.local_rank] = result
        if runner.rank == 0:
            prog_bar.update()
    # write to a file
    tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(runner.rank))
    if runner.rank != 0:
        # Non-zero ranks dump for rank 0 to collect; presumably read inside
        # _accumulate_results — verify there.
        dump(results, tmp_file)
        # open(tmp_file+'.done', 'w').close()
    # MPI barrier through horovod
    _ = get_barrier()
    self._accumulate_results(runner, results, num_examples)
def _dist_train(model, dataset, cfg, num_gpus=1, mixed_precision=False, validate=False, logger=None, timestamp=None):
    """Distributed training entry point.

    Builds one loader per dataset, optionally wraps the optimizer with the
    AMP graph rewrite, wires the standard training hooks (plus distributed
    COCO mAP evaluation when ``validate`` is set) and launches the runner
    for ``cfg.total_epochs`` epochs.
    """
    # Accept either a single dataset or a sequence of them.
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]
    tf_datasets = [
        build_dataloader(d, cfg.data.imgs_per_gpu, 1, num_gpus=num_gpus,
                         dist=True) for d in dataset
    ]

    # Optimizer, optionally rewritten for automatic mixed precision.
    optimizer = build_optimizer(cfg.optimizer)
    if mixed_precision:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision
    gradient_clip = optimizer_config.get('gradient_clip', 15.0)  # default is 15.0

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    amp_enabled=mixed_precision,
                    gradient_clip=gradient_clip)
    runner.timestamp = timestamp

    # Standard hooks: lr schedule, optimizer step, checkpointing, logging.
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # Optional distributed COCO evaluation hook.
    if validate:
        runner.register_hook(
            CocoDistEvalmAPHook(cfg.data.val, **cfg.get('evaluation', {})))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, num_gpus=1, mixed_precision=False, validate=False, logger=None, timestamp=None):
    """Distributed training entry point (variant: eval hook only on node 0).

    Builds one loader per dataset, optionally applies the AMP graph rewrite
    to the optimizer, registers training hooks, and — on the first node
    only — a distributed COCO mAP evaluation hook, then runs the runner.
    """
    # Accept either a single dataset or a sequence of them.
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]
    tf_datasets = [
        build_dataloader(d, cfg.data.imgs_per_gpu, 1, num_gpus=num_gpus,
                         dist=True) for d in dataset
    ]

    optimizer = build_optimizer(cfg.optimizer)
    if mixed_precision:
        # broken in TF 2.1
        # optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale='dynamic')

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    amp_enabled=mixed_precision)
    # workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register this dist eval hook only for Node 0
    if validate and runner.rank < runner.local_size:
        runner.register_hook(
            CocoDistEvalmAPHook(cfg.data.val, **cfg.get('evaluation', {})))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
def __init__(self, dataset_cfg, interval=1000, threshold=0.75, figsize=(8, 8), top_k=10):
    """Build the validation dataset and an endless shuffled iterator over it,
    then store the display/visualisation settings.

    Args:
        dataset_cfg: Config the dataset is built from; its ``mean`` field
            is kept for de-normalising images.
        interval (int): How often (in iterations) the hook fires.
        threshold (float): Score threshold for display.
        figsize (tuple): Matplotlib figure size.
        top_k (int): Number of top detections to show.
    """
    self.dataset = datasets.build_dataset(dataset_cfg)
    loader, n_examples = datasets.build_dataloader(self.dataset, 1, 1,
                                                   num_gpus=1, dist=False)
    self.num_examples = n_examples
    # Endless shuffled stream so the hook can sample at any interval.
    self.tf_dataset = iter(loader.prefetch(16).shuffle(4).repeat())
    self.interval = interval
    self.img_mean = dataset_cfg.mean
    self.threshold = threshold
    self.figsize = figsize
    self.top_k = top_k
    self.threads = ThreadPoolExecutor()
def _non_dist_train(model, dataset, cfg, mixed_precision=False, validate=False, logger=None, timestamp=None):
    """Single-process (non-distributed) training entry point.

    Validation is not supported here; use the distributed path or the
    standalone test/eval scripts.
    """
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')

    # Accept either a single dataset or a sequence of them.
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]
    tf_datasets = [
        build_dataloader(d, cfg.data.imgs_per_gpu, 1, dist=False)
        for d in dataset
    ]

    optimizer = build_optimizer(cfg.optimizer)
    # broken in TF2.1
    # if mixed_precision:
    #     optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 1024.0)  # "dynamic")

    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    logger=logger)
    # workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    optimizer_config = cfg.optimizer_config
    optimizer_config['amp_enabled'] = mixed_precision
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(tf_datasets, cfg.workflow, cfg.total_epochs)
def __init__(self, dataset_cfg, interval=1000, threshold=0.75, figsize=(8, 8), top_k=10, run_on_sagemaker=False):
    """Build the validation dataset and an endless shuffled iterator over it.

    Args:
        dataset_cfg: Config the dataset is built from; mutated in place when
            ``run_on_sagemaker`` is True (its 'dataset_dir' is rewritten).
        interval (int): How often (in iterations) the hook fires.
        threshold (float): Score threshold for display.
        figsize (tuple): Matplotlib figure size.
        top_k (int): Number of top detections to show.
        run_on_sagemaker (bool): Rewrite data paths from the SageMaker
            channel environment.

    Raises:
        RuntimeError: if ``run_on_sagemaker`` is True but SM_CHANNEL_COCO
            is not set in the environment.
    """
    if run_on_sagemaker:
        # update paths for SM
        import os, pathlib
        channel = os.getenv('SM_CHANNEL_COCO')
        if channel is None:
            # Fail fast with a clear message instead of the opaque
            # TypeError that pathlib.Path(None) would raise.
            raise RuntimeError(
                'run_on_sagemaker=True but the SM_CHANNEL_COCO environment '
                'variable is not set')
        data_root = pathlib.Path(channel).joinpath('coco').as_posix()
        dataset_cfg['dataset_dir'] = data_root
    self.dataset = datasets.build_dataset(dataset_cfg)
    self.tf_dataset, self.num_examples = datasets.build_dataloader(
        self.dataset, 1, 1, num_gpus=1, dist=False)
    # Endless shuffled stream so the hook can sample at any interval.
    self.tf_dataset = iter(
        self.tf_dataset.prefetch(16).shuffle(4).repeat())
    self.interval = interval
    self.img_mean = dataset_cfg.mean
    self.threshold = threshold
    self.figsize = figsize
    self.top_k = top_k
    self.threads = ThreadPoolExecutor()
def main(cfg): decompress_data(cfg) ###################################################################################### # Create Training Data ###################################################################################### cfg.global_batch_size = cfg.batch_size_per_device * hvd.size() cfg.steps_per_epoch = cfg.coco_images // cfg.global_batch_size datasets = build_dataset(cfg.data.train) tf_datasets = [ build_dataloader(datasets, cfg.batch_size_per_device, cfg.workers_per_gpu, num_gpus=hvd.size(), dist=True) ] ###################################################################################### # Build Model ###################################################################################### #update any hyperparams that we may have passed in via arguments if cfg.ls > 0.0: cfg.model['bbox_head']['label_smoothing'] = cfg.ls if cfg.use_rcnn_bn: cfg.model['bbox_head']['use_bn'] = cfg.use_rcnn_bn if cfg.use_conv: cfg.model['bbox_head']['use_conv'] = cfg.use_conv cfg.schedule = args.schedule model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) # Pass example through so tensor shapes are defined _ = model(next(iter(tf_datasets[0][0]))) model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False) ###################################################################################### # Build optimizer and associate scheduler ###################################################################################### # base learning rate is set for global batch size of 8, with linear scaling for larger batches base_learning_rate = cfg.base_learning_rate scaled_learning_rate = base_learning_rate * cfg.global_batch_size / 8 steps_per_epoch = cfg.steps_per_epoch if cfg.schedule == '1x': scheduler = tf.keras.optimizers.schedules.PiecewiseConstantDecay( [steps_per_epoch * 8, steps_per_epoch * 10], [ scaled_learning_rate, scaled_learning_rate * 0.1, scaled_learning_rate * 0.01 ]) elif cfg.schedule == 'cosine': scheduler = 
tf.keras.experimental.CosineDecayRestarts( initial_learning_rate=scaled_learning_rate, first_decay_steps=12 * steps_per_epoch, t_mul=1, m_mul=1) #0-1-13 else: raise NotImplementedError warmup_init_lr = 1.0 / cfg.warmup_init_lr_scale * scaled_learning_rate scheduler = WarmupScheduler(scheduler, warmup_init_lr, cfg.warmup_steps) # FIXME: currently hardcoded to SGD optimizer = tf.keras.optimizers.SGD(scheduler, momentum=0.9, nesterov=False) if cfg.fp16: optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( optimizer, loss_scale='dynamic') ###################################################################################### # Create Model Runner ###################################################################################### runner = sagemaker_runner.Runner(model, batch_processor, name=cfg.model_name, optimizer=optimizer, work_dir=cfg.work_dir, logger=get_root_logger(cfg.log_level), amp_enabled=cfg.fp16, loss_weights=cfg.loss_weights) runner.timestamp = int(time()) ###################################################################################### # Setup Training Hooks ###################################################################################### runner.register_hook( checkpoint.CheckpointHook(interval=cfg.checkpoint_interval, out_dir=cfg.outputs_path, s3_dir=cfg.s3_checkpoints, h5=True)) runner.register_hook( CocoDistEvalmAPHook(cfg.data.val, interval=cfg.evaluation_interval)) runner.register_hook(iter_timer.IterTimerHook()) runner.register_hook(text.TextLoggerHook()) runner.register_hook( visualizer.Visualizer(cfg.data.val, interval=100, top_k=10)) runner.register_hook( tensorboard.TensorboardLoggerHook(log_dir=cfg.outputs_path, image_interval=100, s3_dir=cfg.s3_tensorboard)) ###################################################################################### # Run Model ###################################################################################### runner.run(tf_datasets, cfg.workflow, cfg.training_epochs)