def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    all_predictions = all_gather(predictions_per_gpu)
    if not is_main_process():
        return
    # merge the list of dicts
    predictions = {}
    for p in all_predictions:
        predictions.update(p)
    return predictions
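# Usage sketch (names as above): every rank calls this with its local
# {image_id: result} dict; only the main process gets the merged dict back,
# while all other ranks receive None and should early-return
# (see do_evaluation below).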
def main(cfg, args):
    train_loader = build_data_loaders(cfg.DATASETS.TRAINS,
                                      transforms=cfg.INPUT.TRANSFORMS_TRAIN,
                                      is_train=True,
                                      distributed=args.distributed,
                                      batch_size=cfg.SOLVER.BATCH_SIZE,
                                      num_workers=cfg.DATALOADER.NUM_WORKERS)
    target_loader = build_data_loaders(cfg.DATASETS.TARGETS,
                                       transforms=cfg.INPUT.TRANSFORMS_TRAIN,
                                       is_train=True,
                                       distributed=args.distributed,
                                       batch_size=cfg.SOLVER.BATCH_SIZE,
                                       num_workers=cfg.DATALOADER.NUM_WORKERS)
    test_loaders = build_data_loaders(cfg.DATASETS.TESTS,
                                      transforms=cfg.INPUT.TRANSFORMS_TEST,
                                      is_train=False,
                                      distributed=args.distributed,
                                      num_workers=cfg.DATALOADER.NUM_WORKERS)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = build_detectors(cfg)
    model.to(device)
    # dis_model = Discriminator(cfg)
    # dis_model.to(device)

    model_without_ddp = model
    # dis_model_without_ddp = dis_model
    if args.distributed:
        model = DistributedDataParallel(convert_sync_batchnorm(model),
                                        device_ids=[args.gpu])
        # dis_model = DistributedDataParallel(dis_model, device_ids=[args.gpu])
        model_without_ddp = model.module
        # dis_model_without_ddp = dis_model.module

    # optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad],
    #                             cfg.SOLVER.LR, momentum=cfg.SOLVER.MOMENTUM,
    #                             weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    optimizer = torch.optim.Adam(
        [p for p in model.parameters() if p.requires_grad],
        cfg.SOLVER.LR, betas=(0.9, 0.999),
        weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    # dis_optimizer = torch.optim.Adam([p for p in dis_model.parameters() if p.requires_grad],
    #                                  cfg.SOLVER.LR, betas=(0.9, 0.999),
    #                                  weight_decay=cfg.SOLVER.WEIGHT_DECAY)

    schedulers = [
        torch.optim.lr_scheduler.MultiStepLR(optimizer, cfg.SOLVER.STEPS,
                                             gamma=cfg.SOLVER.GAMMA),
        # torch.optim.lr_scheduler.MultiStepLR(dis_optimizer, cfg.SOLVER.STEPS,
        #                                      gamma=cfg.SOLVER.GAMMA),
    ]

    current_epoch = -1
    if args.resume:
        print('Loading from {} ...'.format(args.resume))
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if 'current_epoch' in checkpoint:
            current_epoch = int(checkpoint['current_epoch'])
        # if 'discriminator' in checkpoint:
        #     dis_model_without_ddp.load_state_dict(checkpoint['discriminator'])

    work_dir = cfg.WORK_DIR
    if args.test_only:
        evaluation(model, test_loaders, device, types=cfg.TEST.EVAL_TYPES,
                   output_dir=work_dir)
        return

    losses_writer = None
    if dist_utils.is_main_process():
        losses_writer = SummaryWriter(os.path.join(work_dir, 'losses'))
        losses_writer.add_text('config', '{}'.format(str(cfg).replace('\n', ' \n')))
        losses_writer.add_text('args', str(args))

    metrics_writers = {}
    if dist_utils.is_main_process():
        test_dataset_names = [loader.dataset.dataset_name for loader in test_loaders]
        for dataset_name in test_dataset_names:
            metrics_writers[dataset_name] = SummaryWriter(
                os.path.join(work_dir, 'metrics', dataset_name))

    start_time = time.time()
    epochs = cfg.SOLVER.EPOCHS
    global total_steps
    start_epoch = current_epoch + 1
    total_steps = (epochs - start_epoch) * len(train_loader)
    print("Start training, total epochs: {} ({} - {}), total steps: {}".format(
        epochs - start_epoch, start_epoch, epochs - 1, total_steps))

    for epoch in range(start_epoch, epochs):
        if args.distributed:
            train_loader.batch_sampler.sampler.set_epoch(epoch)
            target_loader.batch_sampler.sampler.set_epoch(epoch)

        def test_func():
            global best_mAP
            updated = False
            metrics = evaluation(model, test_loaders, device, cfg.TEST.EVAL_TYPES,
                                 output_dir=work_dir, iteration=global_step)
            if dist_utils.is_main_process() and losses_writer:
                for dataset_name, metric in metrics.items():
                    for k, v in metric.items():
                        metrics_writers[dataset_name].add_scalar(
                            'metrics/' + k, v, global_step=global_step)
                        # if k == 'mAP' and v > best_mAP:
                        if k == 'AP' and v > best_mAP:
                            best_mAP = v
                            updated = True
            model.train()
            return updated

        def save_func(filename=None, save_str=None):
            state_dict = {
                'model': model_without_ddp.state_dict(),
                # 'discriminator': dis_model_without_ddp.state_dict(),
                'current_epoch': epoch,
            }
            filename = filename if filename else 'model_epoch_{:02d}.pth'.format(epoch)
            save_path = os.path.join(work_dir, filename)
            dist_utils.save_on_master(state_dict, save_path)
            if dist_utils.is_main_process() and save_str is not None:
                with open(os.path.join(work_dir, 'best.txt'), 'w') as f:
                    f.write(save_str)
            print('Saved to {}'.format(save_path))

        epoch_start = time.time()
        train_one_epoch(model, optimizer, train_loader, target_loader, device, epoch,
                        dis_model=None, dis_optimizer=None, writer=losses_writer,
                        test_func=test_func, save_func=save_func)
        for scheduler in schedulers:
            scheduler.step()
        save_func()
        if epoch == (epochs - 1):
            test_func()

        epoch_cost = time.time() - epoch_start
        left = epochs - epoch - 1
        print('Epoch {} ended, cost {}. Left {} epochs, may cost {}'.format(
            epoch, str(datetime.timedelta(seconds=int(epoch_cost))),
            left, str(datetime.timedelta(seconds=int(left * epoch_cost)))))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Total training time {}'.format(total_time_str))
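# Note: the closures above rely on module-level state that sits outside this
# excerpt. A minimal sketch of the assumed globals (names taken from the code,
# initial values assumed):
#
# best_mAP = 0.0    # best test AP so far, updated inside test_func
# global_step = 0   # per-iteration counter, assumed to be advanced by train_one_epoch
# total_steps = 0   # reassigned in main() before the epoch loop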
                        type=int, help='number of distributed processes')
    parser.add_argument('--dist-url', default='env://',
                        help='url used to set up distributed training')
    parser.add_argument("opts", help="Modify config options using the command-line",
                        default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    dist_utils.init_distributed_mode(args)
    print(args)

    # Linear LR scaling against the 4-GPU baseline.
    world_size = dist_utils.get_world_size()
    if world_size != 4:
        lr = cfg.SOLVER.LR * (float(world_size) / 4)
        print('Change lr from {} to {}'.format(cfg.SOLVER.LR, lr))
        cfg.merge_from_list(['SOLVER.LR', lr])
    # Freeze only after the world-size-dependent LR adjustment; mutating a
    # frozen config is relying on yacs implementation details.
    cfg.freeze()
    print(cfg)

    os.makedirs(cfg.WORK_DIR, exist_ok=True)
    if dist_utils.is_main_process():
        with open(os.path.join(cfg.WORK_DIR, 'config.yaml'), 'w') as fid:
            fid.write(str(cfg))

    main(cfg, args)
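# Example launch, assuming dist_utils follows the torchvision reference
# implementation (reads RANK/WORLD_SIZE/LOCAL_RANK from the environment) and
# that the script defines a --config-file argument as used above; the file
# name train_da.py and the config path are hypothetical:
#
#   python -m torch.distributed.launch --nproc_per_node=4 --use_env \
#       train_da.py --config-file configs/da_faster_rcnn.yaml \
#       SOLVER.BATCH_SIZE 2
#
# The trailing key-value pairs are consumed by the `opts` REMAINDER argument
# and merged via cfg.merge_from_list.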
def main(cfg, args):
    train_loader = build_data_loaders(cfg.DATASETS.TRAINS,
                                      transforms=cfg.INPUT.TRANSFORMS_TRAIN,
                                      is_train=True,
                                      distributed=args.distributed,
                                      batch_size=cfg.SOLVER.BATCH_SIZE,
                                      num_workers=cfg.DATALOADER.NUM_WORKERS)
    test_loaders = build_data_loaders(cfg.DATASETS.TESTS,
                                      transforms=cfg.INPUT.TRANSFORMS_TEST,
                                      is_train=False,
                                      distributed=args.distributed,
                                      num_workers=cfg.DATALOADER.NUM_WORKERS)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = build_detectors(cfg)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    # optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad],
    #                             cfg.SOLVER.LR, momentum=cfg.SOLVER.MOMENTUM,
    #                             weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    optimizer = torch.optim.Adam(
        [p for p in model.parameters() if p.requires_grad],
        cfg.SOLVER.LR, betas=(0.9, 0.999),
        weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, cfg.SOLVER.STEPS,
                                                     gamma=cfg.SOLVER.GAMMA)

    if args.resume:
        print('Loading from {} ...'.format(args.resume))
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])

    work_dir = cfg.WORK_DIR
    if args.test_only:
        evaluation(model, test_loaders, device, types=cfg.TEST.EVAL_TYPES,
                   output_dir=work_dir)
        return

    losses_writer = None
    if dist_utils.is_main_process():
        losses_writer = SummaryWriter(os.path.join(work_dir, 'losses'))
        losses_writer.add_text('config', '{}'.format(str(cfg).replace('\n', ' \n')))
        losses_writer.add_text('args', str(args))

    metrics_writers = {}
    if dist_utils.is_main_process():
        test_dataset_names = [loader.dataset.dataset_name for loader in test_loaders]
        for dataset_name in test_dataset_names:
            metrics_writers[dataset_name] = SummaryWriter(
                os.path.join(work_dir, 'metrics', dataset_name))

    print("Start training")
    start_time = time.time()
    epochs = cfg.SOLVER.EPOCHS
    for epoch in range(epochs):
        if args.distributed:
            train_loader.batch_sampler.sampler.set_epoch(epoch)
        epoch_start = time.time()
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        writer=losses_writer)
        scheduler.step()

        state_dict = {'model': model_without_ddp.state_dict(), 'args': args}
        save_path = os.path.join(work_dir, 'model_epoch_{:02d}.pth'.format(epoch))
        dist_utils.save_on_master(state_dict, save_path)
        print('Saved to {}.'.format(save_path))

        metrics = evaluation(model, test_loaders, device, cfg.TEST.EVAL_TYPES,
                             output_dir=work_dir, iteration=epoch)
        if dist_utils.is_main_process() and losses_writer:
            for dataset_name, metric in metrics.items():
                for k, v in metric.items():
                    # `epoch` is the step axis here: unlike the domain-adaptation
                    # script, this one has no per-iteration global_step.
                    metrics_writers[dataset_name].add_scalar('metrics/' + k, v,
                                                             global_step=epoch)

        epoch_cost = time.time() - epoch_start
        left = epochs - epoch - 1
        print('Epoch {} ended, cost {}. Left {} epochs, may cost {}'.format(
            epoch, str(datetime.timedelta(seconds=int(epoch_cost))),
            left, str(datetime.timedelta(seconds=int(left * epoch_cost)))))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Total training time {}'.format(total_time_str))
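# For reference, MultiStepLR multiplies the LR by `gamma` at each milestone
# epoch. A worked example with assumed values cfg.SOLVER.STEPS = (8, 11) and
# cfg.SOLVER.GAMMA = 0.1: epochs 0-7 run at LR, epochs 8-10 at 0.1 * LR, and
# epoch 11 onward at 0.01 * LR.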
def do_evaluation(model, data_loader, device, types, output_dir, iteration=None, viz=False):
    model.eval()
    metric_logger = utils.MetricLogger(delimiter=" ")
    dataset = data_loader.dataset
    header = 'Testing {}:'.format(dataset.dataset_name)
    results_dict = {}
    has_mask = False
    for images, img_metas, targets in metric_logger.log_every(data_loader, 10, header):
        assert len(targets) == 1
        images = images.to(device)

        model_time = time.time()
        det = model(images, img_metas)[0]
        boxes, scores, labels = det['boxes'], det['scores'], det['labels']
        model_time = time.time() - model_time

        img_meta = img_metas[0]
        scale_factor = img_meta['scale_factor']
        img_info = img_meta['img_info']

        if viz:
            import matplotlib.pyplot as plt
            import matplotlib.patches as patches
            plt.switch_backend('TKAgg')
            image = de_normalize(images[0], img_meta)
            plt.subplot(122)
            plt.imshow(image)
            plt.title('Predict')
            for i, ((x1, y1, x2, y2), label) in enumerate(zip(boxes.tolist(), labels.tolist())):
                if scores[i] > 0.65:
                    rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                             facecolor='none', edgecolor='g')
                    category_id = dataset.label2cat[label]
                    plt.text(x1, y1,
                             '{}:{:.2f}'.format(dataset.CLASSES[category_id], scores[i]),
                             color='r')
                    plt.gca().add_patch(rect)

            plt.subplot(121)
            plt.imshow(image)
            plt.title('GT')
            for i, ((x1, y1, x2, y2), label) in enumerate(zip(targets[0]['boxes'].tolist(),
                                                              targets[0]['labels'].tolist())):
                category_id = dataset.label2cat[label]
                rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                         facecolor='none', edgecolor='g')
                plt.text(x1, y1, '{}'.format(dataset.CLASSES[category_id]))
                plt.gca().add_patch(rect)
            plt.show()

        boxes /= scale_factor
        result = {}

        if 'masks' in det:
            has_mask = True
            (w, h) = img_meta['origin_img_shape']
            masks = paste_masks_in_image(det['masks'], boxes, (h, w))
            rles = []
            for mask in masks.cpu().numpy():
                mask = mask >= 0.5
                mask = mask_util.encode(np.array(mask[0][:, :, None], order='F', dtype='uint8'))[0]
                # "counts" is an array encoded by mask_util as a byte-stream. Python3's
                # json writer which always produces strings cannot serialize a bytestream
                # unless you decode it. Thankfully, utf-8 works out (which is also what
                # the pycocotools/_mask.pyx does).
                mask['counts'] = mask['counts'].decode('utf-8')
                rles.append(mask)
            result['masks'] = rles

        boxes = boxes.tolist()
        labels = labels.tolist()
        labels = [dataset.label2cat[label] for label in labels]
        scores = scores.tolist()

        result['boxes'] = boxes
        result['scores'] = scores
        result['labels'] = labels
        # save_visualization(dataset, img_meta, result, output_dir)

        results_dict.update({
            img_info['id']: result
        })
        metric_logger.update(model_time=model_time)

    if get_world_size() > 1:
        dist.barrier()

    predictions = _accumulate_predictions_from_multiple_gpus(results_dict)
    if not is_main_process():
        return {}

    results = {}
    if has_mask:
        result = coco_evaluation(dataset, predictions, output_dir, iteration=iteration)
        results.update(result)
    if 'voc' in types:
        result = voc_evaluation(dataset, predictions, output_dir, iteration=iteration,
                                use_07_metric=False)
        results.update(result)
    return results
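# A minimal sketch of reading one stored mask back out of `predictions`, assuming
# pycocotools is installed; `predictions` is the dict returned above and `img_id`
# is a hypothetical key.
def decode_first_mask(predictions, img_id):
    import pycocotools.mask as mask_util
    rle = dict(predictions[img_id]['masks'][0])
    # 'counts' was decoded to a str above for JSON serialization; pycocotools
    # expects bytes again when decoding.
    rle['counts'] = rle['counts'].encode('utf-8')
    return mask_util.decode(rle)  # HxW uint8 binary mask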