def main():
    parser = argparse.ArgumentParser(
        description='SSD Evaluation on VOC and COCO dataset.')
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--ckpt",
        help="The path to the checkpoint for test, default is the latest checkpoint.",
        default=None,
        type=str,
    )
    parser.add_argument("--output_dir",
                        default="eval_results",
                        type=str,
                        help="The directory to store evaluation results.")
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if torch.cuda.is_available():
        # This flag allows you to enable the inbuilt cudnn auto-tuner to
        # find the best algorithm to use for your hardware.
        torch.backends.cudnn.benchmark = True
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR)
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    evaluation(cfg, ckpt=args.ckpt, distributed=distributed)
def train(cfg, args):
    # Set up the logger (factory pattern); not important for now.
    logger = logging.getLogger('SSD.trainer')
    # Build the detection model.
    model = build_detection_model(cfg)
    # Select the device and move the model onto it.
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    # Set the learning rate, optimizer and LR decay milestones; similar in
    # spirit to annealing: larger steps early in training, smaller steps later.
    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    # Resume training from the latest checkpoint if one exists.
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    # Important: the dataset is loaded through the torch DataLoader machinery.
    # The key point is how the data is loaded; model construction can be
    # treated as a black box here.
    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    # Training proper would start here; the call is commented out for now so
    # the data-loading path can be inspected without actually training.
    # model = do_train(cfg, model, train_loader, optimizer, scheduler,
    #                  checkpointer, device, arguments, args)
    return model
def train(cfg, args):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)
    return model
def train(cfg, args):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    # macs, params = profile(model, inputs=(input, ))
    # # macs, params = clever_format([flops, params], "%.3f")
    # net = model.to()

    # with torch.cuda.device(0):
    #     net = model.to(device)
    #     macs, params = get_model_complexity_info(net, (3, 512, 512), as_strings=True,
    #                                              print_per_layer_stat=True, verbose=True)
    #     print('{:<30} {:<8}'.format('Computational complexity: ', macs))
    #     print('{:<30} {:<8}'.format('Number of parameters: ', params))

    n_params = sum(p.numel() for name, p in model.named_parameters()
                   if p.requires_grad)
    print(n_params)

    # # model = net
    # inputs = torch.randn(1, 3, 300, 300)  # 8618 305
    # inputs = torch.randn(1, 3, 300, 300)
    # macs = profile_macs(model, inputs)
    # print(macs)

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)
    return model
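# The commented-out profiling calls above come from several third-party tools
# (thop's profile/clever_format, ptflops' get_model_complexity_info,
# torchprofile's profile_macs). As a minimal, self-contained sketch of the same
# idea -- counting trainable parameters and estimating MACs with thop -- the
# snippet below is an illustration only: the 300x300 input resolution is an
# assumption taken from cfg.INPUT.IMAGE_SIZE, and thop must be installed.
import torch
from thop import profile


def report_model_cost(model, image_size=300):
    # Count trainable parameters directly from the module.
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # Estimate multiply-accumulate operations for a single dummy image.
    dummy = torch.randn(1, 3, image_size, image_size)
    macs, _ = profile(model, inputs=(dummy, ), verbose=False)
    print("params: {:.2f}M, MACs: {:.2f}G".format(n_params / 1e6, macs / 1e9))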
def train(cfg, args):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load(args.ckpt)
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    logging.info('==>Start statistic')
    do_run(cfg, model, distributed=args.distributed)
    logging.info('==>End statistic')

    for ops in model.modules():
        if isinstance(ops, torch.nn.ReLU):
            ops.collectStats = False
            # ops.c.data = ops.running_mean + (ops.running_b * laplace[args.actBitwidth])
            ops.c.data = ops.running_mean + (3 * ops.running_std)
            ops.quant = True
    torch.cuda.empty_cache()

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)
    return model
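# The statistics pass above assumes the model's ReLUs are custom modules that
# expose collectStats / running_mean / running_std / c / quant and later clip
# activations at c = running_mean + 3 * running_std. The module below is only a
# sketch under that assumption (it subclasses nn.ReLU so the isinstance check
# above would still match); the actual quantization step is not shown.
import torch
import torch.nn as nn


class StatsReLU(nn.ReLU):
    def __init__(self, inplace=False):
        super().__init__(inplace)
        self.collectStats = True
        self.quant = False
        self.register_buffer("running_mean", torch.zeros(1))
        self.register_buffer("running_std", torch.zeros(1))
        self.register_buffer("c", torch.zeros(1))

    def forward(self, x):
        out = super().forward(x)
        if self.collectStats:
            # Track simple running statistics of the post-ReLU activations.
            self.running_mean.lerp_(out.mean().detach().view(1), 0.1)
            self.running_std.lerp_(out.std().detach().view(1), 0.1)
        elif self.quant:
            # Clip at c (set by the caller to running_mean + 3 * running_std).
            out = out.clamp(max=float(self.c))
        return out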
def train(cfg: CfgNode, args: Namespace, output_dir: Path,
          model_manager: Dict[str, Any],
          freeze_non_sigma: bool = False):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    resume_from = checkpointer.get_best_from_experiment_dir(cfg)
    extra_checkpoint_data = checkpointer.load(f=resume_from)
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    # Weight freezing test:
    # print_model(model)
    # freeze_weights(model)
    print_model(model)

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args, output_dir,
                     model_manager)
    return model
def train(cfg, args):
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)  # build the model
    # See how cfg is organized; the config file is kept separate from args.
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    # model = nn.DataParallel(model)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)  # build the optimizer

    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    # Build the checkpoint save/load helper; setting save_dir enables saving to disk.
    checkpointer = CheckPointer(model, optimizer, scheduler,
                                save_dir=cfg.OUTPUT_DIR,
                                save_to_disk=save_to_disk,
                                logger=logger)
    extra_checkpoint_data = checkpointer.load(f='', use_latest=False)  # load weights
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])  # build the data loader
    print("dataloader: ", train_loader.batch_size)
    # exit(1232)

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)  # train
    return model
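# make_optimizer is treated as a black box above. In SSD-style codebases it is
# usually plain SGD with momentum and weight decay read from the solver section
# of the config; the linear LR scaling by num_gpus happens in the caller. A
# sketch under that assumption (cfg.SOLVER.MOMENTUM and cfg.SOLVER.WEIGHT_DECAY
# are assumed to exist, as in the default configs):
import torch


def make_optimizer(cfg, model, lr=None):
    lr = cfg.SOLVER.LR if lr is None else lr
    return torch.optim.SGD(model.parameters(),
                           lr=lr,
                           momentum=cfg.SOLVER.MOMENTUM,
                           weight_decay=cfg.SOLVER.WEIGHT_DECAY)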
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer,
             device, arguments, args):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()

    # Collect the layers to be pruned.
    if cfg.PRUNE.TYPE != 'no':
        if hasattr(model, 'module'):
            backbone = model.module.backbone
        else:
            backbone = model.backbone
        if cfg.PRUNE.TYPE == 'normal':
            logger.info('normal sparse training')
            _, _, prune_idx = normal_prune.parse_module_defs(
                backbone.module_defs)
        elif cfg.PRUNE.TYPE == 'shortcut':
            logger.info('shortcut sparse training')
            _, _, prune_idx, _, _ = shortcut_prune.parse_module_defs(
                backbone.module_defs)

    model.train()
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        try:
            from torch.utils.tensorboard import SummaryWriter
        except ImportError:
            from tensorboardX import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = targets.to(device)
        loss_dict = model(images, targets=targets)
        loss = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        loss.backward()

        # Apply sparsity to the BN gamma parameters of the layers to be pruned.
        if cfg.PRUNE.TYPE != 'no':
            if hasattr(model, 'module'):
                bn_sparse.updateBN(model.module.backbone.module_list,
                                   cfg.PRUNE.SR, prune_idx)
            else:
                # print(model.backbone.module_list)
                bn_sparse.updateBN(model.backbone.module_list, cfg.PRUNE.SR,
                                   prune_idx)

        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                              1024.0),
                ))

            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss',
                                          losses_reduced,
                                          global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name),
                                              loss_item,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        if iteration % args.save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)

        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            # single-GPU evaluation
            eval_results = do_evaluation(cfg,
                                         model,
                                         distributed=False,
                                         iteration=iteration)
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results,
                                                cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset,
                                 summary_writer, iteration)
            model.train()  # *IMPORTANT*: change to train mode after eval.

    checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
def main():
    parser = argparse.ArgumentParser(
        description='Single Shot MultiBox Detector Training With PyTorch')
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Print logs every log_step')
    parser.add_argument('--save_step',
                        default=2500,
                        type=int,
                        help='Save checkpoint every save_step')
    parser.add_argument(
        '--eval_step',
        default=2500,
        type=int,
        help='Evaluate dataset every eval_step, disabled when eval_step < 0')
    parser.add_argument('--use_tensorboard', default=True, type=str2bool)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
    args.num_gpus = num_gpus

    if torch.cuda.is_available():
        # This flag allows you to enable the inbuilt cudnn auto-tuner to
        # find the best algorithm to use for your hardware.
        torch.backends.cudnn.benchmark = True
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # Train distance regression network
    train_distance_regr()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    if cfg.OUTPUT_DIR:
        mkdir(cfg.OUTPUT_DIR)

    logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR)
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args)

    if not args.skip_test:
        logger.info('Start evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        do_evaluation(cfg, model, distributed=args.distributed)
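# The distributed setup above expects the launcher to export WORLD_SIZE and
# pass --local_rank, e.g. `python -m torch.distributed.launch
# --nproc_per_node=N train.py --config-file ...`. synchronize() is the usual
# small barrier helper from this family of detection codebases; a sketch of it,
# assumed (not copied) to match what is imported here:
import torch.distributed as dist


def synchronize():
    # No-op when not running distributed.
    if not dist.is_available() or not dist.is_initialized():
        return
    if dist.get_world_size() == 1:
        return
    # Block until every process reaches this point.
    dist.barrier()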
def main():
    # Parse the command line and read the configuration file.
    '''
    The config file defines the model's basic hyper-parameters: the number of
    classes (20 classes plus background, hence 21), the input size (images are
    resized/padded to 300x300 so the originals are not distorted), the training
    splits (VOC 2007 + 2012 trainval) and the test split (VOC 2007 test), the
    maximum number of iterations (120000), the learning rate with its decay
    steps and gamma, and the output directory:

    MODEL:
      NUM_CLASSES: 21
    INPUT:
      IMAGE_SIZE: 300
    DATASETS:
      TRAIN: ("voc_2007_trainval", "voc_2012_trainval")
      TEST: ("voc_2007_test", )
    SOLVER:
      MAX_ITER: 120000
      LR_STEPS: [80000, 100000]
      GAMMA: 0.1
      BATCH_SIZE: 32
      LR: 1e-3

    OUTPUT_DIR: 'outputs/vgg_ssd300_voc0712'

    Returns:
    '''
    parser = argparse.ArgumentParser(
        description='Single Shot MultiBox Detector Training With PyTorch')
    parser.add_argument(
        "--config-file",
        default="configs/vgg_ssd300_voc0712.yaml",
        # default="configs/vgg_ssd300_visdrone0413.yaml",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    # Save a checkpoint and run evaluation every 2500 steps; print logs every
    # 10 steps. TensorBoard logging (via tensorboardX) can be turned off if the
    # records are not needed.
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Print logs every log_step')
    parser.add_argument('--save_step',
                        default=2500,
                        type=int,
                        help='Save checkpoint every save_step')
    parser.add_argument(
        '--eval_step',
        default=2500,
        type=int,
        help='Evaluate dataset every eval_step, disabled when eval_step < 0')
    parser.add_argument('--use_tensorboard', default=True, type=str2bool)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    # Parse the arguments; multi-GPU training is supported.
    args = parser.parse_args()
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
    args.num_gpus = num_gpus

    # Necessary checks before starting.
    if torch.cuda.is_available():
        # This flag allows you to enable the inbuilt cudnn auto-tuner to
        # find the best algorithm to use for your hardware.
        torch.backends.cudnn.benchmark = True
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Create the output directory.
    if cfg.OUTPUT_DIR:
        mkdir(cfg.OUTPUT_DIR)

    # Use a logger to record the run.
    logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR)
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # Log the loaded configuration file.
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # Train the model.
    model = train(cfg, args)

    # Run evaluation.
    if not args.skip_test:
        logger.info('Start evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        do_evaluation(cfg, model, distributed=args.distributed)
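# The YAML shown in the docstring above is merged through yacs: defaults are
# defined on a CfgNode, the experiment file overrides them, and trailing
# command-line "opts" pairs override both. A minimal sketch of that flow; the
# default values are simply the ones listed in the docstring, and only a few
# keys are shown:
from yacs.config import CfgNode as CN

cfg = CN()
cfg.MODEL = CN()
cfg.MODEL.NUM_CLASSES = 21
cfg.INPUT = CN()
cfg.INPUT.IMAGE_SIZE = 300
cfg.SOLVER = CN()
cfg.SOLVER.MAX_ITER = 120000
cfg.SOLVER.LR = 1e-3
cfg.OUTPUT_DIR = "outputs/vgg_ssd300_voc0712"

# cfg.merge_from_file("configs/vgg_ssd300_voc0712.yaml")              # experiment YAML
# cfg.merge_from_list(["SOLVER.LR", 0.002, "SOLVER.MAX_ITER", 60000])  # command-line opts
# cfg.freeze()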
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer,
             device, arguments, args):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()

    # Put the model into train() mode so its parameters can be updated.
    model.train()
    save_to_disk = dist_util.get_rank() == 0
    # TensorBoard logging of the training run.
    if args.use_tensorboard and save_to_disk:
        import tensorboardX
        summary_writer = tensorboardX.SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None

    # Length of the data loader; training runs for the number of iterations
    # given in the config. arguments = {"iteration": 0} holds the current
    # iteration count, which is how resuming from a checkpoint works.
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]

    # Start timing.
    start_training_time = time.time()
    end = time.time()
    # One step per batch, so the number of steps equals the length of the data
    # loader. The dataset returns images and their targets: (c+4)k values per
    # location, for k priors and c classes, plus 4 box coordinates.
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        # print(iteration)
        # print(targets)
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = targets.to(device)
        # Pass the inputs and targets to the model; it returns the losses.
        loss_dict = model(images, targets=targets)
        loss = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        # (multi-GPU only; can be ignored for single-GPU training)
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        # Standard backpropagation step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Bookkeeping: timing, logging, checkpointing and TensorBoard records.
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                              1024.0),
                ))

            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss',
                                          losses_reduced,
                                          global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name),
                                              loss_item,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        if iteration % args.save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)

        # Note: evaluation during training currently raises errors; the
        # evaluation code still has bugs.
        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg,
                                         model,
                                         distributed=args.distributed,
                                         iteration=iteration)
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results,
                                                cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset,
                                 summary_writer, iteration)
            model.train()  # *IMPORTANT*: change to train mode after eval.

    checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
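# reduce_loss_dict only matters in the multi-GPU case: it averages each loss
# term across processes so the values logged on rank 0 reflect the whole batch.
# A sketch of the conventional implementation (maskrcnn-benchmark style; this
# is an assumption, not copied from this repository):
import torch
import torch.distributed as dist


def reduce_loss_dict(loss_dict):
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())
        all_losses = torch.stack([loss_dict[k] for k in names], dim=0)
        # Sum across ranks, then average on rank 0 (the only rank that logs).
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            all_losses /= world_size
        return {k: v for k, v in zip(names, all_losses)}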
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer,
             device, arguments, args):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()

    model.train()
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        import tensorboardX
        summary_writer = tensorboardX.SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _, boxes_norm,
                    labels_norm) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration
        scheduler.step()

        images = images.to(device)
        targets = targets.to(device)

        # ------------------------------ Mask GT ------------------------------
        # Build an 81-channel 64x64 per-class mask target from the normalized
        # boxes and labels; channel 0 is background.
        mask_t = np.zeros((images.shape[0], 81, 64, 64))
        mask_t[:, 0, :, :] = np.ones((1, 1, 64, 64))
        for i in range(images.shape[0]):
            for L, B_norm in zip(labels_norm[i], boxes_norm[i]):
                xmin = int(B_norm[0] * 64)
                ymin = int(B_norm[1] * 64)
                xmax = int(B_norm[2] * 64)
                ymax = int(B_norm[3] * 64)
                lab = int(L)
                mask_t[i, 0, ymin:ymax, xmin:xmax] = 0.0
                mask_t[i, lab, ymin:ymax, xmin:xmax] = 1.0
        mask_t = Variable(torch.from_numpy(mask_t.astype(np.float32))).cuda()
        # ----------------------------------------------------------------------

        loss_dict = model(images, targets=(targets, mask_t))
        loss = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                              1024.0),
                ))

            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss',
                                          losses_reduced,
                                          global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name),
                                              loss_item,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        if iteration % args.save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)

        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg,
                                         model,
                                         distributed=args.distributed,
                                         iteration=iteration)
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results,
                                                cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset,
                                 summary_writer, iteration)
            model.train()  # *IMPORTANT*: change to train mode after eval.

    checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer,
             device, arguments, args):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()

    model.train()
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        import tensorboardX
        summary_writer = tensorboardX.SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()

    max_epoch = 10
    for epoch in range(max_epoch):
        logger.info('epoch: {}'.format(epoch))
        for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
            # print("imgs shape: ", images.shape, iteration)
            # continue
            # iteration = iteration + 1
            arguments["iteration"] = iteration
            scheduler.step()

            images = images.to(device)
            targets = targets.to(device)
            loss_dict = model(images, targets=targets)
            loss = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time = time.time() - end
            end = time.time()
            meters.update(time=batch_time)

            # log step
            if iteration % args.log_step == 0:
                eta_seconds = meters.time.global_avg * (max_iter - iteration)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                logger.info(
                    meters.delimiter.join([
                        "iter: {iter:06d}",
                        "lr: {lr:.5f}",
                        '{meters}',
                        "eta: {eta}",
                        'mem: {mem}M',
                    ]).format(
                        iter=iteration,
                        lr=optimizer.param_groups[0]['lr'],
                        meters=str(meters),
                        eta=eta_string,
                        mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                                  1024.0),
                    ))

                if summary_writer:
                    global_step = iteration
                    summary_writer.add_scalar('losses/total_loss',
                                              losses_reduced,
                                              global_step=global_step)
                    for loss_name, loss_item in loss_dict_reduced.items():
                        summary_writer.add_scalar(
                            'losses/{}'.format(loss_name),
                            loss_item,
                            global_step=global_step)
                    summary_writer.add_scalar('lr',
                                              optimizer.param_groups[0]['lr'],
                                              global_step=global_step)

            # save step
            if iteration % args.save_step == 0:
                checkpointer.save("model_{:06d}".format(iteration), **arguments)

            # eval step
            if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
                # if True:
                eval_results = do_evaluation(cfg,
                                             model,
                                             distributed=args.distributed,
                                             iteration=iteration)
                if dist_util.get_rank() == 0 and summary_writer:
                    for eval_result, dataset in zip(eval_results,
                                                    cfg.DATASETS.TEST):
                        write_metric(eval_result['metrics'],
                                     'metrics/' + dataset, summary_writer,
                                     iteration)
                model.train()  # *IMPORTANT*: change to train mode after eval.

    checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
def do_train_with_style(cfg, model, data_loader, style_loader, optimizer,
                        scheduler, checkpointer, device, arguments, args):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()

    model.train()
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        try:
            from torch.utils.tensorboard import SummaryWriter
        except ImportError:
            from tensorboardX import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()

    # prepare AdaIN models
    default_path = '/content/drive/MyDrive/DA_detection/models/'
    vgg_path = default_path + 'vgg_normalized.pth'
    if 'VGG_PATH' in os.environ:
        vgg_path = os.environ['VGG_PATH']
    decoder_path = default_path + 'decoder.pth'
    if 'DECODER_PATH' in os.environ:
        decoder_path = os.environ['DECODER_PATH']
    # DEBUG: print('AdaIN > models loaded')

    for iteration, (images, targets, ids) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration

        # AdaIN routine
        random.seed()
        styles = next(iter(style_loader))
        # DEBUG: print('AdaIN > begin new batch')
        if random.random() > args.p:
            apply_style_transfer(vgg_path, decoder_path, images, styles[0],
                                 args.p)
        # DEBUG: print('AdaIN > end batch')

        images = images.to(device)
        targets = targets.to(device)
        loss_dict = model(images, targets=targets)
        loss = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            if device == "cuda":
                logger.info(
                    meters.delimiter.join([
                        "iter: {iter:06d}",
                        "lr: {lr:.5f}",
                        '{meters}',
                        "eta: {eta}",
                        'mem: {mem}M',
                    ]).format(
                        iter=iteration,
                        lr=optimizer.param_groups[0]['lr'],
                        meters=str(meters),
                        eta=eta_string,
                        mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                                  1024.0),
                    ))
            else:
                logger.info(
                    meters.delimiter.join([
                        "iter: {iter:06d}",
                        "lr: {lr:.5f}",
                        '{meters}',
                        "eta: {eta}",
                    ]).format(
                        iter=iteration,
                        lr=optimizer.param_groups[0]['lr'],
                        meters=str(meters),
                        eta=eta_string,
                    ))

            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss',
                                          losses_reduced,
                                          global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name),
                                              loss_item,
                                              global_step=global_step)
                summary_writer.add_scalar('lr',
                                          optimizer.param_groups[0]['lr'],
                                          global_step=global_step)

        if iteration % args.save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)

        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg,
                                         model,
                                         distributed=args.distributed,
                                         iteration=iteration)
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results,
                                                cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset,
                                 summary_writer, iteration)
            model.train()  # *IMPORTANT*: change to train mode after eval.

    checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
def do_train(
    cfg: CfgNode,
    model: SSDDetector,
    data_loader: DataLoader,
    optimizer: SGD,
    scheduler: MultiStepLR,
    checkpointer,
    device: device,
    arguments,
    args: Namespace,
    output_dir: Path,
    model_manager: Dict[str, Any],
) -> SSDDetector:
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()

    model.train()
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        import tensorboardX

        summary_writer = tensorboardX.SummaryWriter(logdir=output_dir / "logs")
    else:
        summary_writer = None

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()
    logger.info("MAX_ITER: {}".format(max_iter))

    # GB: 2019-09-08:
    # For rescaling tests, do an eval before fine-tuning-training, so we know
    # what the eval results are before any weights are updated:
    # do_evaluation(
    #     cfg,
    #     model,
    #     distributed=args.distributed,
    #     iteration=0,
    # )
    # model.train()  # *IMPORTANT*: change to train mode after eval.

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        # TODO: Print learning rate:
        iteration = iteration + 1
        arguments["iteration"] = iteration
        scheduler.step()

        images = images.to(device)
        targets = targets.to(device)
        loss_dict = model(images, targets=targets)
        loss = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    "{meters}",
                    "eta: {eta}",
                    "mem: {mem}M",
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]["lr"],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 /
                              1024.0),
                ))

            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar("losses/total_loss",
                                          losses_reduced,
                                          global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar(
                        "losses/{}".format(loss_name),
                        loss_item,
                        global_step=global_step,
                    )
                summary_writer.add_scalar("lr",
                                          optimizer.param_groups[0]["lr"],
                                          global_step=global_step)

        # This project doesn't use epochs, it does something with batch
        # samplers instead, so there is only a concept of "iteration". For now
        # hardcode epoch as zero to put into the file name:
        epoch = 0
        save_name = f"ssd{cfg.INPUT.IMAGE_SIZE}-vgg_{cfg.DATASETS.TRAIN[0]}_0_{epoch}_{iteration:06d}"
        model_path = Path(output_dir) / f"{save_name}.pth"

        # Above if block would be replaced by this:
        if iteration % args.save_step == 0:
            checkpointer.save(save_name, **arguments)

        # Do eval while training, to trace the mAP changes and see whether the
        # performance improved or not.
        if (args.eval_step > 0 and iteration % args.eval_step == 0
                and not iteration == max_iter):
            eval_results = do_evaluation(
                cfg,
                model,
                distributed=args.distributed,
                iteration=iteration,
            )
            do_best_model_checkpointing(cfg, model_path, eval_results,
                                        model_manager, logger)
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results,
                                                cfg.DATASETS.TEST):
                    write_metric(
                        eval_result["metrics"],
                        "metrics/" + dataset,
                        summary_writer,
                        iteration,
                    )
            model.train()  # *IMPORTANT*: change to train mode after eval.

        if iteration % args.save_step == 0:
            remove_extra_checkpoints(output_dir, [model_path], logger)

    checkpointer.save("model_final", **arguments)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
def main():
    parser = ArgumentParser(
        description="Single Shot MultiBox Detector Training With PyTorch")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="config file name or path (relative to the configs/ folder)",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--log_step",
                        default=50,
                        type=int,
                        help="Print logs every log_step")
    parser.add_argument("--save_step",
                        default=5000,
                        type=int,
                        help="Save checkpoint every save_step")
    parser.add_argument(
        "--eval_step",
        default=5000,
        type=int,
        help="Evaluate dataset every eval_step, disabled when eval_step < 0",
    )
    parser.add_argument("--use_tensorboard", default=True, type=str2bool)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=REMAINDER,
    )
    parser.add_argument(
        "--resume_experiment",
        default="None",
        dest="resume",
        type=str,
        help="Checkpoint state_dict file to resume training from",
    )
    args = parser.parse_args()
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
    args.num_gpus = num_gpus

    if torch.cuda.is_available():
        # This flag allows you to enable the inbuilt cudnn auto-tuner to
        # find the best algorithm to use for your hardware.
        torch.backends.cudnn.benchmark = True
    else:
        cfg.MODEL.DEVICE = "cpu"
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    eman = ExperimentManager("ssd")
    output_dir = eman.get_output_dir()

    args.config_file = str(Path(__file__).parent / "configs" / args.config_file)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.OUTPUT_DIR = str(output_dir)
    cfg.freeze()

    eman.start({"cfg": cfg, "args": vars(args)})

    # We use our own output dir, set by ExperimentManager:
    # if cfg.OUTPUT_DIR:
    #     mkdir(cfg.OUTPUT_DIR)

    logger = setup_logger("SSD", dist_util.get_rank(), output_dir / "logs")
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))
    logger.info(f"Output dir: {output_dir}")

    model_manager = {"best": None, "new": None}
    model = train(cfg, args, output_dir, model_manager)

    if not args.skip_test:
        logger.info("Start evaluating...")
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        eval_results = do_evaluation(
            cfg,
            model,
            distributed=args.distributed,
        )
        do_best_model_checkpointing(
            cfg,
            output_dir / "model_final.pth",
            eval_results,
            model_manager,
            logger,
            is_final=True,
        )

    eman.mark_dir_if_complete()
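# str2bool is used as an argparse `type` above but not defined in this file; it
# is the usual tiny helper that maps strings like "true"/"false" to booleans so
# that `--use_tensorboard false` works as expected. A sketch of the
# conventional behavior (assumed, not copied from this repository):
import argparse


def str2bool(s):
    if isinstance(s, bool):
        return s
    if s.lower() in ("yes", "true", "t", "1"):
        return True
    if s.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")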