def train(cfg, local_rank, distributed, use_tensorboard=False, logger=None, start_iter=0):
    arguments = {"iteration": start_iter}
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    if cfg.SOLVER.UNFREEZE_CONV_BODY:
        for p in model.backbone.parameters():
            p.requires_grad = True

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer, start_iter)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk, logger=logger)
    print(cfg.TRAIN.IGNORE_LIST)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, ignore_list=cfg.TRAIN.IGNORE_LIST)
    arguments.update(extra_checkpoint_data)

    if cfg.SOLVER.KEEP_LR:
        optimizer = make_optimizer(cfg, model)
        scheduler = make_lr_scheduler(cfg, optimizer, start_iter)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    tensorboard_logdir = cfg.OUTPUT_DIR
    tensorboard_exp_name = cfg.TENSORBOARD_EXP_NAME
    snapshot = cfg.SOLVER.SNAPSHOT_ITERS

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        snapshot,
        tensorboard_logdir,
        tensorboard_exp_name,
        use_tensorboard=use_tensorboard,
    )

    return model

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    print(model)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = os.path.join(cfg.OUTPUT_DIR, cfg.FILE)

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader_train = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    data_loader_val = make_data_loader(
        cfg,
        is_train=False,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    val_period = cfg.SOLVER.VAL_PERIOD

    do_train(
        model,
        data_loader_train,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        val_period,
        arguments,
        distributed,
    )

    return model

def train(cfg, local_rank, distributed, tb_logger):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, resume=cfg.SOLVER.RESUME)
    if cfg.SOLVER.RESUME:
        arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        tb_logger,
        cfg,
        local_rank,
    )

    return model

def test(self, output_dir=None, model_to_test=None):
    if output_dir is not None:
        self.cfg.OUTPUT_DIR = output_dir
    model = build_detection_model(self.cfg)
    device = torch.device(self.cfg.MODEL.DEVICE)
    model.to(device)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = self.cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        self.cfg, model, None, None, output_dir, save_to_disk
    )
    if model_to_test is not None:
        self.cfg.MODEL.WEIGHT = model_to_test
    if self.cfg.MODEL.WEIGHT.startswith('/') or 'catalog' in self.cfg.MODEL.WEIGHT:
        model_path = self.cfg.MODEL.WEIGHT
    else:
        model_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir,
                         os.path.pardir, os.path.pardir, 'Data',
                         'pretrained_feature_extractors', self.cfg.MODEL.WEIGHT))
    extra_checkpoint_data = checkpointer.load(model_path, use_latest=False)

    checkpointer.optimizer = make_optimizer(self.cfg, checkpointer.model)
    checkpointer.scheduler = make_lr_scheduler(self.cfg, checkpointer.optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = self.cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(checkpointer.model, checkpointer.optimizer, opt_level=amp_opt_level)

    if self.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[self.local_rank], output_device=self.local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    synchronize()
    _ = inference(  # The result can be used for additional logging, e. g. for TensorBoard
        model,
        # The method changes the segmentation mask format in a data loader,
        # so every time a new data loader is created:
        make_data_loader(self.cfg, is_train=False, is_distributed=(get_world_size() > 1),
                         is_target_task=self.is_target_task),
        dataset_name="[Test]",
        iou_types=("bbox",),
        box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
        device=cfg.MODEL.DEVICE,
        expected_results=cfg.TEST.EXPECTED_RESULTS,
        expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
        output_folder=None,
        is_target_task=self.is_target_task,
    )
    synchronize()

    logger = logging.getLogger("maskrcnn_benchmark")
    logger.handlers = []

def __init__(
    self,
    cfg,
    confidence_threshold=0.7,
    show_mask_heatmaps=False,
    masks_per_dim=2,
    min_image_size=224,
):
    self.cfg = cfg.clone()
    self.model = build_detection_model(cfg)
    self.model.eval()
    self.device = torch.device(cfg.MODEL.DEVICE)
    self.model.to(self.device)
    self.min_image_size = min_image_size

    save_dir = cfg.OUTPUT_DIR
    optimizer = make_optimizer(cfg, self.model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    checkpointer = DetectronCheckpointer(cfg, self.model, optimizer=optimizer,
                                         scheduler=scheduler, save_dir=save_dir)
    # checkpointer = DetectronCheckpointer(cfg, self.model, save_dir=save_dir)
    _ = checkpointer.load(cfg.MODEL.WEIGHT)

    self.transforms = self.build_transform()

    mask_threshold = -1 if show_mask_heatmaps else 0.5
    self.masker = Masker(threshold=mask_threshold, padding=1)

    # used to make colors for each class
    self.palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])

    self.cpu_device = torch.device("cpu")
    self.confidence_threshold = confidence_threshold
    self.show_mask_heatmaps = show_mask_heatmaps
    self.masks_per_dim = masks_per_dim

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    summary_writer = SummaryWriter(log_dir=output_dir)

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    if cfg.MODEL.WEIGHT.upper() == 'CONTINUE':
        model_weight = last_checkpoint(output_dir)
    else:
        model_weight = cfg.MODEL.WEIGHT
    extra_checkpoint_data = checkpointer.load(model_weight)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)[0]

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model=model,
             data_loader=data_loader,
             data_loader_val=data_loader_val,
             optimizer=optimizer,
             scheduler=scheduler,
             checkpointer=checkpointer,
             device=device,
             checkpoint_period=checkpoint_period,
             arguments=arguments,
             summary_writer=summary_writer)

    return model

def __init__(self, exp_dict):
    super().__init__()
    cfg_base_path = "./models/configs/"
    self.n_classes = 21

    cfg_path = cfg_base_path + "e2e_mask_rcnn_R_50_FPN_1x.yaml"
    self.cfg = cfg
    self.cfg.merge_from_file(cfg_path)
    self.cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES = self.n_classes

    # ---------------
    # build model
    self.backbone_fpn = backbone.build_backbone(self.cfg)
    self.rpn = rpn.build_rpn(self.cfg, self.backbone_fpn.out_channels)
    self.roi_heads = roi_heads.build_roi_heads(self.cfg, self.backbone_fpn.out_channels)

    # ---------------
    # load checkpoint
    checkpoint = _load_file(self.cfg)
    load_state_dict(self, checkpoint.pop("model"))

    # --------
    # Opt stage
    self.cfg.SOLVER.BASE_LR = ((0.0025 * 8) / (16 / float(exp_dict["batch_size"])))
    optimizer = make_optimizer(self.cfg, self)
    scheduler = make_lr_scheduler(self.cfg, optimizer)

    self.opt = optimizer
    self.scheduler = scheduler

def comput_on_dataset_with_finetune(model, data_loader, device, cfg):
    results_dict = {}
    cpu_device = torch.device("cpu")
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    for i, batch in tqdm(enumerate(data_loader)):
        # assert len(batch[0]) == 1, "Finetune only supports batch size = 1"
        dataset = data_loader.dataset
        video_id = dataset.get_annotation_video_id(i)
        img_id = dataset.get_annotation_img_id(i)
        images, targets, image_ids = batch
        images = images.to(device)
        targets = [target.to(device) for target in targets]
        if img_id == "00000":
            logger = logging.getLogger("DAVIS_MaskRCNN_baseline_test")
            logger.info('-' * 50)
            logger.info("Finetune: {}".format(video_id))
            logger.info('-' * 50)
            model = finetune_first_image(model, images, targets, optimizer, scheduler, logger, cfg)
        model.eval()
        with torch.no_grad():
            output = model(images)
            output = [o.to(cpu_device) for o in output]
        results_dict.update(
            {img_id: result for img_id, result in zip(image_ids, output)}
        )
    return results_dict

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    if cfg.USE_TENSORBOARD_LOGS:
        meters = TensorboardLogger(
            log_dir=os.path.join(output_dir, 'tensorboard_logs'),
            start_iter=arguments['iteration'],
            delimiter="  ",
        )
    else:
        meters = MetricLogger(delimiter="  ")

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        meters,
    )

    return model

def train(cfg, local_rank, distributed):
    # Create the GeneralizedRCNN() object
    # detectors.py --> generalized_rcnn.py
    model = build_detection_model(cfg)
    # print(model)

    # 'cpu' or 'cuda'
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Wraps torch.optim.SGD(); the list of parameters to update is built from each tensor's requires_grad attribute
    optimizer = make_optimizer(cfg, model)
    # Set the optimizer's learning-rate schedule according to the config
    scheduler = make_lr_scheduler(cfg, optimizer)

    # In the distributed case, process data in parallel
    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    # Output directory, '.' by default
    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    # dict.update() merges the loaded checkpoint data into arguments
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    if checkpointer.classes is None:
        for ds in data_loader.dataset.datasets:
            ds.find_classes()
        checkpointer.classes = data_loader.dataset.datasets[0].class_to_ind
    else:
        print("Loading classes from file")
        print(checkpointer.classes)
        for ds in data_loader.dataset.datasets:
            ds.class_to_ind = checkpointer.classes

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model

def train(cfg, local_rank, distributed):
    # use following line to avoid shared file limit
    # torch.multiprocessing.set_sharing_strategy('file_system')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Convert Model for SyncBN
    if cfg.SYNCBN:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            # broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model

def train(cfg, local_rank, distributed):
    # ################################################################### fusion_factors # add by G
    if cfg.MODEL.FPN.STATISTICS_ALPHA_ON == True:
        sta_module = StaAlphaModule(cfg)
        fusion_factors = sta_module.process()
    else:
        fusion_factors = cfg.MODEL.FPN.FUSION_FACTORS
    # ################################################################### fusion_factors # add by G

    model = build_detection_model(cfg, fusion_factors)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model

def train(cfg, local_rank, distributed, use_tensorboard=False):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0
    if use_tensorboard:
        arguments["tb_log_dir"] = cfg.TENSORBOARD_LOGDIR
        arguments["tb_exp_name"] = cfg.TENSORBOARD_EXP_NAME

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    print(data_loader.dataset)
    for iteration, (images, targets, _) in enumerate(data_loader, 0):
        print(">>>>> train iteration:", iteration)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    # HACK: force the steps, could not change the lr from ckpt now.
    scheduler.milestones = cfg.SOLVER.STEPS
    # change lr
    # lr_ratio = cfg.SOLVER.BASE_LR / scheduler.base_lrs[-1]
    # scheduler.base_lrs = [base_lr * lr_ratio for base_lr in self.base_lrs]

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    tbwriter = SummaryWriter(cfg.OUTPUT_DIR)

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        tbwriter,
    )

    return model

def train(cfg, local_rank, distributed, experiment=None):
    if cfg.TASK.KIND not in kind_builder:
        raise Exception('unknown task: {0}'.format(cfg.TASK.KIND))

    model_builder = kind_builder[cfg.TASK.KIND]
    model = model_builder(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)

    # todo, make this relative?
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, experiment=experiment)

    return model

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    if cfg.MODEL.USE_SYNCBN:
        assert is_pytorch_1_1_0_or_later(), \
            "SyncBatchNorm is only available in pytorch >= 1.1.0"
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    print('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    print('bbbbbbbbbbbbbbbbbbbbb')

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, resume=cfg.SOLVER.RESUME)
    if cfg.SOLVER.RESUME:
        arguments.update(extra_checkpoint_data)
    print('ccccccccccccccccccccccccccc')

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    print('dddddddddddddddddddddddddddddd')

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    tb_logger = Logger(cfg.OUTPUT_DIR)

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        tb_logger,
        cfg,
    )
    print('eeeeeeeeeeeeeeeeeeeeeeeeeee')

    return model

def train(cfg, train_dir, local_rank, distributed, logger):
    # build model
    model = build_siammot(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            broadcast_buffers=False, find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, train_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = build_train_data_loader(
        cfg,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    tensorboard_writer = TensorboardWriter(cfg, train_dir)

    do_train(model, data_loader, optimizer, scheduler, checkpointer,
             device, checkpoint_period, arguments, logger, tensorboard_writer)

    return model

def train(cfg, local_rank, distributed, save_path='.', writer=None):
    # cfg.SOLVER.IMS_PER_BATCH = 3  # force it to 3
    model_s = build_detection_model(cfg, is_student=True)
    model_t = build_detection_model(cfg, is_teacher=True)
    device_t = torch.device('cuda:0')
    device_s = torch.device('cuda:0')
    model_s.to(device_s)
    model_t.to(device_t)

    optimizer = make_optimizer(cfg, model_s)
    scheduler = make_lr_scheduler(cfg, optimizer)

    output_dir = save_path
    save_to_disk = get_rank() == 0
    checkpointer_s = DetectronCheckpointer(cfg, model_s, optimizer, scheduler, output_dir, save_to_disk)
    checkpointer_t = DetectronCheckpointer(cfg, model_t, optimizer=None, scheduler=scheduler,
                                           save_dir=output_dir, save_to_disk=save_to_disk)

    _init_weight = 'e2e_mask_rcnn_R_50_FPN_1x.pth'
    _ = checkpointer_s.load(_init_weight, True)
    _ = checkpointer_t.load(_init_weight, True)

    sourceDataLoader = make_mt_data_loader(cfg, is_train=True, is_distributed=distributed,
                                           start_iter=0, mode='source', img_ratio=1 / 2)
    data_loader_dict = {
        'source': sourceDataLoader,
    }
    if cfg.DATASETS.NO_LABEL:
        noLabelDataLoader = make_mt_data_loader(cfg, is_train=True, is_distributed=distributed,
                                                start_iter=0, mode='no_label', img_ratio=1 / 2)
        data_loader_dict.update({'no_label': noLabelDataLoader})

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    trainer = MTtrainer(model_s, model_t, data_loader_dict, optimizer, scheduler,
                        checkpointer_s, checkpointer_t, checkpoint_period, cfg)
    trainer.train()

    return model_s

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.deprecated.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model

def load(self, f=None):
    # if self.has_checkpoint():
    #     # override argument with existing checkpoint
    #     f = self.get_checkpoint_file()
    if not f:
        # no checkpoint could be found
        self.logger.info("No checkpoint found. Initializing model from scratch")
        log_optimizer_scheduler_info(self.logger, self.optimizer, self.scheduler)
        return {}
    self.logger.info("Loading checkpoint from {}".format(f))
    checkpoint = self._load_file(f)
    self._load_model(checkpoint)

    if self.cfg.PRIORITY_CONFIG:
        temp_optimizer = make_optimizer(self.cfg, self.model)
        self.optimizer.load_state_dict(temp_optimizer.state_dict())
        for group in self.optimizer.param_groups:
            group.setdefault('initial_lr', group['lr'])

        iteration = checkpoint['iteration'] if 'iteration' in checkpoint else 0
        last_epoch = iteration - 1
        temp_scheduler = make_lr_scheduler(self.cfg, self.optimizer, last_epoch=last_epoch)
        self.scheduler.load_state_dict(temp_scheduler.state_dict())

        # remove processed stat data
        for stat_name in ["optimizer", "scheduler"]:
            if stat_name in checkpoint:
                checkpoint.pop(stat_name)
    else:
        if "optimizer" in checkpoint and self.optimizer:
            self.logger.info("Loading optimizer from {}".format(f))
            self.optimizer.load_state_dict(checkpoint.pop("optimizer"))
        if "scheduler" in checkpoint and self.scheduler:
            self.logger.info("Loading scheduler from {}".format(f))
            self.scheduler.load_state_dict(checkpoint.pop("scheduler"))

    if self.optimizer and self.scheduler:
        log_optimizer_scheduler_info(self.logger, self.optimizer, self.scheduler)

    # return any further checkpoint data
    return checkpoint

def train(cfg, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    warmup_layers = tuple(x for x in cfg.SOLVER.WARMUP_LAYERS if len(x) != 0)
    warmup_iters = cfg.SOLVER.WARMUP_ITERS

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        warmup_layers,
        warmup_iters,
    )

    return model

def fit(self, train_data, test_data, classes):
    self.classes = classes
    optimizer = make_optimizer(self.cfg, self.model)
    scheduler = make_lr_scheduler(self.cfg, optimizer)

    arguments = {}
    arguments["iteration"] = 0

    self.checkpointer.classes = classes
    self.checkpointer.optimizer = optimizer
    self.checkpointer.scheduler = scheduler

    train_data_loader = make_data_loader(
        self.cfg,
        train_data,
        classes,
        is_train=True,
        is_distributed=False,
        start_iter=arguments["iteration"],
    )
    test_data_loader = make_data_loader(
        self.cfg,
        test_data,
        classes,
        is_train=True,
        is_distributed=False,
        start_iter=arguments["iteration"],
    )

    self.train_meter, self.test_meter = do_train(
        self.model,
        train_data_loader,
        test_data_loader,
        optimizer,
        scheduler,
        self.device,
        arguments,
    )

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    # model_D = build_discriminator(cfg)
    model_D = model.get_discriminator()
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    # model_D.to(device)
    models = [model, model_D]

    optimizers = []
    for p in model_D.parameters():
        p.requires_grad = False
    optimizers.append(make_optimizer(cfg, model))
    for p in model_D.parameters():
        p.requires_grad = True
    optimizers.append(make_optimizer(cfg, model_D))

    schedulers = [
        make_lr_scheduler(cfg, optimizer) for optimizer in optimizers
    ]

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    models, optimizers = zip(*[
        amp.initialize(model, optimizer, opt_level=amp_opt_level)
        for model, optimizer in zip(models, optimizers)
    ])

    if distributed:
        models = [
            torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )
            for model in models
        ]

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointers = [
        DetectronCheckpointer(cfg, model, optimizers[0], schedulers[0], output_dir, save_to_disk),
        DetectronCheckpointer(cfg, model_D, optimizers[1], schedulers[1], output_dir, False),
    ]
    extra_checkpoint_data = [
        checkpointer.load(cfg.MODEL.WEIGHT) for checkpointer in checkpointers
    ]
    arguments.update(extra_checkpoint_data[0])

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    tflogger = SummaryWriter(log_dir=os.path.join(output_dir, "logs"))

    do_train(
        models,
        data_loader,
        optimizers,
        schedulers,
        checkpointers,
        device,
        checkpoint_period,
        arguments,
        tflogger,
    )

    return model

def train(self, output_dir=None, fine_tune_last_layers=False, fine_tune_rpn=False):
    if output_dir is not None:
        self.cfg.OUTPUT_DIR = output_dir
    model = build_detection_model(self.cfg)
    device = torch.device(self.cfg.MODEL.DEVICE)
    model.to(device)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = self.cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(self.cfg, model, None, None, output_dir, save_to_disk)
    if self.cfg.MODEL.WEIGHT.startswith('/') or 'catalog' in self.cfg.MODEL.WEIGHT:
        model_path = self.cfg.MODEL.WEIGHT
    else:
        model_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir,
                         os.path.pardir, os.path.pardir, 'Data',
                         'pretrained_feature_extractors', self.cfg.MODEL.WEIGHT))
    extra_checkpoint_data = checkpointer.load(model_path)

    if self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1 != self.cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES:
        checkpointer.model.roi_heads.box.predictor.cls_score = torch.nn.Linear(
            in_features=checkpointer.model.roi_heads.box.predictor.cls_score.in_features,
            out_features=self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1,
            bias=True)
        checkpointer.model.roi_heads.box.predictor.bbox_pred = torch.nn.Linear(
            in_features=checkpointer.model.roi_heads.box.predictor.cls_score.in_features,
            out_features=(self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1) * 4,
            bias=True)
        if hasattr(checkpointer.model.roi_heads, 'mask'):
            checkpointer.model.roi_heads.mask.predictor.mask_fcn_logits = torch.nn.Conv2d(
                in_channels=checkpointer.model.roi_heads.mask.predictor.mask_fcn_logits.in_channels,
                out_channels=self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1,
                kernel_size=(1, 1), stride=(1, 1))
    checkpointer.model.to(device)

    if fine_tune_last_layers:
        checkpointer.model.roi_heads.box.predictor.cls_score = torch.nn.Linear(
            in_features=checkpointer.model.roi_heads.box.predictor.cls_score.in_features,
            out_features=self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1,
            bias=True)
        checkpointer.model.roi_heads.box.predictor.bbox_pred = torch.nn.Linear(
            in_features=checkpointer.model.roi_heads.box.predictor.cls_score.in_features,
            out_features=(self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1) * 4,
            bias=True)
        if hasattr(checkpointer.model.roi_heads, 'mask'):
            checkpointer.model.roi_heads.mask.predictor.mask_fcn_logits = torch.nn.Conv2d(
                in_channels=checkpointer.model.roi_heads.mask.predictor.mask_fcn_logits.in_channels,
                out_channels=self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1,
                kernel_size=(1, 1), stride=(1, 1))

        # Freeze backbone layers
        for elem in checkpointer.model.backbone.parameters():
            elem.requires_grad = False
        if not fine_tune_rpn:
            # Freeze RPN layers
            for elem in checkpointer.model.rpn.parameters():
                elem.requires_grad = False
        else:
            for elem in checkpointer.model.rpn.head.conv.parameters():
                elem.requires_grad = False
            checkpointer.model.rpn.head.cls_logits = torch.nn.Conv2d(
                in_channels=checkpointer.model.rpn.head.cls_logits.in_channels,
                out_channels=checkpointer.model.rpn.head.cls_logits.out_channels,
                kernel_size=(1, 1), stride=(1, 1))
            checkpointer.model.rpn.head.bbox_pred = torch.nn.Conv2d(
                in_channels=checkpointer.model.rpn.head.bbox_pred.in_channels,
                out_channels=checkpointer.model.rpn.head.bbox_pred.out_channels,
                kernel_size=(1, 1), stride=(1, 1))

        # Freeze roi_heads layers with the exception of the predictor ones
        for elem in checkpointer.model.roi_heads.box.feature_extractor.parameters():
            elem.requires_grad = False
        for elem in checkpointer.model.roi_heads.box.predictor.parameters():
            elem.requires_grad = True
        if hasattr(checkpointer.model.roi_heads, 'mask'):
            for elem in checkpointer.model.roi_heads.mask.predictor.parameters():
                elem.requires_grad = False
            for elem in checkpointer.model.roi_heads.mask.predictor.mask_fcn_logits.parameters():
                elem.requires_grad = True
    checkpointer.model.to(device)

    checkpointer.optimizer = make_optimizer(self.cfg, checkpointer.model)
    checkpointer.scheduler = make_lr_scheduler(self.cfg, checkpointer.optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = self.cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(checkpointer.model, checkpointer.optimizer, opt_level=amp_opt_level)

    if self.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[self.local_rank], output_device=self.local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    data_loader = make_data_loader(self.cfg, is_train=True, is_distributed=self.distributed,
                                   start_iter=arguments["iteration"],
                                   is_target_task=self.is_target_task)
    test_period = self.cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(self.cfg, is_train=False,
                                           is_distributed=self.distributed,
                                           is_target_task=self.is_target_task)
    else:
        data_loader_val = None

    checkpoint_period = self.cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(self.cfg, model, data_loader, data_loader_val, checkpointer.optimizer,
             checkpointer.scheduler, checkpointer, device, checkpoint_period, test_period,
             arguments, is_target_task=self.is_target_task)

    logger = logging.getLogger("maskrcnn_benchmark")
    logger.handlers = []

def main():
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # model = train(cfg, args.local_rank, args.distributed)
    model = build_detection_model(cfg)

    # add
    print(model)
    all_index = []
    for index, item in enumerate(model.named_parameters()):
        all_index.append(index)
        print(index)
        print(item[0])
        print(item[1].size())
    print("All index of the model: ", all_index)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=args.distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # run_test(cfg, model, args.distributed)

    # pruning
    m = Mask(model)
    m.init_length()
    m.init_length()
    print("-" * 10 + "one epoch begin" + "-" * 10)
    print("remaining ratio of pruning : Norm is %f" % args.rate_norm)
    print("reducing ratio of pruning : Distance is %f" % args.rate_dist)
    print("total remaining ratio is %f" % (args.rate_norm - args.rate_dist))

    m.modelM = model
    m.init_mask(args.rate_norm, args.rate_dist)
    m.do_mask()
    m.do_similar_mask()
    model = m.modelM
    m.if_zero()

    # run_test(cfg, model, args.distributed)

    # changed to use a straightforward training loop to make it easy to apply Mask
    # do_train(
    #     model,
    #     data_loader,
    #     optimizer,
    #     scheduler,
    #     checkpointer,
    #     device,
    #     checkpoint_period,
    #     arguments,
    # )

    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        # print("Loss dict", loss_dict)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()

        # pruning: mask gradients for this iteration
        m.do_grad_mask()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        # pruning
        # 7375 is the number of iterations per epoch with batch size 16 and ~118K training images (COCO)
        if iteration % args.iter_pruned == 0 or iteration == cfg.SOLVER.MAX_ITER - 5000:
            m.modelM = model
            m.if_zero()
            m.init_mask(args.rate_norm, args.rate_dist)
            m.do_mask()
            m.do_similar_mask()
            m.if_zero()
            model = m.modelM
            if args.use_cuda:
                model = model.cuda()
            # run_test(cfg, model, args.distributed)

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))

    if not args.skip_test:
        run_test(cfg, model, args.distributed)

def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.deprecated.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    logger = logging.getLogger("Training")

    with tools.TimerBlock("Loading Experimental setups", logger) as block:
        exp_name = cfg.EXP.NAME
        output_dir = tools.get_exp_output_dir(exp_name, cfg.OUTPUT_DIR)
        checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
        validation_period = cfg.SOLVER.VALIDATION_PERIOD

    with tools.TimerBlock("Loading Checkpoints...", logger) as block:
        arguments = {}
        save_to_disk = local_rank == 0
        checkpointer = Checkpointer(
            model,
            save_dir=output_dir,
            save_to_disk=save_to_disk,
            num_class=cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES,
        )
        extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
        arguments.update(extra_checkpoint_data)
        arguments["iteration"] = 0

    with tools.TimerBlock("Initializing DAVIS Datasets", logger) as block:
        logger.info("Loading training set...")
        data_loader_train = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
        )
        logger.info("Loading valid set...")
        data_loaders_valid = make_data_loader(
            cfg,
            is_train=False,
            is_distributed=distributed,
        )

    do_train(
        model,
        data_loader_train,
        data_loaders_valid[0],
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        validation_period,
        arguments,
        exp_name,
    )

    return model

synchronize()

cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.MODEL.RPN.RNN.COMBINATION = 'attention_norm'
cfg.OUTPUT_DIR = 'stage3_attention_norm'
cfg.DATASETS.USE_ANNO_CACHE = False
cfg.freeze()

model = build_detection_model(cfg)
device = torch.device(cfg.MODEL.DEVICE)
model.to(device)

optimizer = make_optimizer(cfg, model)
scheduler = make_lr_scheduler(cfg, optimizer)

if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank], output_device=args.local_rank,
        # this should be removed if we update BatchNorm stats
        broadcast_buffers=False,
    )

arguments = {}
arguments["iteration"] = 0

output_dir = cfg.OUTPUT_DIR

def train(cfg, local_rank, distributed):
    model, head = build_dist_face_trainer(cfg, local_rank)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    if cfg.MODEL.USE_SYNCBN:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    if True:
        model = FaceDistributedDataParallel(
            model, device_ids=local_rank, output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            chunk_sizes=None,  # [32,56,56,56]
        )
        head_local_rank = None
        if len(local_rank) == 1:
            head_local_rank = local_rank
        head = FaceDistributedDataParallel(
            head, device_ids=head_local_rank, output_device=head_local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )
    model = torch.nn.Sequential(*[model, head])

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    # head_optimizer = make_optimizer(cfg, head)
    # head_scheduler = make_lr_scheduler(cfg, head_optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)
    # head, head_optimizer = amp.initialize(head, head_optimizer, opt_level=amp_opt_level)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk)
    # head_checkpointer = DetectronCheckpointer(
    #     cfg, head, head_optimizer, head_scheduler, output_dir, save_to_disk
    # )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    #### init transforms #####
    transforms = T.Compose([
        T.RandomCrop((cfg.INPUT.SIZE_TRAIN[0], cfg.INPUT.SIZE_TRAIN[1])),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=cfg.INPUT.RGB_MEAN, std=cfg.INPUT.RGB_STD),
    ])

    data_loader = make_face_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        transforms=transforms,
    )

    test_period = cfg.SOLVER.TEST_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    divs_nums = cfg.SOLVER.DIVS_NUMS_PER_BATCH

    do_face_train_dist_DIV_FC(
        cfg,
        model,        # [model, head]
        data_loader,
        None,
        optimizer,    # [optimizer, head_optimizer]
        scheduler,    # [scheduler, head_scheduler]
        checkpointer, # [checkpointer, head_checkpointer]
        device,
        checkpoint_period,
        test_period,
        arguments,
        divs_nums,
    )

    return model

def train(cfg, local_rank, distributed, loop, only_test, min_loss):
    ay = cfg.TEST.EVAL_AUG_THICKNESS_Y_TAR_ANC
    az = cfg.TEST.EVAL_AUG_THICKNESS_Z_TAR_ANC
    EVAL_AUG_THICKNESS = {
        'target_Y': ay[0],
        'anchor_Y': ay[1],
        'target_Z': az[0],
        'anchor_Z': az[1],
    }

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    roi_only = cfg.MODEL.ROI__ONLY
    if roi_only:
        freeze_rpn_layers(model)

    optimizer = make_optimizer(cfg, model)

    arguments = {}
    arguments["iteration"] = 0

    data_loader = make_data_loader(cfg, is_train=True, is_distributed=distributed,
                                   start_iter=arguments["iteration"])
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir,
                                         save_to_disk, roi_only=roi_only)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    if only_test:
        return model, min_loss

    checkpoint_period = int(cfg.SOLVER.CHECKPOINT_PERIOD_EPOCHS * cfg.INPUT.Example_num / cfg.SOLVER.IMS_PER_BATCH)
    epochs_between_test = cfg.SOLVER.EPOCHS_BETWEEN_TEST
    loss_weights = cfg.MODEL.LOSS.WEIGHTS

    for e in range(epochs_between_test):
        min_loss = do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
                            checkpoint_period, arguments, e + loop * epochs_between_test,
                            cfg.DEBUG.eval_in_train, output_dir,
                            cfg.DEBUG.eval_in_train_per_iter, cfg.TEST.IOU_THRESHOLD,
                            min_loss, eval_aug_thickness=EVAL_AUG_THICKNESS,
                            loss_weights=loss_weights)

    return model, min_loss