def benchmark_train(args):
    cfg = setup(args)
    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model,
            device_ids=[comm.get_local_rank()],
            broadcast_buffers=False,
            find_unused_parameters=True,
        )
    optimizer = build_optimizer(cfg, model)
    checkpointer = DetectionCheckpointer(model, optimizer=optimizer)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0
    data_loader = build_detection_train_loader(cfg)
    dummy_data = list(itertools.islice(data_loader, 100))

    def f():
        data = DatasetFromList(dummy_data, copy=False)
        while True:
            yield from data

    max_iter = 400
    trainer = SimpleTrainer(model, f(), optimizer)
    trainer.register_hooks(
        [hooks.IterationTimer(), hooks.PeriodicWriter([CommonMetricPrinter(max_iter)])]
    )
    trainer.train(1, max_iter)
def local_master_get_detection_dataset_dicts(*args, **kwargs):
    logger.info("Only load dataset dicts on local master process ...")
    dataset_dicts = (
        d2_get_detection_dataset_dicts(*args, **kwargs)
        if comm.get_local_rank() == 0
        else []
    )
    comm.synchronize()
    dataset_size = comm.all_gather(len(dataset_dicts))[0]
    if comm.get_local_rank() != 0:
        dataset_dicts = _FakeListObj(dataset_size)
    return dataset_dicts
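# Hedged usage sketch for the wrapper above: substituting it for detectron2's
# stock loader means annotation files are parsed once per node instead of once
# per GPU process. The patch target below is an assumption about the
# surrounding codebase, not confirmed by this snippet.
#
# import detectron2.data.build as d2_build_module
# d2_build_module.get_detection_dataset_dicts = local_master_get_detection_dataset_dicts
# data_loader = build_detection_train_loader(cfg)  # dicts now load on local rank 0 only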
def run(args):
    cfg = setup(args)
    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))

    # Count the number of parameters in the model.
    weight_count = 0
    for param in model.parameters():
        weight_count += np.prod(param.size())
    logger.info("Number of model parameters: %.0f" % weight_count)

    if cfg.EVAL_ONLY:
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=False
        )
        return do_test(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    return do_train(cfg, model, resume=args.resume)
def main(sm_args):
    cfg = _setup(sm_args)
    model = build_model(cfg)

    # SageMaker does not currently support boolean hyperparameters, so the
    # string-valued flags are converted to booleans here.
    eval_only = sm_args.eval_only == "True"
    resume = sm_args.resume == "True"

    if eval_only:
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=resume
        )
        return do_test(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    do_train(cfg, model, resume=resume)
    do_test(cfg, model)
    if sm_args.current_host == sm_args.hosts[0]:
        return _save_model(model)
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):
    """
    logger = logging.getLogger("detectron2")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
        setup_logger()

    # Assume these objects must be constructed in this order.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    super().__init__(model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
def __init__(self, cfg):
    super().__init__(cfg)
    # Initialize the Meta Reweighter.
    learner = Learner(cfg.MODEL.ROI_HEADS.NUM_CLASSES)
    learner.to(torch.device(cfg.MODEL.DEVICE))
    if comm.get_world_size() > 1:
        learner = DistributedDataParallel(
            learner,
            device_ids=[comm.get_local_rank()],
            broadcast_buffers=False,
        )
    self.learner = learner

    if comm.get_world_size() > 1:
        box_predictor = self.model.module.roi_heads.box_predictor
    else:
        box_predictor = self.model.roi_heads.box_predictor
    if isinstance(box_predictor, torch.nn.ModuleList):
        for predictor in box_predictor:
            predictor.register_meta_reweigher(self.learner)
    else:
        box_predictor.register_meta_reweigher(self.learner)

    self.optimizer_meta = torch.optim.Adam(self.learner.parameters(), lr=0.01)
    meta_data_loader = self.build_meta_loader(cfg)
    self._meta_data_loader_iter = iter(meta_data_loader)
def main(args):
    cfg = setup(args)
    # Register the chefCap datasets: 6998 train images, 1199 val images.
    for d in ["train", "val"]:
        DatasetCatalog.register(
            "chefCap_" + d, lambda d=d: get_chefcap_image_dicts("data/" + d)
        )
        MetadataCatalog.get("chefCap_" + d).set(
            thing_classes=["face-head", "mask-head", "face-cap", "mask-cap"]
        )
        if d == "val":
            MetadataCatalog.get("chefCap_val").evaluator_type = "pascal_voc"
            MetadataCatalog.get("chefCap_val").year = 2012
            MetadataCatalog.get("chefCap_val").dirname = "/opt/work/chefCap/data/val"

    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))

    # if args.eval_only:
    #     DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
    #         cfg.MODEL.WEIGHTS, resume=args.resume
    #     )
    #     return do_test(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    return do_train(cfg, model, resume=args.resume)
def main(args):
    cfg = setup(args)
    model = build_model(cfg)
    register_coco_instances(
        "firevysor_train", {},
        "data/Split_CleanedImage/train_annot.json", "data/Split_CleanedImage/train",
    )
    register_coco_instances(
        "firevysor_val", {},
        "data/Split_CleanedImage/val_annot.json", "data/Split_CleanedImage/val",
    )
    register_coco_instances(
        "hardcases_val", {},
        "data/annotations/hard_cases.json", "data/hard_cases",
    )
    if args.eval_only:
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        return do_test(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    do_train(cfg, model, val_set="firevysor_val")
    return do_test(cfg, model)
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):

    Use the custom checkpointer, which loads other backbone models
    with matching heuristics.
    """
    cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg)

    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    TrainerBase.__init__(self)
    self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
        model, data_loader, optimizer
    )
    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    self.checkpointer = DetectionCheckpointer(
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
def main(args):
    cfg = setup(args)

    # Technically, we should have used a dataset registry sort of object for
    # these mappings, like ~/detectron2/data/datasets/builtin.py does.
    if cfg.DATASETS.TRAIN == "COCO_n_LVIS_train":
        JSON_ANNOTATION = "./datasets/coco/annotations/COCO_n_LVIS/COCO_n_LVIS_train.json"
        IMG_ROOT = "./datasets/coco/train2017"
    elif cfg.DATASETS.TRAIN == "LVIS80_train":
        JSON_ANNOTATION = "./datasets/coco/annotations/LVIS80/LVIS80_train.json"
        IMG_ROOT = "./datasets/coco/train2017"
    else:
        raise NotImplementedError(
            "Unknown custom dataset: {}".format(cfg.DATASETS.TRAIN)
        )
    from detectron2.data.datasets import register_coco_instances

    register_coco_instances(cfg.DATASETS.TRAIN, {}, JSON_ANNOTATION, IMG_ROOT)

    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    do_train(cfg, model)
    # This is what do_test() would have returned in the absence of any testing.
    return OrderedDict()
def __init__(self, cfg):
    self.logger = logging.getLogger("detectron2")
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    self.model = model
    self.optimizer = optimizer
    self.al_dataset = self.build_al_dataset(cfg)
    # This should be moved into ObjectActiveLearningTrainer later.
    self.object_fusion = ObjectFusion(cfg)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.cfg = cfg
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):

    Use the custom checkpointer, which loads other backbone models
    with matching heuristics.
    """
    super().__init__(cfg)
    # Assume these objects must be constructed in this order.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    # super(DefaultTrainer, self).__init__(model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = AdetCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
def reset_model(self, cfg, model):
    """
    Reset everything except the data_loader: model, optimizer, scheduler,
    checkpointer, and hooks.
    """
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    del self.model
    self.model = model

    optimizer = self.build_optimizer(cfg, model)
    del self.optimizer
    self.optimizer = optimizer

    scheduler = self.build_lr_scheduler(cfg, optimizer)
    del self.scheduler
    self.scheduler = scheduler

    checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    del self.checkpointer
    self.checkpointer = checkpointer

    self.start_iter = 0
    # self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self._hooks = []
    self.register_hooks(self.build_hooks())
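# Hedged usage sketch for reset_model above (names assumed): in an iterative
# training loop, e.g. active-learning rounds, one could rebuild the model
# between rounds and reset trainer state while keeping the data loader.
#
# for round_idx in range(num_rounds):  # num_rounds is hypothetical
#     fresh_model = trainer.build_model(cfg)
#     trainer.reset_model(cfg, fresh_model)
#     trainer.train()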
def main(args):
    # Create the config.
    cfg = setup(args)
    # Build the model.
    model = build_model(cfg)
    # Log what's going on.
    logger.info("Model:\n{}".format(model))

    # TODO: Fix this (if it doesn't work)
    # wandb.watch(model, log="all")

    # Only do evaluation if the args say so.
    if args.eval_only:
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        return do_test(cfg, model)

    # Distributed training depends on the number of GPUs available.
    distributed = comm.get_world_size() > 1
    if distributed:
        # Put the model on multiple devices if available.
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    # Train the model.
    do_train(cfg, model)
    # TODO: May want to evaluate in a separate step?
    return do_test(cfg, model)
def __init__(self, cfg, model=None, data_loader=None):
    if model is None:
        model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    if data_loader is not None:
        self.data_loader = data_loader
        self.data_len = len(data_loader.dataset._dataset._lst)
    else:
        self.data_loader, self.data_len = self.build_train_loader(cfg)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    super().__init__(model, self.data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    # Scale the 270k-iteration schedule (set for a 45174-image dataset)
    # linearly with the actual dataset size.
    self.max_iter = int((270000 * self.data_len) / 45174)
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):

    Use the custom checkpointer, which loads other backbone models
    with matching heuristics.
    """
    # Assume these objects must be constructed in this order.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg)

    # Load the GAN model. (`device` and `hr_shape` are expected to be
    # defined at module scope.)
    generator = esrgan_model.GeneratorRRDB(
        channels=3, filters=64, num_res_blocks=23
    ).to(device)
    discriminator = esrgan_model.Discriminator(input_shape=(3, *hr_shape)).to(device)
    feature_extractor = esrgan_model.FeatureExtractor().to(device)
    feature_extractor.eval()

    # GAN losses
    criterion_GAN = torch.nn.BCEWithLogitsLoss().to(device)
    criterion_content = torch.nn.L1Loss().to(device)
    criterion_pixel = torch.nn.L1Loss().to(device)

    # GAN optimizers
    optimizer_G = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.9, 0.999))
    optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.9, 0.999))

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    super(DefaultTrainer, self).__init__(
        model,
        data_loader,
        optimizer,
        discriminator,
        generator,
        feature_extractor,
        optimizer_G,
        optimizer_D,
        criterion_pixel,
        criterion_content,
        criterion_GAN,
    )

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = AdetCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
def __init__(self, cfg):
    logger = logging.getLogger("detectron2")
    if not logger.isEnabledFor(logging.INFO):
        setup_logger()
    cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())

    # Assume these objects must be constructed in this order.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader, num_per_epoch = self.build_train_loader(cfg)

    # Convert epoch-based solver settings into iteration-based ones.
    if cfg.SOLVER.EPOCH.ENABLED:
        cfg = self.adjust_epoch_to_iter(cfg, num_per_epoch)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    super(DefaultTrainer, self).__init__(model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
def trainer(args, visual_, visual_threshold):
    cfg = setup_origin_configs(args)
    print("cfg", cfg)
    regist_datasets(cfg)
    model = build_model(cfg)

    if visual_:
        # Defaults to 0.05 if not set.
        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = visual_threshold
        cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, args.best_model_name)
        visual(cfg)
        return None

    logger.info("Model:\n{}".format(model))
    if args.eval_only:
        # Load pretrained model weights.
        cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, args.best_model_name)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        return do_test(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    do_train(cfg, model)
    return do_test(cfg, model)
def benchmark_train(args):
    cfg = setup(args)
    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    optimizer = build_optimizer(cfg, model)
    checkpointer = DetectionCheckpointer(model, optimizer=optimizer)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 2
    data_loader = build_detection_train_loader(cfg)
    dummy_data = list(itertools.islice(data_loader, 100))

    def f():
        data = DatasetFromList(dummy_data, copy=False, serialize=False)
        while True:
            yield from data

    max_iter = 400
    trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
        model, f(), optimizer
    )
    trainer.register_hooks(
        [
            hooks.IterationTimer(),
            hooks.PeriodicWriter([CommonMetricPrinter(max_iter)]),
            hooks.TorchProfiler(
                lambda trainer: trainer.iter == max_iter - 1,
                cfg.OUTPUT_DIR,
                save_tensorboard=True,
            ),
        ]
    )
    trainer.train(1, max_iter)
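# Hedged usage sketch: to exercise the DDP branch, benchmark_train would
# typically be run through detectron2's launch helper; the GPU count here is
# illustrative.
#
# from detectron2.engine import launch
# launch(benchmark_train, num_gpus_per_machine=2, args=(args,))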
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):
    """
    # Assume these objects must be constructed in this order.
    # self.apply_mul_opts = True if cfg.MODEL.ROI_MASK_HEAD.RECON_NET.NAME != "" else False
    self.apply_mul_opts = False
    model = self.build_model(cfg)

    if self.apply_mul_opts:
        optimizer_main = self.build_optimizer(cfg, model, ty_opt="M")
        optimizer_recon = self.build_optimizer(cfg, model, ty_opt="A")
        optimizer = [optimizer_main, optimizer_recon]
        self.scheduler_main = self.build_lr_scheduler(cfg, optimizer_main)
        self.scheduler_recon = self.build_lr_scheduler(cfg, optimizer_recon)
        self.checkpointer = DetectionCheckpointer(
            # Assume you want to save checkpoints together with logs/statistics
            model,
            cfg.OUTPUT_DIR,
            optimizer_gen=optimizer_main,
            optimizer_dis=optimizer_recon,
            scheduler_gen=self.scheduler_main,
            scheduler_dis=self.scheduler_recon,
        )
    else:
        optimizer = self.build_optimizer(cfg, model, ty_opt=cfg.SOLVER.OPT_TYPE)
        # optimizer = self.build_optimizer(cfg, model, ty_opt="SGD")
        self.scheduler = self.build_lr_scheduler(cfg, optimizer)
        # Assume no other objects need to be checkpointed.
        # We can later make it checkpoint the stateful hooks.
        self.checkpointer = DetectionCheckpointer(
            # Assume you want to save checkpoints together with logs/statistics
            model,
            cfg.OUTPUT_DIR,
            optimizer=optimizer,
            scheduler=self.scheduler,
        )

    logger = logging.getLogger(__name__)
    logger.info("optimizer information: {}".format(type(optimizer)))
    data_loader = self.build_train_loader(cfg)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    super().__init__(model, data_loader, optimizer, cfg)
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
def convert_coco_text_to_coco_detection_json(
    source_json: str,
    target_json: str,
    set_type: Optional[str] = None,
    min_img_size: int = 100,
    text_cat_id: int = 1,
) -> Dict:
    """
    This function converts a COCO-Text style JSON to a COCO Detection style JSON.
    For COCO-Text see: https://vision.cornell.edu/se3/coco-text-2/
    For COCO Detection see: http://cocodataset.org/#overview
    """
    with PathManager.open(source_json, "r") as f:
        coco_text_json = json.load(f)

    coco_text_json["annotations"] = list(coco_text_json["anns"].values())
    coco_text_json["images"] = list(coco_text_json["imgs"].values())
    if set_type is not None:
        # COCO-Text style JSONs often mix test, train, and val sets.
        # We need to make sure we only use the data type we want.
        coco_text_json["images"] = [
            x for x in coco_text_json["images"] if x["set"] == set_type
        ]
    coco_text_json["categories"] = [{"name": "text", "id": text_cat_id}]
    del coco_text_json["cats"]
    del coco_text_json["imgs"]
    del coco_text_json["anns"]
    for ann in coco_text_json["annotations"]:
        ann["category_id"] = text_cat_id
        ann["iscrowd"] = 0
        # Don't evaluate the model on illegible words
        if set_type == "val" and ann["legibility"] != "legible":
            ann["ignore"] = True

    # Some datasets seem to have extremely small images which break downstream
    # operations. If min_img_size is set, we can remove these.
    coco_text_json["images"] = [
        x
        for x in coco_text_json["images"]
        if x["height"] >= min_img_size and x["width"] >= min_img_size
    ]

    # Remap image_ids if necessary
    if isinstance(coco_text_json["images"][0]["id"], str):
        image_id_remap = {
            x["id"]: id_no for (id_no, x) in enumerate(coco_text_json["images"])
        }
        for x in coco_text_json["images"]:
            x["id"] = image_id_remap[x["id"]]
        for x in coco_text_json["annotations"]:
            if x["image_id"] in image_id_remap:
                x["image_id"] = image_id_remap[x["image_id"]]

    PathManager.mkdirs(os.path.dirname(target_json))
    if comm.get_local_rank() == 0:
        with PathManager.open(target_json, "w") as f:
            json.dump(coco_text_json, f)
    return coco_text_json
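# Hedged usage sketch for the converter above; the file paths are
# hypothetical. This would write a detection-style val JSON, flagging
# illegible words as ignored and dropping images under 100px on a side.
#
# converted = convert_coco_text_to_coco_detection_json(
#     source_json="datasets/cocotext/cocotext.v2.json",
#     target_json="datasets/cocotext/cocotext_val_detection.json",
#     set_type="val",
# )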
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):
    """
    logger = logging.getLogger("detectron2")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
        logger = setup_logger()
    cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())

    # Assume these objects must be constructed in this order.
    data_loader = self.build_train_loader(cfg)
    cfg = self.auto_scale_hyperparams(cfg, data_loader)
    model = self.build_model(cfg)

    # KD or not
    self.kd = cfg.MODEL.CENTERNET.KD.ENABLED
    self.model_t = None
    if self.kd:
        self.model_t = self.build_teacher_model(cfg)

    optimizer = self.build_optimizer(cfg, model)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    super(DefaultTrainer, self).__init__(model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    if cfg.SOLVER.SWA.ENABLED:
        self.max_iter = cfg.SOLVER.MAX_ITER + cfg.SOLVER.SWA.ITER
    else:
        self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg

    self.skip_loss = cfg.MODEL.CENTERNET.LOSS.SKIP_LOSS
    self.history_loss = 10e8
    self.skip_weight = cfg.MODEL.CENTERNET.LOSS.SKIP_WEIGHT
    self.communism = cfg.MODEL.CENTERNET.LOSS.COMMUNISM.ENABLE
    self.communism_cls_loss = cfg.MODEL.CENTERNET.LOSS.COMMUNISM.CLS_LOSS
    self.communism_wh_loss = cfg.MODEL.CENTERNET.LOSS.COMMUNISM.WH_LOSS
    self.communism_off_loss = cfg.MODEL.CENTERNET.LOSS.COMMUNISM.OFF_LOSS
    self.register_hooks(self.build_hooks())
def main(
    cfg,
    output_dir,
    runner=None,
    eval_only=False,
    # NOTE: always enable resume when running on cluster
    resume=True,
):
    setup_after_launch(cfg, output_dir, runner)

    model = runner.build_model(cfg)
    logger.info("Model:\n{}".format(model))

    if eval_only:
        checkpointer = runner.build_checkpointer(cfg, model, save_dir=output_dir)
        # checkpointer.resume_or_load() will skip all additional checkpointables,
        # which may not be desired, e.g. EMA states.
        if resume and checkpointer.has_checkpoint():
            checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume)
        else:
            checkpoint = checkpointer.load(cfg.MODEL.WEIGHTS)
        train_iter = checkpoint.get("iteration", None)
        model.eval()
        metrics = runner.do_test(cfg, model, train_iter=train_iter)
        print_metrics_table(metrics)
        return {
            "accuracy": metrics,
            "model_configs": {},
            "metrics": metrics,
        }

    model = create_ddp_model(
        model,
        fp16_compression=cfg.MODEL.DDP_FP16_GRAD_COMPRESS,
        device_ids=None if cfg.MODEL.DEVICE == "cpu" else [comm.get_local_rank()],
        broadcast_buffers=False,
        find_unused_parameters=cfg.MODEL.DDP_FIND_UNUSED_PARAMETERS,
    )

    trained_cfgs = runner.do_train(cfg, model, resume=resume)
    metrics = runner.do_test(cfg, model)
    print_metrics_table(metrics)

    # Dump config files for trained models.
    trained_model_configs = dump_trained_model_configs(cfg.OUTPUT_DIR, trained_cfgs)
    return {
        # for e2e_workflow
        "accuracy": metrics,
        # for unit_workflow
        "model_configs": trained_model_configs,
        "metrics": metrics,
    }
def cuda(self, device):
    self.model.to(torch.device(device))
    logger = logging.getLogger(__name__)
    logger.info("Model:\n{}".format(self.model))
    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        self.model = DistributedDataParallel(
            self.model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
def wrap_model_with_ddp(self, cfg, model):
    # Works with PR: https://github.com/facebookresearch/detectron2/pull/1820
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model,
            device_ids=[comm.get_local_rank()],
            broadcast_buffers=False,
            find_unused_parameters=True,
        )
    return model
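# Hedged usage sketch (assumes the trainer refactor referenced in the PR
# above): a trainer subclass overrides this hook so DDP options can change
# without copying the whole __init__; the class and wiring are assumptions.
#
# class FindUnusedTrainer(DefaultTrainer):
#     def wrap_model_with_ddp(self, cfg, model):
#         ...  # as defined above, with find_unused_parameters=True
#
# trainer = FindUnusedTrainer(cfg)
# trainer.train()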
def __init__(self, cfg, parser, mapper_object, isShuffleData):
    """
    Args:
        cfg (CfgNode):
    """
    self.isShuffleData = isShuffleData
    self.mapper_object = mapper_object

    logger = logging.getLogger("detectron2")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
        setup_logger()

    # Assume these objects must be constructed in this order.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg, self.mapper_object, self.isShuffleData)

    # If we're shuffling data, we're not doing curriculum learning.
    if isShuffleData:
        curr_data_loader = None
    # If we're NOT shuffling, then we're doing curriculum learning.
    else:
        curr_data_loader = my_build_detection_train_loader(
            cfg,
            mapper=mapper_object.train_mapper,
            isShuffleData=isShuffleData,
            curriculum_fraction=0.3,
        )

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    super().__init__(cfg, model, data_loader, curr_data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self.register_hooks(self.build_hooks())

    self.isTrackAccuracy = parser.accuracy != 0
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):

    Use the custom checkpointer, which loads other backbone models
    with matching heuristics.
    """
    # Assume these objects must be constructed in this order.
    dprint("build model")
    model = self.build_model(cfg)
    dprint("build optimizer")
    optimizer = self.build_optimizer(cfg, model)
    dprint("build train loader")
    data_loader = self.build_train_loader(cfg)

    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    if isinstance(data_loader, AspectRatioGroupedDataset):
        dataset_len = len(data_loader.dataset.dataset)
    else:
        dataset_len = len(data_loader.dataset)
    iters_per_epoch = dataset_len // images_per_batch
    self.iters_per_epoch = iters_per_epoch
    total_iters = cfg.SOLVER.TOTAL_EPOCHS * iters_per_epoch
    dprint("images_per_batch: ", images_per_batch)
    dprint("dataset length: ", dataset_len)
    dprint("iters per epoch: ", iters_per_epoch)
    dprint("total iters: ", total_iters)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    super(DefaultTrainer, self).__init__(model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer, total_iters=total_iters)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    self.checkpointer = AdetCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = total_iters  # NOTE: ignore cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self.register_hooks(self.build_hooks())
def wrap_model_with_ddp(self, cfg, model):
    """
    Returns:
        torch.nn.Module:

    Overwrite this function if you'd like to do more with
    `torch.nn.parallel.DistributedDataParallel`, such as adding
    `find_unused_parameters=True`.
    """
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    return model
def main(sm_args, world):
    cfg = _setup(sm_args)
    is_zero_rank = comm.get_local_rank() == 0
    trainer = DefaultTrainer(cfg)
    resume = sm_args.resume == "True"
    trainer.resume_or_load(resume=resume)
    trainer.train()
    if world["is_master"] and is_zero_rank:
        _save_model()
def __init__(self, cfg):
    """
    Args:
        cfg (CfgNode):
    """
    super().__init__()
    logger = logging.getLogger("detectron2")
    if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
        setup_logger()

    # Determine the actual number of GPUs in use via torch.distributed, then
    # automatically rescale the config for that many workers.
    cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())

    # Assume these objects must be constructed in this order.
    model = self.build_model(cfg)
    # Build the optimizer; the default is SGD (cfg.SOLVER's optimizer
    # setting is not used here).
    optimizer = self.build_optimizer(cfg, model)
    # No `self.optimizer = ...` is needed because the training loop is
    # ultimately delegated to SimpleTrainer(...) below.
    # This train loader returns a list[dict] per batch, rather than the
    # (tensor, labels) pairs other tasks batch together.
    data_loader = self.build_train_loader(cfg)

    # For training, wrap with DDP. But don't need this for inference.
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model,
            device_ids=[comm.get_local_rank()],
            broadcast_buffers=False,
            find_unused_parameters=True,
        )
    # cfg.SOLVER.AMP.ENABLED defaults to False. (What is the point of this?)
    self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
        model, data_loader, optimizer
    )

    # Default: WarmupMultiStep.
    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    # Assume no other objects need to be checkpointed.
    # We can later make it checkpoint the stateful hooks.
    # checkpointer.save() / .load() save and load checkpoints.
    self.checkpointer = DetectionCheckpointer(
        # Assume you want to save checkpoints together with logs/statistics
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg
    self.register_hooks(self.build_hooks())