def test(cfg, model, distributed): if distributed: model = model.module torch.cuda.empty_cache() # TODO check if it helps iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) if distributed: model = torch.nn.parallel.deprecated.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer( cfg, model, optimizer, scheduler, output_dir, save_to_disk ) extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, ) return model
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--ckpt", help= "The path to the checkpoint for test, default is the latest checkpoint.", default=None, ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt _ = checkpointer.load(ckpt, use_latest=args.ckpt is None) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
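The multi-GPU bootstrap used above (WORLD_SIZE, --local_rank, NCCL over env://) is the standard torch.distributed launch pattern. A minimal, self-contained sketch of the same idea, with a toy linear layer standing in for the detection model:

import argparse
import os
import torch
import torch.distributed as dist

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    # torch.distributed.launch sets WORLD_SIZE; single-process runs fall back to 1.
    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend="nccl", init_method="env://")

    # Toy model standing in for build_detection_model(cfg).
    model = torch.nn.Linear(10, 10).cuda()
    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank
        )

if __name__ == "__main__":
    main()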
def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model) scheduler = torch.optim.lr_scheduler.StepLR( optimizer, step_size=1000, gamma=0.9) #make_lr_scheduler(cfg, optimizer) # Initialize mixed-precision training use_mixed_precision = cfg.DTYPE == "float16" amp_opt_level = 'O1' if use_mixed_precision else 'O0' model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer( cfg, model, None, None, output_dir, save_to_disk # , optimizer, scheduler ) extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, ) return model
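This variant adds NVIDIA apex mixed precision via amp.initialize and (inside do_train) amp.scale_loss. A minimal sketch of that apex recipe in isolation; the toy model, data, and learning rate are placeholders, and apex plus a GPU are assumed:

import torch
from apex import amp

model = torch.nn.Linear(16, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# 'O1' enables mixed precision, 'O0' keeps pure FP32 (mirrors cfg.DTYPE == "float16").
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

for _ in range(10):
    inputs = torch.randn(8, 16, device="cuda")
    loss = model(inputs).pow(2).mean()
    optimizer.zero_grad()
    # Scales the loss so FP16 gradients do not underflow; a no-op under 'O0'.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()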
def train(cfg, local_rank, distributed, use_tensorboard=False, logger=None): arguments = {"iteration": 0} data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) if cfg.SOLVER.UNFREEZE_CONV_BODY: for p in model.backbone.parameters(): p.requires_grad = True optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk, logger=logger) print(cfg.TRAIN.IGNORE_LIST) extra_checkpoint_data = checkpointer.load( cfg.MODEL.WEIGHT, ignore_list=cfg.TRAIN.IGNORE_LIST) arguments.update(extra_checkpoint_data) if cfg.SOLVER.KEEP_LR: optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD tensorboard_logdir = cfg.OUTPUT_DIR tensorboard_exp_name = cfg.TENSORBOARD_EXP_NAME snapshot = cfg.SOLVER.SNAPSHOT_ITERS do_train(model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, snapshot, tensorboard_logdir, tensorboard_exp_name, use_tensorboard=use_tensorboard) return model
def do_train(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter=" ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints", )
    dataset_names = cfg.DATASETS.TEST
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        if any(len(target) < 1 for target in targets):
            logger.error(
                f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}"
            )
            continue
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration
        images = images.to(device)
        targets = [target.to(device) for target in targets]
        # Compute the losses: the model computes the losses between images and targets and returns them.
        # What actually gets called is the forward method of the GeneralizedRCNN class.
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        # With multiple GPUs there can be invalid losses; those are removed here.
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)
        optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        # This is the standard amp usage.
        with amp.scale_loss(losses, optimizer) as scaled_losses:
            scaled_losses.backward()
        optimizer.step()
        # The scheduler needs to be stepped as well.
        scheduler.step()
        # How long each batch takes.
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)
        # How much time is left.
        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
        # Log every 20 iterations and on the last iteration.
        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,  # estimated time remaining
                    iter=iteration,  # current iteration, counted from the iteration recorded by maskrcnn_benchmark.utils.checkpoint
                    meters=str(meters),  # includes loss, loss_classifier, loss_box_reg, loss_objectness, loss_rpn_box_reg
                    lr=optimizer.param_groups[0]["lr"],  # learning rate
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,  # peak GPU memory allocated
                ))
        # Save a checkpoint every checkpoint_period iterations.
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter=" ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e.g. for TensorBoard
                model,
                # The method changes the segmentation mask format in a data loader,
                # so every time a new data loader is created:
                make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=None,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join([
                    "[Validation]: ",
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        # Save the checkpoint at the last iteration.
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)
    # How long training took in total.
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument( "--json-file", default="", metavar="FILE", help="path to prediction bbox json file", ) # parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 # distributed = num_gpus > 1 # if distributed: # torch.cuda.set_device(args.local_rank) # torch.distributed.init_process_group( # backend="nccl", init_method="env://" # ) # synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) # logger.info("Using {} GPUs".format(num_gpus)) # logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) # model = build_detection_model(cfg) # model.to(cfg.MODEL.DEVICE) output_dir = cfg.OUTPUT_DIR # checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) # _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox", ) # if cfg.MODEL.MASK_ON: # iou_types = iou_types + ("segm",) # if cfg.MODEL.KEYPOINT_ON: # iou_types = iou_types + ("keypoints",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=False) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): # inference( # model, # data_loader_val, # dataset_name=dataset_name, # iou_types=iou_types, # box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, # device=cfg.MODEL.DEVICE, # expected_results=cfg.TEST.EXPECTED_RESULTS, # expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, # output_folder=output_folder, # ) # extra_args = dict( # box_only=False, # iou_types=iou_types, # expected_results=cfg.TEST.EXPECTED_RESULTS, # expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, # ) dataset = data_loader_val.dataset # evaluate(dataset=dataset, # predictions=predictions, # output_folder=output_folder, # only_human=True, # **extra_args) do_coco_json_evaluation( dataset=dataset, json_file=args.json_file, box_only=False, output_folder=output_folder, iou_types=iou_types, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL) synchronize()
def train(cfg, local_rank, distributed, logger): if is_main_process(): wandb.init(project='scene-graph', entity='sgg-speaker-listener', config=cfg.LISTENER) debug_print(logger, 'prepare training') model = build_detection_model(cfg) listener = build_listener(cfg) speaker_listener = SpeakerListener(model, listener, cfg, is_joint=cfg.LISTENER.JOINT) if is_main_process(): wandb.watch(listener) debug_print(logger, 'end model construction') # modules that should be always set in eval mode # their eval() method should be called after model.train() is called eval_modules = ( model.rpn, model.backbone, model.roi_heads.box, ) fix_eval_modules(eval_modules) # NOTE, we slow down the LR of the layers start with the names in slow_heads if cfg.MODEL.ROI_RELATION_HEAD.PREDICTOR == "IMPPredictor": slow_heads = [ "roi_heads.relation.box_feature_extractor", "roi_heads.relation.union_feature_extractor.feature_extractor", ] else: slow_heads = [] # load pretrain layers to new layers load_mapping = { "roi_heads.relation.box_feature_extractor": "roi_heads.box.feature_extractor", "roi_heads.relation.union_feature_extractor.feature_extractor": "roi_heads.box.feature_extractor" } if cfg.MODEL.ATTRIBUTE_ON: load_mapping[ "roi_heads.relation.att_feature_extractor"] = "roi_heads.attribute.feature_extractor" load_mapping[ "roi_heads.relation.union_feature_extractor.att_feature_extractor"] = "roi_heads.attribute.feature_extractor" device = torch.device(cfg.MODEL.DEVICE) model.to(device) listener.to(device) num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 num_batch = cfg.SOLVER.IMS_PER_BATCH optimizer = make_optimizer(cfg, model, logger, slow_heads=slow_heads, slow_ratio=10.0, rl_factor=float(num_batch)) listener_optimizer = make_listener_optimizer(cfg, listener) scheduler = make_lr_scheduler(cfg, optimizer, logger) listener_scheduler = None debug_print(logger, 'end optimizer and schedule') if cfg.LISTENER.JOINT: speaker_listener_optimizer = make_speaker_listener_optimizer( cfg, speaker_listener.speaker, speaker_listener.listener) # Initialize mixed-precision training use_mixed_precision = cfg.DTYPE == "float16" amp_opt_level = 'O1' if use_mixed_precision else 'O0' if cfg.LISTENER.JOINT: speaker_listener, speaker_listener_optimizer = amp.initialize( speaker_listener, speaker_listener_optimizer, opt_level='O0') else: speaker_listener, listener_optimizer = amp.initialize( speaker_listener, listener_optimizer, opt_level='O0') #listener, listener_optimizer = amp.initialize(listener, listener_optimizer, opt_level='O0') #[model, listener], [optimizer, listener_optimizer] = amp.initialize([model, listener], [optimizer, listener_optimizer], opt_level='O1', loss_scale=1) #model = amp.initialize(model, opt_level='O1') if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, find_unused_parameters=True, ) listener = torch.nn.parallel.DistributedDataParallel( listener, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, find_unused_parameters=True, ) debug_print(logger, 'end distributed') arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR listener_dir = cfg.LISTENER_DIR save_to_disk = get_rank() == 0 speaker_checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk, custom_scheduler=True) listener_checkpointer = 
Checkpointer(listener, optimizer=listener_optimizer, save_dir=listener_dir, save_to_disk=save_to_disk, custom_scheduler=False) speaker_listener.add_listener_checkpointer(listener_checkpointer) speaker_listener.add_speaker_checkpointer(speaker_checkpointer) speaker_listener.load_listener() speaker_listener.load_speaker(load_mapping=load_mapping) debug_print(logger, 'end load checkpointer') train_data_loader = make_data_loader(cfg, mode='train', is_distributed=distributed, start_iter=arguments["iteration"], ret_images=True) val_data_loaders = make_data_loader(cfg, mode='val', is_distributed=distributed, ret_images=True) debug_print(logger, 'end dataloader') checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD if cfg.SOLVER.PRE_VAL: logger.info("Validate before training") #output = run_val(cfg, model, listener, val_data_loaders, distributed, logger) #print('OUTPUT: ', output) #(sg_loss, img_loss, sg_acc, img_acc) = output logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(train_data_loader) start_iter = arguments["iteration"] start_training_time = time.time() end = time.time() print_first_grad = True listener_loss_func = torch.nn.MarginRankingLoss(margin=1, reduction='none') mistake_saver = None if is_main_process(): ds_catalog = DatasetCatalog() dict_file_path = os.path.join( ds_catalog.DATA_DIR, ds_catalog.DATASETS['VG_stanford_filtered_with_attribute'] ['dict_file']) ind_to_classes, ind_to_predicates = load_vg_info(dict_file_path) ind_to_classes = {k: v for k, v in enumerate(ind_to_classes)} ind_to_predicates = {k: v for k, v in enumerate(ind_to_predicates)} print('ind to classes:', ind_to_classes, '\n ind to predicates:', ind_to_predicates) mistake_saver = MistakeSaver( '/Scene-Graph-Benchmark.pytorch/filenames_masked', ind_to_classes, ind_to_predicates) #is_printed = False while True: try: listener_iteration = 0 for iteration, (images, targets, image_ids) in enumerate(train_data_loader, start_iter): if cfg.LISTENER.JOINT: speaker_listener_optimizer.zero_grad() else: listener_optimizer.zero_grad() #print(f'ITERATION NUMBER: {iteration}') if any(len(target) < 1 for target in targets): logger.error( f"Iteration={iteration + 1} || Image Ids used for training {image_ids} || targets Length={[len(target) for target in targets]}" ) if len(images) <= 1: continue data_time = time.time() - end iteration = iteration + 1 listener_iteration += 1 arguments["iteration"] = iteration model.train() fix_eval_modules(eval_modules) images_list = deepcopy(images) images_list = to_image_list( images_list, cfg.DATALOADER.SIZE_DIVISIBILITY).to(device) for i in range(len(images)): images[i] = images[i].unsqueeze(0) images[i] = F.interpolate(images[i], size=(224, 224), mode='bilinear', align_corners=False) images[i] = images[i].squeeze() images = torch.stack(images).to(device) #images.requires_grad_() targets = [target.to(device) for target in targets] speaker_loss_dict = {} if not cfg.LISTENER.JOINT: score_matrix = speaker_listener(images_list, targets, images) else: score_matrix, _, speaker_loss_dict = speaker_listener( images_list, targets, images) speaker_summed_losses = sum( loss for loss in speaker_loss_dict.values()) # reduce losses over all GPUs for logging purposes if cfg.LISTENER.JOINT: speaker_loss_dict_reduced = reduce_loss_dict( speaker_loss_dict) speaker_losses_reduced = sum( loss for loss in speaker_loss_dict_reduced.values()) speaker_losses_reduced /= num_gpus if is_main_process(): wandb.log( {"Train Speaker Loss": speaker_losses_reduced}, listener_iteration) listener_loss
= 0 gap_reward = 0 avg_acc = 0 num_correct = 0 score_matrix = score_matrix.to(device) # fill loss matrix loss_matrix = torch.zeros((2, images.size(0), images.size(0)), device=device) # sg centered scores for true_index in range(loss_matrix.size(1)): row_score = score_matrix[true_index] (true_scores, predicted_scores, binary) = format_scores(row_score, true_index, device) loss_vec = listener_loss_func(true_scores, predicted_scores, binary) loss_matrix[0][true_index] = loss_vec # image centered scores transposted_score_matrix = score_matrix.t() for true_index in range(loss_matrix.size(1)): row_score = transposted_score_matrix[true_index] (true_scores, predicted_scores, binary) = format_scores(row_score, true_index, device) loss_vec = listener_loss_func(true_scores, predicted_scores, binary) loss_matrix[1][true_index] = loss_vec print('iteration:', listener_iteration) sg_acc = 0 img_acc = 0 # calculate accuracy for i in range(loss_matrix.size(1)): temp_sg_acc = 0 temp_img_acc = 0 for j in range(loss_matrix.size(2)): if loss_matrix[0][i][i] > loss_matrix[0][i][j]: temp_sg_acc += 1 else: if cfg.LISTENER.HTML: if is_main_process( ) and listener_iteration >= 600 and listener_iteration % 25 == 0 and i != j: detached_sg_i = (sgs[i][0].detach(), sgs[i][1], sgs[i][2].detach()) detached_sg_j = (sgs[j][0].detach(), sgs[j][1], sgs[j][2].detach()) mistake_saver.add_mistake( (image_ids[i], image_ids[j]), (detached_sg_i, detached_sg_j), listener_iteration, 'SG') if loss_matrix[1][i][i] > loss_matrix[1][j][i]: temp_img_acc += 1 else: if cfg.LISTENER.HTML: if is_main_process( ) and listener_iteration >= 600 and listener_iteration % 25 == 0 and i != j: detached_sg_i = (sgs[i][0].detach(), sgs[i][1], sgs[i][2].detach()) detached_sg_j = (sgs[j][0].detach(), sgs[j][1], sgs[j][2].detach()) mistake_saver.add_mistake( (image_ids[i], image_ids[j]), (detached_sg_i, detached_sg_j), listener_iteration, 'IMG') temp_sg_acc = temp_sg_acc * 100 / (loss_matrix.size(1) - 1) temp_img_acc = temp_img_acc * 100 / (loss_matrix.size(1) - 1) sg_acc += temp_sg_acc img_acc += temp_img_acc if cfg.LISTENER.HTML: if is_main_process( ) and listener_iteration % 100 == 0 and listener_iteration >= 600: mistake_saver.toHtml('/www') sg_acc /= loss_matrix.size(1) img_acc /= loss_matrix.size(1) avg_sg_acc = torch.tensor([sg_acc]).to(device) avg_img_acc = torch.tensor([img_acc]).to(device) # reduce acc over all gpus avg_acc = {'sg_acc': avg_sg_acc, 'img_acc': avg_img_acc} avg_acc_reduced = reduce_loss_dict(avg_acc) sg_acc = sum(acc for acc in avg_acc_reduced['sg_acc']) img_acc = sum(acc for acc in avg_acc_reduced['img_acc']) # log acc to wadb if is_main_process(): wandb.log({ "Train SG Accuracy": sg_acc.item(), "Train IMG Accuracy": img_acc.item() }) sg_loss = 0 img_loss = 0 for i in range(loss_matrix.size(0)): for j in range(loss_matrix.size(1)): loss_matrix[i][j][j] = 0. 
for i in range(loss_matrix.size(1)): sg_loss += torch.max(loss_matrix[0][i]) img_loss += torch.max(loss_matrix[1][:][i]) sg_loss = sg_loss / loss_matrix.size(1) img_loss = img_loss / loss_matrix.size(1) sg_loss = sg_loss.to(device) img_loss = img_loss.to(device) loss_dict = {'sg_loss': sg_loss, 'img_loss': img_loss} losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) sg_loss_reduced = loss_dict_reduced['sg_loss'] img_loss_reduced = loss_dict_reduced['img_loss'] if is_main_process(): wandb.log({"Train SG Loss": sg_loss_reduced}) wandb.log({"Train IMG Loss": img_loss_reduced}) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) losses = losses + speaker_summed_losses * cfg.LISTENER.LOSS_COEF # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe #losses.backward() if not cfg.LISTENER.JOINT: with amp.scale_loss(losses, listener_optimizer) as scaled_losses: scaled_losses.backward() else: with amp.scale_loss( losses, speaker_listener_optimizer) as scaled_losses: scaled_losses.backward() verbose = (iteration % cfg.SOLVER.PRINT_GRAD_FREQ ) == 0 or print_first_grad # print grad or not print_first_grad = False #clip_grad_value([(n, p) for n, p in listener.named_parameters() if p.requires_grad], cfg.LISTENER.CLIP_VALUE, logger=logger, verbose=True, clip=True) if not cfg.LISTENER.JOINT: listener_optimizer.step() else: speaker_listener_optimizer.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if cfg.LISTENER.JOINT: if iteration % 200 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=speaker_listener_optimizer.param_groups[-1] ["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) else: if iteration % 200 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=listener_optimizer.param_groups[-1]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: """ print('Model before save') print('****************************') print(listener.gnn.conv1.node_model.node_mlp_1[0].weight) print('****************************') """ if not cfg.LISTENER.JOINT: listener_checkpointer.save( "model_{:07d}".format(listener_iteration), amp=amp.state_dict()) else: speaker_checkpointer.save( "model_speaker{:07d}".format(iteration)) listener_checkpointer.save( "model_listenr{:07d}".format(listener_iteration), amp=amp.state_dict()) if iteration == max_iter: if not cfg.LISTENER.JOINT: listener_checkpointer.save( "model_{:07d}".format(listener_iteration), amp=amp.state_dict()) else: speaker_checkpointer.save( "model_{:07d}".format(iteration)) listener_checkpointer.save( "model_{:07d}".format(listener_iteration), amp=amp.state_dict()) val_result = None # used for scheduler updating if cfg.SOLVER.TO_VAL and iteration % cfg.SOLVER.VAL_PERIOD == 0: logger.info("Start validating") val_result = run_val(cfg, model, listener, 
val_data_loaders, distributed, logger) (sg_loss, img_loss, sg_acc, img_acc, speaker_val) = val_result if is_main_process(): wandb.log({ "Validation SG Accuracy": sg_acc, "Validation IMG Accuracy": img_acc, "Validation SG Loss": sg_loss, "Validation IMG Loss": img_loss, "Validation Speaker": speaker_val, }) #logger.info("Validation Result: %.4f" % val_result) except Exception as err: raise (err) print('Dataset finished, creating new') train_data_loader = make_data_loader( cfg, mode='train', is_distributed=distributed, start_iter=arguments["iteration"], ret_images=True) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter))) return listener
def do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, writer, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() start_training_time = time.time() end = time.time() iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) dataset_names = cfg.DATASETS.TEST for iteration, (images, targets, _) in enumerate(data_loader, start_iter): if any(len(target) < 1 for target in targets): logger.error( f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" ) continue data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() # # Add images every 100 iterations if iteration % 100 == 0: # # Display images # image = images.tensors[0].cpu().numpy() # means = np.zeros((image.shape[0], image.shape[1], image.shape[2])) # means[0] = 102.9801 # means[1] = 115.9465 # means[2] = 122.7717 # image = image + means # image = image[[2, 1, 0]].astype(np.uint8) # writer.add_image('input image', image, iteration) # for b in range(len(targets[0].bbox)): # box = targets[0].bbox[b] # x1 = np.around(box[0].cpu().numpy()) # y1 = np.around(box[1].cpu().numpy()) # x2 = np.around(box[2].cpu().numpy()) # y2 = np.around(box[3].cpu().numpy()) # rr, cc = rectangle_perimeter(y1, x1, y2-y1, x2-x1) # image[:, rr, cc] = 255 # writer.add_image('target boxes', image, iteration) # # Display masks # masks = targets[0].get_field('masks')[0] # masks = masks.get_mask_tensor() # combined_mask = masks[0, :, :] # for i in range(1,8): # combined_mask = combined_mask | masks[i, :, :] # writer.add_image('mask', combined_mask.unsqueeze(0)*255, iteration) # writer.add_image('single part 2', masks[1, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 3', masks[2, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 4', masks[3, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 5', masks[4, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 6', masks[5, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 7', masks[6, :, :].unsqueeze(0)*255, iteration) # writer.add_image('single part 8', masks[7, :, :].unsqueeze(0)*255, iteration) # Display Losses writer.add_scalar('loss', meters.loss.median, iteration) writer.add_scalar('loss_classifier', loss_dict_reduced['loss_classifier'].item(), iteration) writer.add_scalar('loss_box_reg', loss_dict_reduced['loss_box_reg'].item(), iteration) writer.add_scalar('loss_objectness', loss_dict_reduced['loss_objectness'].item(), iteration) writer.add_scalar('loss_rpn_box_reg', loss_dict_reduced['loss_rpn_box_reg'].item(), iteration) writer.add_scalar('loss_mask', loss_dict_reduced['loss_mask'].item(), iteration) writer.add_scalar('loss_kpt', loss_dict_reduced['loss_kp'].item(), iteration) writer.add_scalar('lr', 
optimizer.param_groups[0]['lr'], iteration) # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() optimizer.step() scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if data_loader_val is not None and test_period > 0 and iteration % test_period == 0: meters_val = MetricLogger(delimiter=" ") synchronize() _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, # so every time a new data loader is created: make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True), dataset_name="[Validation]", iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=None, ) synchronize() model.train() with torch.no_grad(): # Should be one image for each GPU: for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)): images_val = images_val.to(device) targets_val = [target.to(device) for target in targets_val] loss_dict = model(images_val, targets_val) losses = sum(loss for loss in loss_dict.values()) loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum( loss for loss in loss_dict_reduced.values()) meters_val.update(loss=losses_reduced, **loss_dict_reduced) synchronize() logger.info( meters_val.delimiter.join([ "[Validation]: ", "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters_val), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter)))
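do_train() above expects the caller to pass in a TensorBoard SummaryWriter; a minimal sketch of that setup (log directory, tags, and values are illustrative only):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/maskrcnn_debug")
for iteration in range(1, 501):
    loss = 1.0 / iteration  # placeholder for the reduced training loss
    if iteration % 100 == 0:
        writer.add_scalar("loss", loss, iteration)
        writer.add_scalar("lr", 0.02, iteration)
writer.close()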
def train(cfg, local_rank, distributed): # Model logging print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH) print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.TEST.IMS_PER_BATCH) print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value = cfg.INPUT.PIXEL_MEAN) print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD, value=cfg.INPUT.PIXEL_STD) print_mlperf(key=mlperf_log.INPUT_RESIZE) print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING) print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN) print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN) print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP) print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5) print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD, value=cfg.MODEL.RPN.FG_IOU_THRESHOLD) print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD, value=cfg.MODEL.RPN.BG_IOU_THRESHOLD) print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN) print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST) print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN) print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST) print_mlperf(key=mlperf_log.ASPECT_RATIOS, value=cfg.MODEL.RPN.ASPECT_RATIOS) print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY) print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH) # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/modeling/detector/detectors.py # building bare mode without doing anthing model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model) # Optimizer logging print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM) print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR) print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM) print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY, value=cfg.SOLVER.WEIGHT_DECAY) scheduler = make_lr_scheduler(cfg, optimizer) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR print("output_dir "+str(output_dir)) save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer( cfg, model, optimizer, scheduler, output_dir, save_to_disk ) # no such SAVE_CHECKPOINTS #arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS arguments["save_checkpoints"] = False extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) data_loader, iters_per_epoch = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"] ) print("SSY iters_per_epoch "+str(iters_per_epoch)) #print("SSY iters_per_epoch change to 100 ") #iters_per_epoch = 100 checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD # set the callback function to evaluate and potentially # early exit each epoch # SSY # I already add PER_EPOCH_EVAL and MIN_BBOX_MAP MIN_SEGM_MAP to ./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml # but it still can not find it # so I manually set them here #if cfg.PER_EPOCH_EVAL: # per_iter_callback_fn = functools.partial( # mlperf_test_early_exit, # iters_per_epoch=iters_per_epoch, # tester=functools.partial(test, cfg=cfg), # model=model, # distributed=distributed, 
# min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP, # min_segm_map=cfg.MLPERF.MIN_SEGM_MAP) #else: # per_iter_callback_fn = None per_iter_callback_fn = functools.partial( mlperf_test_early_exit, iters_per_epoch=iters_per_epoch, # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/tester.py tester=functools.partial(test, cfg=cfg), model=model, distributed=distributed, min_bbox_map=0.377, min_segm_map=0.339) start_train_time = time.time() # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/trainer.py do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch), per_iter_end_callback_fn=per_iter_callback_fn, ) end_train_time = time.time() total_training_time = end_train_time - start_train_time print( "&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format((arguments["iteration"] * 1.0) / total_training_time) ) return model
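The per-iteration callbacks above are built with functools.partial so that the trainer can invoke them with only the current iteration. A small sketch of that pattern with a stand-in tester and an illustrative target mAP:

import functools

def epoch_boundary_early_exit(iteration, iters_per_epoch, tester, min_bbox_map):
    # Stand-in for mlperf_test_early_exit: evaluate at each epoch boundary and
    # signal the trainer to stop once the target bbox mAP is reached.
    if iteration % iters_per_epoch != 0:
        return False
    return tester(iteration) >= min_bbox_map

def dummy_tester(iteration):
    # Placeholder for functools.partial(test, cfg=cfg); returns a fake bbox mAP.
    return 0.3 + iteration / 100000.0

callback = functools.partial(
    epoch_boundary_early_exit,
    iters_per_epoch=7330,  # illustrative value
    tester=dummy_tester,
    min_bbox_map=0.377,
)

# The training loop then only needs: if callback(iteration=i): break
print(callback(iteration=7330))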
def train(cfg, local_rank, distributed, logger): debug_print(logger, 'prepare training') model = build_detection_model(cfg) debug_print(logger, 'end model construction') # modules that should be always set in eval mode # their eval() method should be called after model.train() is called eval_modules = (model.rpn, model.backbone, model.roi_heads.box,) fix_eval_modules(eval_modules) # NOTE, we slow down the LR of the layers start with the names in slow_heads if cfg.MODEL.ROI_RELATION_HEAD.PREDICTOR == "IMPPredictor": slow_heads = ["roi_heads.relation.box_feature_extractor", "roi_heads.relation.union_feature_extractor.feature_extractor",] else: slow_heads = [] # load pretrain layers to new layers load_mapping = {"roi_heads.relation.box_feature_extractor" : "roi_heads.box.feature_extractor", "roi_heads.relation.union_feature_extractor.feature_extractor" : "roi_heads.box.feature_extractor"} if cfg.MODEL.ATTRIBUTE_ON: load_mapping["roi_heads.relation.att_feature_extractor"] = "roi_heads.attribute.feature_extractor" load_mapping["roi_heads.relation.union_feature_extractor.att_feature_extractor"] = "roi_heads.attribute.feature_extractor" device = torch.device(cfg.MODEL.DEVICE) model.to(device) num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 num_batch = cfg.SOLVER.IMS_PER_BATCH optimizer = make_optimizer(cfg, model, logger, slow_heads=slow_heads, slow_ratio=10.0, rl_factor=float(num_batch)) scheduler = make_lr_scheduler(cfg, optimizer, logger) debug_print(logger, 'end optimizer and shcedule') # Initialize mixed-precision training use_mixed_precision = cfg.DTYPE == "float16" amp_opt_level = 'O1' if use_mixed_precision else 'O0' model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, find_unused_parameters=True, ) debug_print(logger, 'end distributed') arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer( cfg, model, optimizer, scheduler, output_dir, save_to_disk, custom_scheduler=True ) # if there is certain checkpoint in output_dir, load it, else load pretrained detector if checkpointer.has_checkpoint(): extra_checkpoint_data = checkpointer.load(cfg.MODEL.PRETRAINED_DETECTOR_CKPT, update_schedule=cfg.SOLVER.UPDATE_SCHEDULE_DURING_LOAD) arguments.update(extra_checkpoint_data) if cfg.SOLVER.UPDATE_SCHEDULE_DURING_LOAD: checkpointer.scheduler.last_epoch = extra_checkpoint_data["iteration"] logger.info("update last epoch of scheduler to iter: {}".format(str(extra_checkpoint_data["iteration"]))) else: # load_mapping is only used when we init current model from detection model. 
checkpointer.load(cfg.MODEL.PRETRAINED_DETECTOR_CKPT, with_optim=False, load_mapping=load_mapping) debug_print(logger, 'end load checkpointer') train_data_loader = make_data_loader( cfg, mode='train', is_distributed=distributed, start_iter=arguments["iteration"], ) val_data_loaders = make_data_loader( cfg, mode='val', is_distributed=distributed, ) debug_print(logger, 'end dataloader') checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD if cfg.SOLVER.PRE_VAL: logger.info("Validate before training") run_val(cfg, model, val_data_loaders, distributed, logger) logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(train_data_loader) start_iter = arguments["iteration"] start_training_time = time.time() end = time.time() print_first_grad = True for iteration, (images, targets, _) in enumerate(train_data_loader, start_iter): if any(len(target) < 1 for target in targets): logger.error(f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" ) data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration model.train() fix_eval_modules(eval_modules) images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() # add clip_grad_norm from MOTIFS, tracking gradient, used for debug verbose = (iteration % cfg.SOLVER.PRINT_GRAD_FREQ) == 0 or print_first_grad # print grad or not print_first_grad = False clip_grad_norm([(n, p) for n, p in model.named_parameters() if p.requires_grad], max_norm=cfg.SOLVER.GRAD_NORM_CLIP, logger=logger, verbose=verbose, clip=True) optimizer.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 200 == 0 or iteration == max_iter: logger.info( meters.delimiter.join( [ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ] ).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[-1]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, ) ) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if iteration == max_iter: checkpointer.save("model_final", **arguments) val_result = None # used for scheduler updating if cfg.SOLVER.TO_VAL and iteration % cfg.SOLVER.VAL_PERIOD == 0: logger.info("Start validating") val_result = run_val(cfg, model, val_data_loaders, distributed, logger) logger.info("Validation Result: %.4f" % val_result) # scheduler should be called after optimizer.step() in pytorch>=1.1.0 # https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate if cfg.SOLVER.SCHEDULE.TYPE == "WarmupReduceLROnPlateau": scheduler.step(val_result, epoch=iteration) if scheduler.stage_count >= cfg.SOLVER.SCHEDULE.MAX_DECAY_STEP: logger.info("Trigger MAX_DECAY_STEP at iteration {}.".format(iteration)) break else: 
scheduler.step() total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info( "Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter) ) ) return model
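The scheduler handling above follows the PyTorch >= 1.1 convention noted in the comments: optimizer.step() first, then scheduler.step(), with plateau-style schedulers fed the validation result instead. A minimal sketch with a stock scheduler standing in for the repo's warmup schedulers:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# Stand-in for WarmupMultiStepLR; WarmupReduceLROnPlateau would instead be stepped
# as scheduler.step(val_result) after validation, as in the code above.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60], gamma=0.1)

for iteration in range(1, 101):
    loss = model(torch.randn(8, 4)).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()   # update the weights first (PyTorch >= 1.1 convention)
    scheduler.step()   # then adjust the learning rate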
def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) # Initialize mixed-precision training use_mixed_precision = cfg.DTYPE == "float16" amp_opt_level = 'O1' if use_mixed_precision else 'O0' model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 arguments['phase'] = 1 arguments['plot_median'], arguments['plot_global_avg'] = defaultdict( list), defaultdict(list) output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk) extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) test_period = cfg.SOLVER.TEST_PERIOD if test_period > 0: data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True) else: data_loader_val = None checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD if arguments['phase'] == 1: data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], phase=1, ) do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, training_phase=1, ) arguments["iteration"] = 0 arguments["phase"] = 2 data_loader_phase2 = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], phase=2, ) do_train( cfg, model, data_loader_phase2, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, training_phase=2, ) return model
def pred_with_weight(args): num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.deprecated.init_process_group(backend="nccl", init_method="env://") cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) for weight in args.weights: # skipping evaluations already performed out_json = "{}/detections/{}.json".format( cfg.OUTPUT_DIR, weight.split('/')[-1].split('_')[-1].split('.')[0]) if os.path.exists(out_json): print('skipping', out_json) continue checkpointer = DetectronCheckpointer(cfg, model) _ = checkpointer.load(weight) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) output_folders = [None] * len(cfg.DATASETS.TEST) if cfg.OUTPUT_DIR: dataset_names = cfg.DATASETS.TEST for idx, dataset_name in enumerate(dataset_names): output_folder = cfg.OUTPUT_DIR mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, data_loader_val in zip(output_folders, data_loaders_val): inference( model, data_loader_val, iou_types=iou_types, box_only=cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, name=weight.split('_')[-1].split('.')[0]) synchronize()
def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR summary_writer = SummaryWriter(log_dir=output_dir) save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer( cfg, model, optimizer, scheduler, output_dir, save_to_disk ) if cfg.MODEL.WEIGHT.upper() == 'CONTINUE': model_weight = last_checkpoint(output_dir) else: model_weight = cfg.MODEL.WEIGHT extra_checkpoint_data = checkpointer.load(model_weight) arguments.update(extra_checkpoint_data) data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) data_loader_val = make_data_loader( cfg, is_train=False, is_distributed=distributed)[0] checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD do_train( model=model, data_loader=data_loader, data_loader_val=data_loader_val, optimizer=optimizer, scheduler=scheduler, checkpointer=checkpointer, device=device, checkpoint_period=checkpoint_period, arguments=arguments, summary_writer=summary_writer ) return model
output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) results = [] for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): result = inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, skip_eval=args.skip_eval, dllogger=dllogger,
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--data-dir", default=".", metavar="DIR", help="data dir for training", type=str, ) parser.add_argument( "--out-dir", default=".", metavar="DIR", help="output dir for model", type=str, ) parser.add_argument( "--gpu_ids", default="-1", help="gpu id", type=str, ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() if args.gpu_ids != '-1': os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.MODEL.WEIGHT = args.data_dir + cfg.MODEL.WEIGHT[1:] cfg.DATA_DIR = args.data_dir + cfg.DATA_DIR[1:] cfg.OUTPUT_DIR = args.out_dir + cfg.OUTPUT_DIR[1:] print("cfg.OUTPUT_DIR: ", cfg.OUTPUT_DIR) print("cfg.MODEL.WEIGHT: ", cfg.MODEL.WEIGHT) print("cfg.DATA_DIR: ", cfg.DATA_DIR) print("cfg.MODEL.ATTRIBUTE_ON: ", cfg.MODEL.ATTRIBUTE_ON) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder # evaluate object detection data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): result_obj = inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, eval_attributes=False, ) synchronize() # evaluate attribute detection data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): result_attr = inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, eval_attributes=True, ) 
synchronize() # evaluate RPN cfg.defrost() cfg.MODEL.RPN_ONLY = True cfg.freeze() logger.info(cfg) # pdb.set_trace() model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): result_rpn = inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, eval_attributes=False, ) synchronize() if is_main_process(): results = {**result_rpn, **result_obj, **result_attr} print(results)
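The defrost()/freeze() dance above uses the yacs CfgNode API that these scripts rely on for cfg throughout (merge_from_file, merge_from_list, freeze). A small self-contained sketch of that API with toy keys:

from yacs.config import CfgNode as CN

# Toy config mirroring the structure used above (keys are illustrative only).
cfg = CN()
cfg.MODEL = CN()
cfg.MODEL.RPN_ONLY = False
cfg.OUTPUT_DIR = "."

cfg.merge_from_list(["OUTPUT_DIR", "/tmp/inference"])  # command-line style overrides
cfg.freeze()    # make the config immutable for the rest of the run

cfg.defrost()   # temporarily unfreeze to flip a flag, as in the RPN evaluation above
cfg.MODEL.RPN_ONLY = True
cfg.freeze()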
def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) params = get_model_parameters_number(model) print('{:<30} {:<8}'.format('Number of parameters: ', params)) optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer( cfg, model, optimizer, scheduler, output_dir, save_to_disk ) extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD if cfg.MODEL.DOMAIN_ADAPTATION_ON: source_data_loader = make_data_loader( cfg, is_train=True, is_source=True, is_distributed=distributed, start_iter=arguments["iteration"], ) target_data_loader = make_data_loader( cfg, is_train=True, is_source=False, is_distributed=distributed, start_iter=arguments["iteration"], ) do_da_train( model, source_data_loader, target_data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, cfg, ) else: data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, ) return model
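# get_model_parameters_number() is called above but not defined in this file; a minimal sketch
# of such a helper (an assumption about its behavior, not the original implementation):
def get_model_parameters_number_sketch(model, trainable_only=True):
    # Sum the element counts of (optionally only trainable) parameters of a torch.nn.Module.
    params = model.parameters()
    if trainable_only:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)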
def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default="/home/guli/Desktop/VOS_ICCV2019/maskrcnn-benchmark/configs/davis/e2e_mask_rcnn_R_50_FPN_1x_davis.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.deprecated.init_process_group( backend="nccl", init_method="env://" ) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("DAVIS_MaskRCNN_baseline_test", save_dir, args.local_rank) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) checkpointer = Checkpointer(model) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox",) # if cfg.MODEL.MASK_ON: # iou_types = iou_types + ("segm",) output_folders = [None] * len(cfg.DATASETS.TEST) if cfg.OUTPUT_DIR: dataset_names = cfg.DATASETS.TEST exp_name = cfg.EXP.NAME for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name + "_" + exp_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, data_loader_val in zip(output_folders, data_loaders_val): inference_davis( model, data_loader_val, iou_types=iou_types, box_only=False, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, debug=cfg.TEST.DEBUG, generate_annotation=cfg.TEST.GENERATE_ANNOTATION, overlay_box=cfg.TEST.OVERLAY_BOX, matching=cfg.TEST.MATCHING, skip_computation_network=cfg.TEST.SKIP_NETWORK, select_top_predictions_flag=cfg.TEST.SELECT_TOP_PREDICTIONS, cfg=cfg ) synchronize()
def train(cfg, local_rank, distributed, logger): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model, logger, rl_factor=float(cfg.SOLVER.IMS_PER_BATCH)) scheduler = make_lr_scheduler(cfg, optimizer) # Initialize mixed-precision training use_mixed_precision = cfg.DTYPE == "float16" amp_opt_level = 'O1' if use_mixed_precision else 'O0' model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk) extra_checkpoint_data = checkpointer.load( cfg.MODEL.WEIGHT, update_schedule=cfg.SOLVER.UPDATE_SCHEDULE_DURING_LOAD) arguments.update(extra_checkpoint_data) train_data_loader = make_data_loader( cfg, mode='train', is_distributed=distributed, start_iter=arguments["iteration"], ) val_data_loaders = make_data_loader( cfg, mode='val', is_distributed=distributed, ) checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD if cfg.SOLVER.PRE_VAL: logger.info("Validate before training") run_val(cfg, model, val_data_loaders, distributed) logger.info("Start training") meters = MetricLogger(delimiter=" ") max_iter = len(train_data_loader) start_iter = arguments["iteration"] start_training_time = time.time() end = time.time() for iteration, (images, targets, _) in enumerate(train_data_loader, start_iter): model.train() if any(len(target) < 1 for target in targets): logger.error( f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" ) data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration scheduler.step() images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe with amp.scale_loss(losses, optimizer) as scaled_losses: scaled_losses.backward() optimizer.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 200 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if cfg.SOLVER.TO_VAL and iteration % cfg.SOLVER.VAL_PERIOD == 0: logger.info("Start validating") run_val(cfg, model, val_data_loaders, distributed) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - 
start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter))) return model
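# The loop above logs losses_reduced from reduce_loss_dict(), which averages each loss across
# GPUs so rank 0 reports a consistent number. A sketch of that utility, assuming standard
# torch.distributed collectives (not necessarily the exact maskrcnn_benchmark code):
import torch
import torch.distributed as dist

def reduce_loss_dict_sketch(loss_dict):
    # Average scalar loss tensors over all processes; only rank 0 needs the result for logging.
    world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())
        stacked = torch.stack([loss_dict[k] for k in names], dim=0)
        dist.reduce(stacked, dst=0)
        if dist.get_rank() == 0:
            stacked /= world_size
        return {k: v for k, v in zip(names, stacked)}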
def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) # if use_amp: # # Initialize mixed-precision training # use_mixed_precision = cfg.DTYPE == "float16" # amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) # # wrap the optimizer for mixed precision # if cfg.SOLVER.ACCUMULATE_GRAD: # # also specify number of steps to accumulate over # optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS) # else: # optimizer = amp_handle.wrap_optimizer(optimizer) model, optimizer = amp.initialize(model, optimizer, opt_level='O1') if distributed: if use_apex_ddp: model = DDP(model, delay_allreduce=True) else: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer( cfg, model, optimizer, scheduler, output_dir, save_to_disk ) extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) data_loader, iters_per_epoch = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD # set the callback function to evaluate and potentially # early exit each epoch if True: # always install the per-epoch evaluation/early-exit callback per_iter_callback_fn = functools.partial( mlperf_test_early_exit, iters_per_epoch=iters_per_epoch, tester=functools.partial(test, cfg=cfg), model=model, distributed=distributed, min_bbox_map=cfg.MIN_BBOX_MAP, min_segm_map=cfg.MIN_MASK_MAP) else: per_iter_callback_fn = None do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, use_amp, cfg, per_iter_end_callback_fn=per_iter_callback_fn, ) return model
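# The early-exit hook above is assembled with functools.partial, which pre-binds keyword
# arguments so the training loop can invoke the callback with only per-iteration state.
# A standalone illustration of that pattern (illustrative names only):
import functools

def report(prefix, value):
    print(f"{prefix}: {value}")

log_loss = functools.partial(report, "loss")  # "loss" is bound once, up front
log_loss(0.42)                                # prints "loss: 0.42"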
def test_maskscoring_rcnn(config_file): import sys sys.path.append('./detection_model/maskscoring_rcnn') import argparse import os os.environ['CUDA_VISIBLE_DEVICES'] = '2' import torch #from maskrcnn_benchmark.config import cfg from maskrcnn_benchmark.data import make_data_loader from maskrcnn_benchmark.engine.inference import inference from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer from maskrcnn_benchmark.utils.collect_env import collect_env_info from maskrcnn_benchmark.utils.comm import synchronize, get_rank from maskrcnn_benchmark.utils.logger import setup_logger from maskrcnn_benchmark.utils.miscellaneous import mkdir from yacs.config import CfgNode as CN def read_config_file(config_file): """ read config information form yaml file """ f = open(config_file) opt = CN.load_cfg(f) return opt opt = read_config_file(config_file) num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(opt.local_rank) torch.distributed.deprecated.init_process_group( backend="nccl", init_method="env://" ) save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(opt) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(opt) model.to(opt.MODEL.DEVICE) output_dir = opt.OUTPUT_DIR checkpointer = DetectronCheckpointer(opt, model, save_dir=output_dir) _ = checkpointer.load(opt.MODEL.WEIGHT) iou_types = ("bbox",) if opt.MODEL.MASK_ON: iou_types = iou_types + ("segm",) output_folders = [None] * len(opt.DATASETS.TEST) if opt.OUTPUT_DIR: dataset_names = opt.DATASETS.TEST for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(opt.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(opt, is_train=False, is_distributed=distributed) for output_folder, data_loader_val in zip(output_folders, data_loaders_val): inference( model, data_loader_val, iou_types=iou_types, box_only=opt.MODEL.RPN_ONLY, device=opt.MODEL.DEVICE, expected_results=opt.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=opt.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, maskiou_on=opt.MODEL.MASKIOU_ON ) synchronize()
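# read_config_file() above opens the YAML file without closing it; the same yacs API works with
# a context manager. A variant under that small assumption (the path shown is hypothetical):
from yacs.config import CfgNode as CN

def read_config_file_sketch(config_file):
    # CfgNode.load_cfg accepts an open YAML file and returns a nested config object.
    with open(config_file) as f:
        return CN.load_cfg(f)

# opt = read_config_file_sketch("configs/e2e_ms_rcnn_R_50_FPN_1x.yaml")  # hypothetical path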
def train(cfg, cfg_origial, local_rank, distributed): ## The one with modified number of classes model = build_detection_model(cfg) # cfg_origial = cfg.clone() # cfg_origial.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81 # original_model = build_detection_model(cfg_origial) ## Original model with 81 classes # ## Let's load weights for old class! # save_dir = cfg.OUTPUT_DIR # checkpointer = DetectronCheckpointer(cfg_origial, original_model, save_dir=save_dir) # checkpointer.load(cfg_origial.MODEL.WEIGHT) # # pretrained_model_pth = "/network/home/bhattdha/.torch/models/_detectron_35861795_12_2017_baselines_e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT_output_train_coco_2014_train%3Acoco_2014_valminusminival_generalized_rcnn_model_final.pkl" # # These keys are to be removed which forms final layers of the network # removal_keys = ['roi_heads.box.predictor.cls_score.weight', 'roi_heads.box.predictor.cls_score.bias', 'roi_heads.box.predictor.bbox_pred.weight', 'roi_heads.box.predictor.bbox_pred.bias', 'roi_heads.mask.predictor.mask_fcn_logits.weight', 'roi_heads.mask.predictor.mask_fcn_logits.bias'] # model = _transfer_pretrained_weights(new_model, original_model, removal_keys) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) # # Initialize mixed-precision training # use_mixed_precision = cfg.DTYPE == "float16" # amp_opt_level = 'O1' if use_mixed_precision else 'O0' # model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk) # cfg.MODEL.WEIGHT = '/network/home/bhattdha/exp.pth' ## Model stored through surgery extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD do_train( model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, ) return model
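# _transfer_pretrained_weights() appears only in the commented-out block above; a minimal sketch
# of that kind of weight surgery (drop the class-dependent head tensors, load the rest
# non-strictly) could look like this -- an assumption, not the author's helper:
def transfer_pretrained_weights_sketch(new_model, pretrained_model, removal_keys):
    # Copy all weights except the listed head layers so a model with a different number of
    # classes can reuse the backbone/FPN/RoI-head trunk parameters.
    state_dict = pretrained_model.state_dict()
    for key in removal_keys:
        state_dict.pop(key, None)
    new_model.load_state_dict(state_dict, strict=False)
    return new_model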
def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--seq_test", action='store_true') parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group( backend="nccl", init_method="env://" ) synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = cfg.OUTPUT_DIR logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank(), filename='test_log.txt') logger.info("Using {} GPUs".format(num_gpus)) # logger.info(cfg) logger.info("Collecting env info (might take some time)") # logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) ori_output_dir = cfg.OUTPUT_DIR if args.seq_test: load_dir = cfg.TEST.LOAD_DIR model_files = glob.glob(load_dir+'/*.pth') model_files.sort() min_iter = cfg.TEST.MIN_ITER max_iter = cfg.TEST.MAX_ITER # print(model_files) model_files = [model_file for model_file in model_files if 'final' not in model_file and int(model_file[-11:-4])>=min_iter and int(model_file[-11:-4])<=max_iter] else: model_files = [cfg.MODEL.WEIGHT] for model_file in model_files: cfg.defrost() cfg.MODEL.WEIGHT = model_file logger.info('testing from {} '.format(model_file)) cfg.OUTPUT_DIR = os.path.join(ori_output_dir, model_file[-11:-4]) cfg.freeze() output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False, #False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, stop_iter=cfg.FEW_SHOT.STOP_ITER ) synchronize()
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--ckpt", help= "The path to the checkpoint for test, default is the latest checkpoint.", default=None, ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) parser.add_argument( "--build-model", default="", metavar="FILE", help="path to NAS model build file", type=str, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) assert len(args.build_model) != 0, 'args.build_model should be provided' model_config = json.load(open(args.build_model, 'r')) if isinstance(model_config, list): assert len(model_config) == 1 model_config = model_config[0] print('Testing single model:', model_config) model = build_detection_model(cfg, model_config) model.to(cfg.MODEL.DEVICE) # Initialize mixed-precision if necessary use_mixed_precision = cfg.DTYPE == 'float16' amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt _ = checkpointer.load(ckpt, use_latest=args.ckpt is None) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) # output_folders = [None] * len(cfg.DATASETS.TEST) # dataset_names = cfg.DATASETS.TEST dataset_names = cfg.DATASETS.NAS_VAL if not cfg.NAS.TRAIN_SINGLE_MODEL else cfg.DATASETS.TEST output_folders = [None] * len(dataset_names) if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, test_only=cfg.TEST_ONLY) if cfg.NAS.TRAIN_SINGLE_MODEL: if get_rank() == 0: print('==' * 20, 'Evaluating single model...', '==' * 20) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, bbox_aug=cfg.TEST.BBOX_AUG.ENABLED, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, c2d_json_path=cfg.MODEL.SEG_BRANCH.JSON_PATH, cfg=cfg, test_only=cfg.TEST_ONLY) synchronize() elif not cfg.NAS.SKIP_NAS_TEST: if get_rank() == 0: print('==' * 10, 'Start NAS testing', '==' * 10) timer = Timer() timer.tic() searcher = PathPrioritySearch(cfg, './nas_test') 
searcher.generate_fair_test( ) # load cache results and generate new model for test searcher.search(model, output_folders, dataset_names, distributed) searcher.save_topk() total_time = timer.toc() total_time_str = get_time_str(total_time) if get_rank() == 0: print('Finish NAS testing, total time:{}'.format(total_time_str)) return else: print('Skipping NAS testing...')
def main(): args = parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) output_dir = os.path.dirname(cfg.MODEL.WEIGHT) cfg.OUTPUT_DIR = output_dir if not os.path.exists(output_dir): os.makedirs(output_dir) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) checkpointer = DetectronCheckpointer(cfg, model, save_dir=cfg.MODEL.WEIGHT) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) ## add for training parser.add_argument( "--data-dir", default="", metavar="DIR", help="path to data folder", type=str, ) parser.add_argument( "--pretrained-model", default="", help="path to pretrained model", metavar="FILE", type=str, ) parser.add_argument( "--nonlocal-cls-num-group", default="1", help="nonlocal num group cls", metavar="1", type=int, ) parser.add_argument( "--nonlocal-cls-num-stack", default="0", help="nonlocal num stack cls", metavar="1", type=int, ) parser.add_argument( "--nonlocal-reg-num-group", default="1", help="nonlocal num group reg", metavar="1", type=int, ) parser.add_argument( "--nonlocal-reg-num-stack", default="0", help="nonlocal num stack reg", metavar="1", type=int, ) parser.add_argument( "--nonlocal-shared-num-group", default="1", help="nonlocal num group reg", metavar="1", type=int, ) parser.add_argument( "--nonlocal-shared-num-stack", default="0", help="nonlocal num stack reg", metavar="1", type=int, ) parser.add_argument( "--nonlocal-out-channels", default="2048", help="nonlocal out channels for fpn, fpn=2048(like c4)", metavar="2048", type=int, ) parser.add_argument( "--nonlocal-inter-channels", default="256", help="nonlocal inter channels, c4 < 2048, fpn < 256", metavar="256", type=int, ) parser.add_argument( "--nonlocal-use-shared", default="True", help="nonlocal use shared non-locael", metavar="True", type=str, ) parser.add_argument( "--nonlocal-use-bn", default="True", help="nonlocal use bn after attention", metavar="True", type=str, ) parser.add_argument( "--nonlocal-use-softmax", default="False", help="nonlocal use softmax other than div", metavar="False", type=str, ) parser.add_argument( "--nonlocal-use-attention", default="True", help="nonlocal use attention before ffconv", metavar="True", type=str, ) parser.add_argument( "--nonlocal-use-ffconv", default="True", help="nonlocal use ffconv after nonlocal with residual", metavar="True", type=str, ) parser.add_argument( "--nonlocal-use-relu", default="True", help="nonlocal use relu after bn", metavar="True", type=str, ) parser.add_argument( "--conv-bbox-expand", default="1.0", help="box expand conv", metavar="1.0", type=float, ) parser.add_argument( "--fc-bbox-expand", default="1.0", help="box expand fc", metavar="1.0", type=float, ) parser.add_argument( "--backbone-out-channels", default="256", help="fpn out channels for fpn, fpn=2048(like c4)", metavar="256", type=int, ) parser.add_argument( "--evaluation-flags", nargs='*', # default=[0, 3], default=[], help="model code for evaluation flags", metavar="1 1 1 1", type=int, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.DATA_DIR = args.data_dir cfg.MODEL.WEIGHT = args.pretrained_model cfg.MODEL.ROI_BOX_HEAD.NEIGHBOR_CONV_EXPAND = args.conv_bbox_expand 
cfg.MODEL.ROI_BOX_HEAD.NEIGHBOR_FC_EXPAND = args.fc_bbox_expand cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_CLS_NUM_GROUP = args.nonlocal_cls_num_group cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_CLS_NUM_STACK = args.nonlocal_cls_num_stack cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_REG_NUM_GROUP = args.nonlocal_reg_num_group cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_REG_NUM_STACK = args.nonlocal_reg_num_stack cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_SHARED_NUM_GROUP = args.nonlocal_shared_num_group cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_SHARED_NUM_STACK = args.nonlocal_shared_num_stack cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_INTER_CHANNELS = args.nonlocal_inter_channels cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_OUT_CHANNELS = args.nonlocal_out_channels cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_USE_SHARED = ast.literal_eval( args.nonlocal_use_shared) cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_USE_BN = ast.literal_eval( args.nonlocal_use_bn) cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_USE_SOFTMAX = ast.literal_eval( args.nonlocal_use_softmax) cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_USE_FFCONV = ast.literal_eval( args.nonlocal_use_ffconv) cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_USE_RELU = ast.literal_eval( args.nonlocal_use_relu) cfg.MODEL.ROI_BOX_HEAD.NONLOCAL_USE_ATTENTION = ast.literal_eval( args.nonlocal_use_attention) cfg.MODEL.BACKBONE.OUT_CHANNELS = args.backbone_out_channels # double heads cfg.TEST.EVALUATION_FLAGS = args.evaluation_flags cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
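# The boolean-like flags above are passed as strings and parsed with ast.literal_eval rather
# than type=bool, because bool("False") is truthy in Python. A short demonstration:
import ast

print(bool("False"))              # True  -- why argparse's type=bool is unreliable
print(ast.literal_eval("False"))  # False
print(ast.literal_eval("True"))   # True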
def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer(cfg, model, None, None, output_dir, save_to_disk) extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) # arguments.update(extra_checkpoint_data) checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD logger = logging.getLogger("maskrcnn_benchmark.trainer") if cfg.MODEL.META_ARCHITECTURE == 'AdaptionRCNN': logger.info('AdaptionRCNN trainer is adapted!') cross_do_train( cfg, model, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, distributed, ) elif cfg.MODEL.META_ARCHITECTURE == 'GeneralizedRCNN': logger.info('GeneralizedRCNN trainer is adapted!') data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) do_train( cfg, model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, distributed, ) return model
try: cfg.merge_from_file(config_file) except KeyError as e: print(e) cfg.INPUT.PIXEL_MEAN = [0, 0, 0] cfg.INPUT.HORIZONTAL_FLIP_PROB_TRAIN = 0.5 cfg.INPUT.VERTICAL_FLIP_PROB_TRAIN = 0.5 cfg.INPUT.ROTATE_PROB_TRAIN = 1.0 cfg.INPUT.ROTATE_DEGREES_TRAIN = (-45, 45) cfg.DATALOADER.NUM_WORKERS = 1 cfg.DATALOADER.SIZE_DIVISIBILITY = 0 cfg.freeze() data_loader = make_data_loader( cfg, is_train=True, is_distributed=False, start_iter=0, ) device = 'cpu' is_rotated = 1 #cfg.MODEL.ROTATED if is_rotated: from maskrcnn_benchmark.modeling.rrpn.utils import get_boxlist_rotated_rect_tensor from maskrcnn_benchmark.modeling.rrpn.anchor_generator import draw_anchors start_iter = 0 for iteration, (images, targets, _) in enumerate(data_loader, start_iter): img_tensors = images.tensors for id in range(len(targets)):
def run_test(cfg, model, distributed, iter, valid=False): if distributed: model = model.module torch.cuda.empty_cache() # TODO check if it helps iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) if valid: output_folders = [None] * len(cfg.DATASETS.VALID) dataset_names = cfg.DATASETS.VALID if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join( cfg.OUTPUT_DIR, "validation", dataset_name, '{}_{}'.format(iter, cfg.MODEL.ROI_HEADS.SCORE_THRESH)) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, dataset='valid', is_distributed=distributed) print(distributed) results = [] for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): # TODO if multiple valid set, result will be a list result = inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) results.append(result) return results else: output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join( cfg.OUTPUT_DIR, "inference", dataset_name, '{}_{}'.format(iter, cfg.MODEL.ROI_HEADS.SCORE_THRESH)) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_test = make_data_loader(cfg, dataset='test', is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_test): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def train(cfg, local_rank, distributed): model = build_detection_model(cfg) # where the dream begins device = torch.device(cfg.MODEL.DEVICE) # !!!!! model.to(device) for name, value in model.backbone.body.network.named_children(): # freeze the backbone parameters if int(name) > 60: for param in value.parameters(): param.requires_grad = False optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) # Initialize mixed-precision training use_mixed_precision = cfg.DTYPE == "float16" # can be switched to float16 here for a speed-up amp_opt_level = 'O1' if use_mixed_precision else 'O0' model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level) if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) arguments = {} arguments["iteration"] = 0 output_dir = cfg.OUTPUT_DIR save_to_disk = get_rank() == 0 checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk) extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) arguments.update(extra_checkpoint_data) data_loader = make_data_loader( cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"], ) test_period = cfg.SOLVER.TEST_PERIOD if test_period > 0: data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True) else: data_loader_val = None checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD do_train( cfg, model, data_loader, data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, test_period, arguments, ) return model
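# Freezing layers as above only helps if the optimizer skips parameters with
# requires_grad=False (maskrcnn_benchmark's make_optimizer does). A simplified sketch of that
# idea, assuming the usual SGD solver options (not the framework's exact builder):
import torch

def make_optimizer_sketch(cfg, model):
    # Hand only the still-trainable parameters to SGD.
    params = [p for p in model.parameters() if p.requires_grad]
    return torch.optim.SGD(
        params,
        lr=cfg.SOLVER.BASE_LR,
        momentum=cfg.SOLVER.MOMENTUM,
        weight_decay=cfg.SOLVER.WEIGHT_DECAY,
    )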
def do_train(model, data_loader, optimizer, scheduler, checkpointer, device, checkpoint_period, arguments, meters): logger = logging.getLogger("maskrcnn_benchmark.trainer") logger.info("Start training") max_iter = len(data_loader) start_iter = arguments["iteration"] model.train() data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=False) output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", cfg.DATASETS.TEST[0]) mkdir(output_folder) start_training_time = time.time() end = time.time() for iteration, (images, targets, _) in enumerate(data_loader, start_iter): model.train() data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration scheduler.step() images = images.to(device) targets = [target.to(device) for target in targets] loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() losses.backward() optimizer.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time, data=data_time) eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if iteration % 20 == 0 or iteration == max_iter: logger.info( meters.delimiter.join([ "eta: {eta}", "iter: {iter}", "{meters}", "lr: {lr:.6f}", "max mem: {memory:.0f}", ]).format( eta=eta_string, iter=iteration, meters=str(meters), lr=optimizer.param_groups[0]["lr"], memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, )) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) # EVALUATION # inference( # model, # cfg, # data_loaders_val[0], # dataset_name=cfg.DATASETS.TEST, # device=cfg.MODEL.DEVICE, # meters=meters, # expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, # output_folder=output_folder, # ) if iteration == max_iter: checkpointer.save("model_final", **arguments) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / (max_iter)))
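# The ETA computed above uses meters.time.global_avg; a minimal sketch of the smoothed meter
# behind a MetricLogger-style interface (an approximation, not maskrcnn_benchmark's exact class):
from collections import deque

class SmoothedValueSketch:
    # Keeps a sliding window of recent values plus a global average over the whole run.
    def __init__(self, window_size=20):
        self.values = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0

    def update(self, value):
        self.values.append(value)
        self.total += value
        self.count += 1

    @property
    def global_avg(self):
        return self.total / max(self.count, 1)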
def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.deprecated.init_process_group( backend="nccl", init_method="env://" ) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) init() tag = 17 set_epoch_tag(tag) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) is_fp16 = (cfg.DTYPE == "float16") if is_fp16: # convert model to FP16 model.half() output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) # evaluator = get_evaluator() start_test_time = time.time() results = [] for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): result = inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) results.append(result) end_test_time = time.time() total_testing_time = end_test_time - start_test_time if is_main_process(): map_results, raw_results = results[0] bbox_map = map_results.results["bbox"]['AP'] segm_map = map_results.results["segm"]['AP'] print("BBOX_mAP: ", bbox_map, " MASK_mAP: ", segm_map) print("Inference time: ", total_testing_time)